lq-text Full Text Retrieval Database Part 06/13

Liam R. E. Quin lee at sq.sq.com
Mon Mar 4 12:05:00 AEST 1991


: cut here --- cut here --
: To unbundle, sh this file
#! /bin/sh
: part 06
echo x - lq-text/src/liblqtext/progname.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/progname.c <<'@@@End of lq-text/src/liblqtext/progname.c'
X/* progname.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X * This file simply declares progname.
X * This variable MUST be set by main().
X */
X
Xchar *progname = (char *) 0;
X
X/* $Id: progname.c,v 1.2 90/10/06 00:12:19 lee Rel1-10 $
X *
X * $Log:	progname.c,v $
X * Revision 1.2  90/10/06  00:12:19  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/03/24  17:07:22  lee
X * Initial revision
X * 
X *
X */
@@@End of lq-text/src/liblqtext/progname.c
echo x - lq-text/src/liblqtext/smalldb.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/smalldb.c <<'@@@End of lq-text/src/liblqtext/smalldb.c'
X/* smalldb.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* Simple interface to start and end dbm.
X * You may also need to supply dbm_store() and dbm_fetch(), but these
X * should certainly be macros.
X *
X * $Id: smalldb.c,v 1.5 91/03/03 00:15:22 lee Rel1-10 $
X *
X * $Log:	smalldb.c,v $
X * Revision 1.5  91/03/03  00:15:22  lee
X * Improved an error message and fixed a permissions bug.
X * 
X * Revision 1.4  91/03/02  18:52:48  lee
X * Default access is now read only -- lqWriteAccess must be called otherwise.
X * 
X * Revision 1.3  90/10/06  00:12:20  lee
X * Prepared for first beta release.
X * 
X * Revision 1.2  90/09/20  17:53:26  lee
X * slight error reporting improvement.
X * 
X * Revision 1.1  90/08/09  19:16:56  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:47:14  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:15:55  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:18:39  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:11  lee
X * Initial revision
X * 
X *
X */
X
X#include "globals.h"
X
X#include <stdio.h>
X
X#include <fcntl.h>
X#ifdef BSD
X# include <sys/param.h>
X# define PATH_MAX MAXPATHLEN /* untested, sorry */
X#else /*!BSD*/
X# include <limits.h> /* for PATH_MAX */
X#endif
X#include "smalldb.h"
X#include "emalloc.h"
X
Xextern int strcmp();
Xextern char *strcpy();
X
X/* The physical database for the list of words, and for the list
X * of files, uses ndbm.
X * The advantage of this is that it takes only two file system accesses
X * to retrieve any data item (honest!).
X * It's also reasonably fast at insertion.
X * One disadvantage is that it doesn't cope if too many words have the
X * same (32-bit) hash function, although publicly available replacements
X * such as the GNU project's gdbm fix this.
X *
X * Since starting the database is expensive (two opens and a malloc),
X * I have a cache of DBM pointers and keep them open.  Versions of the
X * dbm routines that don't support more than one database will have to
X * have a cache-size of one!
X * I am not sure what the impact of this would be on performance; for
X * adding a new file it shouldn't be too bad, as the file list is examined
X * only once for each file, during reading, and the word database is looked
X * at (at least once for each distinct word) only on writing.
X * For retrieval, however, the word database will be looked at for each
X * word in the query, and the file database for (potentially) each match
X * of each word, so the requests will be more interspersed.
X * Under no circumstances is it acceptable to dispense with the cache, as
X * otherwise you will be doing (literally) thousands of calls to
X * open() and close() per second!
X *
X */
X
X#undef startdb
X
X#ifndef CACHE
X/* It's unusual to deal with lots of databases at once, so let's not
X * waste RAM...
X */
X# define CACHE 3
X#endif
X
Xstatic char NameCache[CACHE][PATH_MAX + 1]; /* + 1 for \0, I think */
Xstatic DBM *Cache[CACHE]; /* (set to zero by definition) */
X
Xstatic int MaxInCache = (-1);
X
X/* FileFlags and FileModes are passed to dbm_open() by startdb() */
Xstatic int FileFlags = O_RDONLY;
Xstatic int FileModes = 0;
X
Xvoid
XlqWriteAccess()
X{
X    FileFlags = O_RDWR|O_CREAT;
X    FileModes = 0664; /* owner and group write, others read only */
X}
X
XDBM *
Xstartdb(FilePrefix)
X    char *FilePrefix;
X{
X    extern int errno;
X    register int i;
X
X    for (i = 0; i <= MaxInCache; i++) {
X	if (Cache[i] && STREQ(NameCache[i], FilePrefix)) {
X	    return Cache[i];
X	}
X    }
X
X    /* Find an empty slot */
X    for (i = 0; i <= MaxInCache; i++) {
X	if (Cache[i] == (DBM *) 0) break;
X    }
X
X    if (i > MaxInCache) {
X	if (i >= CACHE) i = 0;
X    }
X
X    if (Cache[i]) dbm_close(Cache[i]);
X    NameCache[i][0] = '\0';
X
X    errno = 0;
X
X    if ((Cache[i] = dbm_open(FilePrefix, FileFlags, FileModes)) == (DBM *)0) {
X	int e = errno;
X	(void) fprintf(stderr, "%s: dbm_open error %d: ", progname, errno);
X	errno = e;
X	perror(FilePrefix);
X	exit(1);
X    }
X    (void) strcpy(NameCache[i], FilePrefix);
X    if (i > MaxInCache) MaxInCache = i;
X
X    return Cache[i];
X}
X
X#undef enddb
X
X/* enddb -- counterpart of startdb(); deliberately does nothing.
X * The handle stays open in the cache so that the next startdb() for the
X * same name is cheap; cleanupdb() is what actually closes the databases.
X */
X/*ARGSUSED*/
Xvoid
Xenddb(db)
X    DBM *db;
X{
X    /* no-op */
X}
X
Xvoid
Xcleanupdb()
X{
X    register int i;
X
X    for (i = 0; i <= MaxInCache; i++) {
X	if (Cache[i]) dbm_close(Cache[i]);
X	Cache[i] = (DBM *) 0;
X	NameCache[i][0] = '\0';
X    }
X}
@@@End of lq-text/src/liblqtext/smalldb.c
echo x - lq-text/src/liblqtext/system.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/system.c <<'@@@End of lq-text/src/liblqtext/system.c'
X/* system.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X *
X * This is not a very portable way of doing things... and certainly not
X * a very fast one.  MUST be re-written.
X * Only for use from within curses.
X *
X * Lee
X *
X * $Id: system.c,v 1.3 90/10/06 00:21:37 lee Rel1-10 $
X */
X
X#ifdef ultrix
X# include <cursesX.h>
X#else
X# include <curses.h>
X#endif
X
X#ifndef echo
Xextern int echo();
X#endif
X#ifndef wmove
Xextern int wmove();
X#endif
X#ifndef nl
Xextern int nl();
X#endif
X#ifndef noecho
Xextern int noecho();
X#endif
X#ifndef nonl
Xextern int nonl();
X#endif
X#ifndef wrefresh
Xextern int wrefresh();
X#endif
X#ifndef waddstr
Xextern int waddstr();
X#endif
X#ifndef wclear
Xextern int wclear();
X#endif
X
Xint
XMySystem(string)
X    char *string;
X{
X    int val;
X
X    clearok(stdscr, TRUE);
X    clear();
X    refresh();
X    noraw();
X    echo();
X    nl();
X    val = system("stty opost icanon onlcr icrnl echo");
X    (void) system(string);
X    fprintf(stderr, "\n[press  return  to continue] ");
X    raw();
X    noecho();
X    nonl();
X    (void) getch();
X    clearok(stdscr, TRUE);
X    mvwaddstr(stdscr, 10, 10, "                        "); /* ???!?? */
X
X    return val;
X}
X
@@@End of lq-text/src/liblqtext/system.c
echo x - lq-text/src/lqtext/FindCommon.sh 1>&2
sed 's/^X//' >lq-text/src/lqtext/FindCommon.sh <<'@@@End of lq-text/src/lqtext/FindCommon.sh'
X:
X# FindCommon -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file COPYRIGHT for full details.
X#
X# $Id: FindCommon.sh,v 1.2 90/10/06 00:50:31 lee Rel1-10 $
X
X# Find the most common words in the database.
X# usage: FindCommon [n], where n is how many of the most common words to
X# list (the default is 500)
X
Xlqword -a | sed -e 's/^......................\(.........\)..\(..*\)$/\1	\2/' |
Xsort -nr | sed ${1-500}q
X
Xexit $?
X
X#         1 |       0 |       2 | pcpaintbrush
X#         2 |       0 |       2 | escape
X#         3 |       0 |       1 | durham
X#         4 |   60928 |      12 | making
X#         5 |       0 |       1 | ethical
X#         6 |       0 |       1 | committing
X
X# $Log:	FindCommon.sh,v $
X# Revision 1.2  90/10/06  00:50:31  lee
X# Prepared for first beta release.
X# 
X#
@@@End of lq-text/src/lqtext/FindCommon.sh
echo x - lq-text/src/lqtext/Makefile 1>&2
sed 's/^X//' >lq-text/src/lqtext/Makefile <<'@@@End of lq-text/src/lqtext/Makefile'
X# Makefile for LQ-Text, a full text retrieval package by Liam R. Quin
X# This Makefile belongs in the "src/lqtext" directory.
X#
X# Note that most of the actual configuration is done in ../Makefile and
X# in ../h/globals.h, and not here.
X
X# Makefile -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file ../COPYRIGHT for full details.
X#
X# $Id: Makefile,v 1.5 91/03/03 00:19:26 lee Rel1-10 $
X
X
XPWD=lqtext
X
XTARGETS = lqaddfile lqfile lqword lqphrase lqshow lqkwik lq
XBINFILES =lqaddfile lqfile lqword lqshow lqphrase lqkwik
X
XDESTDIR=../bin
XMODE=755
XRANLIB=echo
X
XEXTRA=-I../h
X
Xall: $(TARGETS)
X
X# for ndbm (simplest), leave empty or use -lndbm if you need it
X# for sdbm (best so far), use ../lib/libsdbm.a
X# for gdbm... well, I dunno.
XDBMLIBS=../lib/libsdbm.a
X# DBMLIBS=-lndbm
X# DBMLIBS=ndbm.o bcopy.o
X
XTEXTLIB=../lib/liblqtext.a ../lib/liblq.a
X
X# The following are for "make depend" and for sabre to load...
XDEPENDFILES = ReadAhead.c SixBit.c fileindex.c lqaddfile.c lqphrase.c \
X	      lqshow.c lqword.c sizes.c wordtable.c
X
X# MALLFILES=/usr/lib/debug/malloc.o /usr/lib/debug/mallocmap.o
XMALLFILES = 
X
Xinstall: all
X	for i in $(BINFILES); do cp "$$i" $(DESTDIR); \
X	strip "$(DESTDIR)/$$i" ; \
X	done ; \
X	mv lq $(DESTDIR)/lq; chmod $(MODE) $(DESTDIR)/lq;
X
X.SUFFIXES: .c .o .src .obj
X
X.c.src:
X	#load $(CFLAGS) $<
X
X.o.obj:
X	#load $(CFLAGS) $<
X
X# If you are going to use saber on these, you should name the programs.
Xsaber_src:
X
Xsaber_obj:
X
Xlq: lq.sh
X	cp lq.sh lq
X	chmod +x lq
X
Xlqshow: lqshow.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqshow lqshow.o $(TEXTLIB) $(TERMCAP) $(DBMLIBS)
X
Xlqaddfile: lqaddfile.o wordtable.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqaddfile lqaddfile.o wordtable.o \
X			$(TEXTLIB) $(MALLOC) $(DBMLIBS) $(MALLFILES)
X
Xlqfile: fileindex.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqfile fileindex.o $(TEXTLIB) $(MALLOC) $(DBMLIBS)
X	
Xlqword: lqword.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqword lqword.o $(TEXTLIB) $(MALLOC) $(DBMLIBS)
X
Xlqkwik: lqkwik.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqkwik lqkwik.o $(TEXTLIB) $(MALLOC) $(DBMLIBS)
X
Xlqphrase: lqphrase.o $(TEXTLIB)
X	$(CC) $(CFLAGS) -o lqphrase lqphrase.o $(TEXTLIB) $(DBMLIBS)
X
Xlint: AddFile.Lint News.Lint FileInfo.Lint Phrase.Lint
X
Xtidy:
X	/bin/rm -f *.o core
X
Xclean: tidy
X	/bin/rm -f $(TARGETS) $(TEST)
X
Xdepend:
X	mkdep $(CFLAGS) *.c
X
X#
X# $Log:	Makefile,v $
X# Revision 1.5  91/03/03  00:19:26  lee
X# added lqkwik
X# 
X# Revision 1.4  90/10/06  00:50:42  lee
X# Prepared for first beta release.
X# 
X# Revision 1.3  90/10/05  23:54:57  lee
X# deleted mkdep output.
X# 
X# Revision 1.2  90/09/28  21:54:01  lee
X# No longer uses OWNER.
X# 
X# Revision 1.1  90/08/09  19:17:39  lee
X# Initial revision
X# 
X 
X# DO NOT DELETE THIS LINE -- mkdep uses it.
X# DO NOT PUT ANYTHING AFTER THIS LINE, IT WILL GO AWAY.
X
XSixBit.o: SixBit.c ../h/globals.h 
XSixBit.o: ../h/wordrules.h
Xfileindex.o: fileindex.c ../h/globals.h 
Xfileindex.o: ../h/emalloc.h ../h/fileinfo.h
Xlqaddfile.o: lqaddfile.c 
Xlqaddfile.o: ../h/globals.h ../h/fileinfo.h ../h/emalloc.h
Xlqaddfile.o: ../h/wordinfo.h ../h/pblock.h ../h/wordrules.h ../h/filter.h
Xlqkwik.o: lqkwik.c ../h/globals.h ../h/fileinfo.h
Xlqkwik.o: ../h/wordinfo.h ../h/pblock.h ../h/wordrules.h ../h/pblock.h
Xlqkwik.o: ../h/emalloc.h
Xlqphrase.o: lqphrase.c ../h/globals.h ../h/emalloc.h
Xlqphrase.o: ../h/fileinfo.h ../h/wordinfo.h ../h/pblock.h ../h/pblock.h
Xlqphrase.o: ../h/phrase.h
Xlqshow.o: lqshow.c ../h/globals.h
Xlqword.o: lqword.c ../h/globals.h 
Xwordtable.o: wordtable.c ../h/globals.h
X
X# IF YOU PUT ANYTHING HERE IT WILL GO AWAY
@@@End of lq-text/src/lqtext/Makefile
echo x - lq-text/src/lqtext/ReadAhead.c 1>&2
sed 's/^X//' >lq-text/src/lqtext/ReadAhead.c <<'@@@End of lq-text/src/lqtext/ReadAhead.c'
@@@End of lq-text/src/lqtext/ReadAhead.c
echo x - lq-text/src/lqtext/fileindex.c 1>&2
sed 's/^X//' >lq-text/src/lqtext/fileindex.c <<'@@@End of lq-text/src/lqtext/fileindex.c'
X/* fileindex.c -- Copyright 1989, 1990 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* A simple program to give information about one or more files about
X * which information is stored in the NX-Text database.
X *
X * $Id: fileindex.c,v 1.4 91/03/02 18:56:53 lee Rel1-10 $
X */
X
X#include "globals.h" /* defines and declarations for database filenames */
X
X#include <stdio.h>
X#include <sys/types.h>
X#include <malloc.h>
X#include "emalloc.h"
X#include "fileinfo.h"
X
Xstatic char *Revision = "@(#) lqtext 2.3 89/11/34";
X
X/* The position of the \n in the 26-char string returned by ctime(3): */
X#define DATENEWLINE 24
X
Xchar *progname;
Xint AsciiTrace = 0;
X
X/** System calls and library functions used in this file: **/
X
X/** Unix System calls: **/
Xextern void exit();
X/** System Library Functions: **/
X
X/** external lqtext functions: **/
Xextern void cleanupdb(), SetDefaults();
Xint SaveFileInfo(), GetFilterType();
X#ifndef efree
X extern void efree();
X#endif
X/** Functions defined within this file: **/
Xvoid AddInfo(), AllInfo(), Display(), PrintInfo();
X
Xint AllFiles = 0;
Xint ListMode = 0;
Xint AddFiles = 0;
X
Xint
Xmain(argc, argv)
X    int argc;
X    char *argv[];
X{
X    extern int optind, getopt();
X    /** extern char *optarg; (unused at the moment) **/
X    int ch;
X    int ErrorFlag = 0;
X
X    progname = argv[0];
X
X    SetDefaults(argc, argv);
X
X    /* All programs take Zz:Vv */
X    while ((ch = getopt(argc, argv, "Zz:VvAax")) != EOF) {
X	switch (ch) {
X	case 'z':
X	case 'Z':
X	    break; /* done by SetDefaults(); */
X	case 'V':
X	    fprintf(stderr, "%s version %s\n", progname, Revision);
X	    break;
X	case 'v':
X	    AsciiTrace = 1;
X	    break;
X	case 'A':
X	    AddFiles = 1;
X	    break;
X	case 'a':
X	    AllFiles = 1;
X	    break;
X	case 'l':
X	    ListMode = 1;
X	    break;
X	case 'x':
X	    ErrorFlag = (-1);
X	    break;
X	case '?':
X	    ErrorFlag = 1;
X	    break;
X	}
X    }
X
X    /* Normally put call to lrqError here to give a helpful message,
X     * but not yet ready to ship the error handling package, sorry
X     */
X    if (ErrorFlag) {
X	fprintf(stderr, "%s: usage: %s [options] [files]\n",progname,progname);
X	fprintf(stderr, "%s: options are:\n", progname);
X	fputs("\
X	-c file -- treat the named file as a list of common words\n\
X	-d dir	-- use the lq-text database in the directory \"dir\"\n\
X	-l	-- list mode: no header output or lines drawn\n\
X	-s	-- show the list of saved files\n\
X	-t N	-- set trace level to N [default: 0]\n\
X	-V	-- print version information\n\
X	-v	-- be verbose (same as -t 1)\n\
X	-x	-- print this explanation\n\
X\n\
XIn addition, if no files are given, the following are understood:\n\
X	-A	-- add the named files to the list of known files\n\
X	-a	-- list information about all files\n", stderr);
X	exit((ErrorFlag > 0) ? 1 : 0);
X    }
X
X    if (AllFiles && AddFiles) {
X	fprintf(stderr, "%s: do not use both -a and -A options\n", progname);
X	fprintf(stderr, "\tuse %s -x for further explanation.\n", progname);
X	exit(1);
X    }
X
X    if (optind >= argc && !AllFiles && !AddFiles) {
X	fprintf(stderr,
X	"%s: You must either give the -a option or specify files to list.\n",
X		progname);
X	fprintf(stderr, "\tuse %s -x for further explanation.\n", progname);
X	exit(1);
X    }
X
X    if (!AddFiles || !ListMode) {
X	printf("%-7.7s | T | %-20.20s | %s\n",
X		"FID", "Date Last indexed", "Current Location");
X	puts(
X"========|===|======================|=========================================="
X	);
X    }
X    if (AllFiles) {
X	AllInfo();
X    } else {
X	if (AddFiles) {
X	    extern lqWriteAccess();
X
X	    lqWriteAccess();
X	}
X
X	while (optind < argc) {
X	    if (AddFiles) {
X		AddInfo(argv[optind++]);
X	    } else {
X		PrintInfo(argv[optind++]); /* ugh */
X	    }
X	}
X    }
X    cleanupdb(); /* close dbm files */
X    exit(0);
X    /*NOTREACHED*/
X    return 1; /* for lint and gcc... */
X}
X
Xvoid
XPrintInfo(Name)
X    char *Name;
X{
X    extern t_FileInfo *GetFileInfo();
X    long FID;
X    extern long atol();
X    extern t_FID Name2FID();
X
X    t_FileInfo *FileInfo;
X
X    if ((FID = Name2FID(Name)) == (t_FID) 0) {
X	fprintf(stderr, "No FID information for filename: %s\n", Name);
X        if ((FID = atol(Name)) == (t_FID) 0) {
X	    return;
X	}
X    }
X
X    /* get info from the list */
X    if ((FileInfo = GetFileInfo(FID)) == (t_FileInfo *) 0) {
X	fprintf(stderr, "No index information for: %s\n", Name);
X	return;
X    }
X    Display(FileInfo);
X}
X
Xvoid
XDisplay(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    extern char *ctime();
X    char *DateString;
X
X    DateString = ctime(&(FileInfo->Date));
X    DateString[DATENEWLINE] = '\0'; /* delete the trailing newline */
X
X    if (ListMode) {
X	printf("%lu %d %s %s\n",
X	FileInfo->FID, FileInfo->FilterType, &DateString[4], FileInfo->Name);
X    } else {
X	printf("%7lu | %d | %-20.20s | %s\n",
X	    FileInfo->FID, FileInfo->FilterType, &DateString[4], FileInfo->Name);
X    }
X}
X
X/**
XMon Sep 25 23:58:53 BST 1989
XFID     | T | Date Last indexed    | Current Location
X========|===|======================|===========================================
X      1 | 0 | Sep 25 20:31:26 1989 | /usr2/liam/Bible/NT/John/john01.kjv
X      2 | 0 | Sep 25 20:31:28 1989 | /usr2/liam/Bible/NT/John/john02.kjv
X      3 | 0 | Sep 25 20:31:30 1989 | /usr2/liam/Bible/NT/John/john03.kjv
X**/
X
Xvoid
XAllInfo()
X{
X    extern long GetMaxFID();
X    extern t_FileInfo *GetFileInfo();
X
X    t_FileInfo *FileInfo;
X    long FID;
X    long MaxFid = GetMaxFID();
X
X    for (FID = 0L; FID <= MaxFid; FID++) {
X	if ((FileInfo = GetFileInfo(FID)) != (t_FileInfo *) 0) {
X	    Display(FileInfo);
X	    efree(FileInfo); /* NOTDONE use destroyfileinfo() */
X	}
X    }
X    printf("Max File Identifier is %lu\n", MaxFid);
X}
X
Xvoid
XAddInfo(FileName)
X    char *FileName;
X{
X    extern time_t time();
X    extern unsigned long GetNextFID();
X    t_FileInfo FileInfo;
X
X    FileInfo.Name = FileName;
X    (void) time(&(FileInfo.Date));
X    FileInfo.FID = GetNextFID();
X    FileInfo.Stream = 0; /* force GetFilterType to use open()? */
X
X    /* determine filter type */
X    FileInfo.FilterType = GetFilterType(&FileInfo);
X
X    printf("%d %s (type %d) %s\n",
X	    FileInfo.FID,
X	    FileInfo.Name,
X	    FileInfo.FilterType,
X	    SaveFileInfo(&FileInfo) == 0 ?
X			    "saved successfully." :
X			    "not saved."
X    );
X}
X
X/*
X * $Log:	fileindex.c,v $
X * Revision 1.4  91/03/02  18:56:53  lee
X * Now asks for write access iff [sic] necessary
X * 
X * Revision 1.3  90/10/06  00:50:50  lee
X * Prepared for first beta release.
X * 
X * Revision 1.2  90/08/29  21:44:51  lee
X * Alpha release
X * 
X * Revision 1.1  90/08/09  19:17:11  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:45:46  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:14:18  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:16:17  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:05:55  lee
X * Initial revision
X *
X */
@@@End of lq-text/src/lqtext/fileindex.c
echo x - lq-text/src/lqtext/intersect.sh 1>&2
sed 's/^X//' >lq-text/src/lqtext/intersect.sh <<'@@@End of lq-text/src/lqtext/intersect.sh'
X:
X# intersect word-one word-two
X#
X# intersect -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file ../COPYRIGHT for full details.
X#
X# $Id: intersect.sh,v 1.3 91/03/03 00:18:59 lee Rel1-10 $
X#
X
X
XFileNumber=0
XFileList=
XProgram=lqphrase
XProgOpts=
XAll=/tmp/iAll$$
Xexport All
X
Xtrap '/bin/rm -f $All $tmp $First $FileList; exit' 0 1 2 3 15
X
Xif [ x"$1" = x"" ]
Xthen
X    echo "$0: Usage: `basename $0` {-w word} | {-p phrase} ..." 1>&2
X    exit 1
Xfi
X
X
X# Walk the arguments.  -p and -w select the program used for the words or
X# phrases that follow; every other argument is looked up with the current
X# program.  All of the output is appended to $All, and the sorted list of
X# its third fields is kept in a temporary file of its own.  The first such
X# file is remembered in $First and the rest in $FileList.
Xfor i
Xdo
X    if [ x"$i" = x"-p" ]
X    then
X	Program=lqphrase
X	ProgOpts=
X    elif [ x"$i" = x"-w" ]
X    then
X	Program=lqword
X	ProgOpts=-l
X    else
X	# include the process id so that concurrent runs don't collide
X	tmp=/tmp/inter$$.$FileNumber
X	# NB: the variable is $All; $ALL is unset and left tee with no file
X	$Program $ProgOpts "$i" | tee -a $All | awk '{ print $3 }' | sort -u > $tmp
X	if [ x"$First" = x"" ]
X	then
X	    First="$tmp"
X	else
X	    FileList="$FileList $tmp"
X	fi
X	FileNumber=`expr $FileNumber + 1`
X    fi
Xdone
X
X# Find matches...
Xtmp=/tmp/inter.tmp$$
X
Xfor i in $FileList
Xdo
X    fgrep -x -f $First $i | sort -u > $tmp
X    mv $tmp $First
Xdone
X
Xmv $First $tmp
Xsed 's/^/ /' $tmp > $First
X
Xfgrep -f $First $All
Xexit 0
X
X#
X#
X# $Log:	intersect.sh,v $
X# Revision 1.3  91/03/03  00:18:59  lee
X# brought up to date a little...
X# 
X# Revision 1.2  90/10/06  00:50:52  lee
X# Prepared for first beta release.
X# 
X# Revision 1.1  90/08/29  21:45:01  lee
X# Initial revision
X# 
X#
X#
@@@End of lq-text/src/lqtext/intersect.sh
echo x - lq-text/src/lqtext/lq.sh 1>&2
sed 's/^X//' >lq-text/src/lqtext/lq.sh <<'@@@End of lq-text/src/lqtext/lq.sh'
X#! /bin/sh
X: use /bin/sh
X# put the : line first on System V
X
X# lq -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file ../COPYRIGHT for full details.
X#
X# $Id: lq.sh,v 1.3 90/10/06 00:50:53 lee Rel1-10 $
X#
X
Xif [ x"`echo -n hello`" = x'hello' ]
Xthen
X    N=-n
X    C=
Xelse
X    N=
X    C='\c'
Xfi
X
Xquit=no
Xt=/tmp/lq$$
XListFile=/tmp/lqshow$$
Xexport ListFile
X
Xtrap '/bin/rm -f $t; exit' 0 1 2 3 15
X
X
Xwhile  [ x"$quit" != x"yes" ]
Xdo
X    cat << boy
X| Type a words or phrases to find, one per line,
X| and then press return.
Xboy
X    # Read phrases, one per line, until an empty line is typed.
X    # Each phrase is added to $Phrases wrapped in double quotes, so that
X    # the eval below passes it to lqphrase as a single argument.
X    x='fhdjfd'
X    Phrases=
X    while [ x"$x" != x"" ]
X    do
X	echo $N "| $C"
X	read x
X	if [ x"$x" != x"" ]
X	then
X	    # A double quote typed by the user would end the quoting added
X	    # here, so each one is turned into a colon -- and it is this
X	    # cleaned-up copy, not the raw input, that must be used.
X	    New=`echo "$x" | sed 's/"/:/g'`
X	    Phrases="${Phrases} \"$New\""
X	fi
X    done
X    echo $Phrases
X    # NOTE(review): eval still expands $ and backquotes typed by the user
X    eval lqphrase -p $Phrases \> $t
X    if [ ! -s $t ]
X    then
X	echo "No match"
X    else
X	# determine the order in which matches will be presented to the user:
X	sort +2 -o "$t" "$t" # (this is our ranking function)
X	# (it only makes a difference if there was more than one phrase)
X
X	# Now some arcanery, I'm afraid...  The trick is that lqshow can be
X	# given the name of a file descriptor in which to save the names of
X	# any files the user selects (with "s").
X	old_t="$t"
X	t="$t ${ListFile}"
X	lqshow -o 3 -f $t 3>> ${ListFile}
X	t="$old_t"
X	if [ -s ${ListFile} ]
X	then ## the user typed s/k/whatever to save some files...
X	    # make the list by interpreting the list file:
X	    LIST=`awk '
X	    /^#.*$/ { next }
X	    ($1 == "s") { SAVE[$2]++ }
X	    ($1 == "d") { SAVE[$2] = 0 }
X	    END {
X		for (i in SAVE) {
X		    if (SAVE[i] > 0) print i
X		}
X	    }' $ListFile | sort -u`
X	    # make a new list file...
X	    echo "$LIST" | sed '/^[ 	]*$/d' > $ListFile
X	    LIST="" # save memory
X	fi
X	# now see if it's still non-empty...
X	if [ -s ${ListFile} ]
X	then
X	    List="Type S filename to save the list of files (s also quits) "
X	else
X	    /bin/rm -f ${ListFile}
X	fi
X    fi
X    echo $List
X    echo $N "Type q to quit, or return to continue: $C"
X    read quit rest
X    case "$quit" in
X    [qQ]*) quit="yes" ;;
X    [sS]) # save the list of matches
X	cat $ListFile
X
X	if [ ! -s "$ListFile" ]
X	then
X	    echo "No files in the list to save."
X	    quit=no
X	else
X	    if [ -z "$rest" ]
X	    then rest="lq.list"
X	    fi
X
X	    if [ -f "$rest" ]
X	    then echo "Appending to existing file $rest"
X	    fi
X
X	    cat $ListFile >> $rest
X	    rm $ListFile
X	    if [ x"$quit" = x"s" ]
X	    then quit=yes
X	    else quit=no
X	    fi
X	fi
X    ;;
X    *)	   quit=no ;;
X    esac
Xdone
X
X#
X# $Log:	lq.sh,v $
X# Revision 1.3  90/10/06  00:50:53  lee
X# Prepared for first beta release.
X# 
X#
X#
@@@End of lq-text/src/lqtext/lq.sh
echo x - lq-text/src/lqtext/lqaddfile.c 1>&2
sed 's/^X//' >lq-text/src/lqtext/lqaddfile.c <<'@@@End of lq-text/src/lqtext/lqaddfile.c'
X/* lqaddfile.c -- Copyright 1989, 1990 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* addfile -- add a file to the LQ-Text text retrieval index
X * Liam Quin, August 1989 and later...
X *
X * $Id: lqaddfile.c,v 1.14 91/03/02 21:22:39 lee Rel1-10 $ 
X */
X
Xstatic char *Version = "@(#) $Id: lqaddfile.c,v 1.14 91/03/02 21:22:39 lee Rel1-10 $";
X
X#ifdef SYSV
Xextern int _filbuf(); /* used but not defined in stdio.h */
X#endif
X#include <stdio.h>
X#include <malloc.h>
X#include <ctype.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X#ifdef BSD
X# include <strings.h>
X#else
X# include <string.h>
X#endif
X
X#include "globals.h" /* defines and declarations for database filenames */
X#include "fileinfo.h"
X#include "wordinfo.h"
X#include "wordrules.h"
X#include "filter.h"
X
X#include "emalloc.h"
X
X#define enew(var, type) (var = (type *) emalloc(sizeof(type)))
X
X#ifdef SYSV
X#define TOLOWER(ch) ch = tolower(ch)
X#else
X#define TOLOWER(ch) if (isupper(ch)) ch = tolower(ch)
X#endif
X
Xvoid DestroyFileInfo(), SaveFileInfo(), AddStream(), AddFrom();
Xextern lqWriteAccess(); /* Allow write access to the database */
X/* Symbol Table Interface */
Xextern void AddWord(), WriteCurrentMaxWID();
Xextern void DumpCache(), cleanupdb();
Xextern char *WordRoot();
Xextern int TooCommon(), GetFilterType();
Xint RealGetChar(), AddFile();
X
X/** System calls and library routines used in this file: **/
X/** System calls: **/
Xextern void exit();
Xextern int stat();
X/** Library Functions: **/
Xextern int atoi();
X#ifndef tolower
X extern int tolower();
X#endif
Xextern void perror();
X/**/
X
Xchar *progname = "@(#) : addfile.c,v 1.1 89/08/28 20:16:05 lee Locked $";
Xstatic int UseLineNumbers = 0;
X
X/* FROM pblock.c */
Xextern int AsciiTrace; /* provide increasingly verbose info if not zero */
X
Xstatic int LastChar = 0;
Xstatic int _chForLee = 0;
X
X/* GetChar(F) -- yield the next input character.
X *
X * If a character has been saved in LastChar it is returned (and LastChar
X * cleared); otherwise a character is read with getc().  In both of these
X * cases BytesRead is incremented.  The one exception is a single quote
X * read while InWord is true: that is handed to RealGetChar(F) to sort
X * out, and the macro itself does not touch BytesRead.
X *
X * NOTE(review): the macro reads from FileInfo->Stream, not from its
X * argument F, so a variable called FileInfo (as well as BytesRead and
X * InWord) must be in scope wherever it is used -- presumably F is always
X * that same FileInfo; confirm against the callers.
X */
X#define GetChar(F) \
X    ( LastChar ? \
X	(++BytesRead, (_chForLee = LastChar), (LastChar = 0), _chForLee) : \
X	( (_chForLee = getc(FileInfo->Stream)) != '\'' || !InWord) ? \
X		(++BytesRead, _chForLee) : RealGetChar(F) )
X	 
Xint
Xmain(argc, argv)
X    int argc;
X    char *argv[];
X{
X    extern char *strrchr();
X    extern int getopt(), cknatstr();
X    extern void SetDefaults();
X    extern char *optarg;
X    extern int optind;
X    extern int MaxWordsInCache; /* see wordtable.c */
X
X    int c;
X    int ErrorFlag = 0;
X    int DoNothing = 0;
X    char *InputFile = (char *) 0;
X
X#ifdef MALLOCTRACE
X    malloc_debug(2);
X#endif
X
X    progname = argv[0]; /* retain the full path at first */
X
X#ifdef M_MXFAST
X    (void) mallopt(M_MXFAST, sizeof(t_WordPlace));
X    /* may need to comment mallopt() out entirely for BSD -- use ifndef.
X     * seems to work under SunOS, though.
X     * When it works, it says "Allocate 100 or so chunks of this size at a
X     * time, and whenver I ask for this much or less, give me one of the
X     * chunks".  Clearly it had better not be too large, but it is a big
X     * win with a structure allocated for every occurrence of every word!
X     */
X#endif
X
X    SetDefaults(argc, argv);
X
X    while ((c = getopt(argc, argv, "w:f:xVZz:")) != -1) {
X	switch (c) {
X	case 'w':
X	    if (!cknatstr(optarg)) {
X		fprintf(stderr,
X			    "%s: -w must be given a number >= 0, not \"%s\"\n",
X							    progname, optarg);
X		fprintf(stderr, "\tuse %s -xv for further information\n");
X		exit(1);
X	    }
X	    MaxWordsInCache = atoi(optarg);
X	    break;
X	case 'Z':
X	case 'z':
X	    break; /* work done in SetDefault() */
X	case 'V':
X	    fprintf(stderr, "%s: version: %s\n", progname, Version);
X	    DoNothing = 1;
X	    break;
X	case 'f':
X	    if (InputFile) {
X		fprintf(stderr,
X"%s: only one -f option allowed; use -xv for explanation\n", progname);
X
X		exit(1);
X	    }
X	    InputFile = optarg;
X	    break;
X	case 'x':
X	    ErrorFlag = (-1);
X	    break;
X	default:
X	case '?':
X	    ErrorFlag = 1;
X	}
X    }
X
X    if ((progname = strrchr(progname, '/')) != (char *) NULL) {
X	++progname; /* step over the last / */
X    } else {
X	progname = argv[0];
X    }
X
X    if (ErrorFlag > 0) {
X	fprintf(stderr, "use %s -x or %s -xv for an explanation.\n",
X							progname, progname);
X	exit(1);
X    } else if (ErrorFlag < 0) { /* -x was used */
X	fprintf(stderr, "%s -- add files to an lq-text retrieval database\n",
X								    progname);
X
X	fputs("Options are:\n\
X	-f file -- read the list of files to index from \"file\"\n\
X	-c file	-- cfile contains a list of common words to be ignored\n\
X	-d dir	-- use the lq-text database in the named directory\n\
X	-t N	-- set the trace level to N [default: N = 0]\n\
X	-V	-- print Version number and exit\n\
X	-v	-- be verbose (equivalent to -t 1)\n\
X	-w n	-- dump the word-cache every n words\n\
X	-x	-- print this eXplanation and exit\n\
X	--	-- all following arguments are file names\n\
X\n\
X", stderr);
X	if (AsciiTrace == 1) {
X	    /* used -v or -t1 */
X	fprintf(stderr, "\n\
X    Any remaining arguments are taken to be file names.  The current\n\
XDOCPATH (%s) is searched for the files, and they are read and added\n\
Xto the index.  (If you use the -f option, you should not give filename\n\
Xarguments on the command line, although you can use \"-f -\" to read the\n\
Xlist of files from standard input, one per line.\n\
XSetting (with -w) the size of the cache may dramatically\n\
Ximprove performance.  Systems with memory larger than the data can try -w0.\n\
XSee lqtext(1) for more information.\n", DocPath);
X	}
X    exit(0);
X
X    }
X
X    if (DoNothing) {
X	if (optind < argc) {
X	    fprintf(stderr, "%s: warning: %d extra argument%s ignored...\n",
X				progname, argc - optind,
X				argc - optind == 1 ? "" : "%s" );
X	    fprintf(stderr, "Use %s -x for an explanation\n", progname);
X	}
X	exit(0);
X    }
X
X    lqWriteAccess();
X
X    if (InputFile) {
X	if (optind < argc) {
X	    fprintf(stderr, "%s: -f: too many arguments; use -xv\n", progname);
X	    exit(1);
X	}
X	AddFrom(InputFile);
X    } else for (; optind < argc; ++optind) {
X	if (AddFile(argv[optind]) < 0 && AsciiTrace >= 1) {
X	    fprintf(stderr, "%s: warning: Problem adding file %s\n",
X			progname, argv[optind]);
X	}
X    }
X
X#ifndef MALLOCTRACE
X    DumpCache(0); /* the 0 means don't bother calling free() */
X#else
X    DumpCache(1); /* Free everthing so whatever is left is a memory leak */
X#endif
X
X    cleanupdb(); /* empty the dbm cache */
X    WriteCurrentMaxWID();
X
X#ifdef MALLOCTRACE
X    (void) fprintf(stderr, "%s: Malloctrace: checking...\n", progname);
X    malloc_verify();
X    (void) fprintf(stderr, "%s: Malloc Map\n", progname);
X    mallocmap();
X#endif
X
X    exit(0);
X    /*NOTREACHED*/
X    return 1; /* disaster if we get here -- it's just for lint! */
X}
X
Xvoid
XAddFrom(Name)
X    char *Name;
X{
X    char *GetLine();
X
X    FILE *fp;
X    char *Line;
X
X    if (Name[0] == '-' && Name[1] == '\0') {
X	fp = stdin;
X    } else {
X	fp = fopen(Name, "r");
X    }
X
X    if (fp == (FILE *) 0) {
X	extern int errno;
X	int e = errno;
X
X	fprintf(stderr, "%s: -f: can't open ", progname);
X	errno = e;
X	perror(Name);
X	exit(1);
X    }
X
X    while ((Line = GetLine(fp, Name)) != (char *) 0) {
X	if (AddFile(Line) < 0 && AsciiTrace >= 1) {
X	    /* we already got one error message from AddFile() */
X	    fprintf(stderr, "%s: warning: Problem adding file %s\n",
X			progname, Line);
X	}
X    }
X
X    if (fp != stdin) {
X	(void) fclose(fp);
X    }
X}
X
Xstatic int LineInFile = 0;
Xstatic FILE *LastFile = 0;
X
X/* GetLine -- read the next file name from fp.
X *
X * Blank lines and leading white space are skipped; the name ends at the
X * next newline.  White space inside (or at the end of) a name is kept,
X * with a one-off warning if AsciiTrace is set.  Name is only used in
X * messages.
X *
X * Returns a pointer to a static buffer that is reused (and may be moved
X * by erealloc) on the next call, so the caller must copy the string if
X * it is needed for longer.  Returns a null pointer at end of file.
X */
Xchar *
XGetLine(fp, Name)
X    FILE *fp;
X    char *Name;
X{
X    static char *Line = (char *) 0; /* the buffer, kept between calls */
X    static int Length = 0; /* bytes currently allocated for Line */
X    int ch;
X    register char *p; /* where the next character will be stored */
X
X    if (!Line) {
X	if (Length <= 10) Length = 30;
X	Line = emalloc(Length);
X    }
X
X    p = Line;
X
X    /* keep count of the calls made for the current stream */
X    if (fp == LastFile) {
X	++LineInFile;
X    } else {
X	LineInFile = 0; /* number lines from zero! */
X	LastFile = fp;
X    }
X
X    while ((ch = getc(fp)) != EOF) {
X	static int HaveWarned = 0; /* warn about embedded spaces only once */
X
X	if (isspace(ch)) {
X	    if (p == Line) { /* ignore blank lines and leading blanks */
X		continue;
X	    }
X	    if (ch == '\n') {
X		if (p == (char *) 0) {
X		    /* how could this ever happen?  do I need it? */
X		    p = Line;
X		    continue;
X		}
X		*p = '\0';
X		return Line;
X	    }
X	    /* white space after the start of a name: it is kept (below) */
X	    if (AsciiTrace && !HaveWarned) {
X		fprintf(stderr,
X"%s: -f: Warning: spaces found in filenames read from \"%s\"\n",
X							    progname, Name);
X		HaveWarned = 1;
X	    }
X	}
X
X	/* add the character to the string */
X	/* (growing the buffer first so there is always room for the \0) */
X	if (p - Line + 1 >= Length) {
X	    int SaveWhere = p - Line; /* erealloc may move the buffer */
X	    Length += 30;
X	    Line = erealloc(Line, Length);
X	    p = &Line[SaveWhere];
X	}
X	*p++ = ch;
X    }
X
X    /* end of file: hand back a final name that had no newline after it */
X    if (p && Line && p != Line) {
X	fprintf(stderr, "%s: -f: warning: no newline at the end of \"%s\"\n",
X						progname, Name);
X	*p = '\0';
X	return Line;
X    }
X
X    return (char *) 0;
X}
X
Xextern int fclose(), pclose();
X
X/* MakeFileInfo -- build a t_FileInfo for the file Name, ready to be indexed.
X *
X * Returns 0 if Name is null or empty, if it cannot be stat'ed either
X * directly or via FindFile(), if the file is empty, if it is already in
X * the index and has not changed since, if its filter type is unknown, or
X * if MakeInput() cannot open it.  Most of these cases print a message,
X * some of them only when AsciiTrace is set.
X *
X * On success the result has its FID, Date and Stream set; it can be
X * released with DestroyFileInfo().
X */
Xt_FileInfo *
XMakeFileInfo(Name)
X    char *Name;
X{
X#ifdef BSD
X    extern time_t time();
X#else
X    extern long time();
X#endif
X    extern t_FID Name2FID();
X    extern t_FileInfo *GetFileInfo();
X    extern t_FID GetNextFID();
X    FILE *MakeInput();
X    struct stat StatBuf;
X
X    t_FileInfo *FileInfo = 0;
X    t_FID FID;
X
X    if (!Name || !*Name) return (t_FileInfo *) 0; /* sanity */
X
X    if (stat(Name, &StatBuf) < 0) {
X	/* Not found as given: let FindFile() try to locate it */
X#ifndef FindFile /* it is a macro these days... */
X	extern char *FindFile();
X#endif
X	extern int errno;
X
X	int e = errno;
X	char *doc;
X
X	if ((doc = FindFile(Name)) == (char *) 0) {
X	    fprintf(stderr, "Can't index ");
X	    errno = e; /* fprintf might well clobber errno! */
X	    perror(Name);
X	    return (t_FileInfo *) 0;
X	}
X
X	if (stat(doc, &StatBuf) < 0) {
X	    e = errno;
X	    fprintf(stderr, "Can't index ");
X	    errno = e; /* fprintf might well clobber errno! */
X	    perror(Name);
X	    return (t_FileInfo *) 0;
X	}
X	/* from here on, use the name that FindFile() returned */
X	Name = doc;
X    }
X
X    if (StatBuf.st_size == 0L) {
X	if (AsciiTrace) {
X	    fprintf(stderr, "%s empty -- not indexed\n", Name);
X	}
X	return (t_FileInfo *) 0;
X    }
X    /* See if it's in the index already: */
X    if ((FID = Name2FID(Name)) != (t_FID) 0) {
X
X	if ((FileInfo = GetFileInfo(FID)) != (t_FileInfo *) 0) {
X	    /* Check to see if the file has changed since it was last
X	     * indexed.  If it has, we should delete the old one from
X	     * the database and give this one a new FID, but I have
X	     * not done that yet -- that's /usr/local/lib/lqtextd or
X	     * something, I suppose!
X	     */
X	    if (FileInfo->Date >= StatBuf.st_mtime) {
X		if (AsciiTrace) {
X		    fprintf(stderr, "%s unchanged -- not indexed\n", Name);
X		}
X		DestroyFileInfo(FileInfo);
X		return (t_FileInfo *) 0;
X	    }
X	}
X    } else {
X	/* not indexed before, so it needs a new FID */
X	FID = GetNextFID((long) StatBuf.st_size);
X    }
X
X    /* No existing entry was fetched above, so build one from scratch */
X    if (FileInfo == (t_FileInfo *) 0) {
X	/* Allocate Structure */
X	enew(FileInfo, t_FileInfo);
X
X	/* Although not always necessary, call emalloc here so that a
X	 * FileInfo can always be deleted with DestroyFileInfo()
X	 */
X	FileInfo->Name = emalloc((unsigned)(strlen(Name) + 1));
X	(void) strcpy(FileInfo->Name, Name);
X
X	/* Other bits to set: */
X
X	/* date */
X	FileInfo->Date = StatBuf.st_mtime;
X
X	/* file type */
X	if ((FileInfo->FilterType = GetFilterType(FileInfo, &StatBuf)) < 0) {
X	    if (AsciiTrace) {
X		fprintf(stderr, "%s unknown file type -- not indexed\n", Name);
X	    }
X	    (void) efree(FileInfo->Name);
X	    (void) efree((char *) FileInfo);
X	    return (t_FileInfo *) 0;
X	}
X    }
X
X    FileInfo->FID = FID;
X    /* Date is now the time of indexing; for a new entry this replaces
X     * the st_mtime stored above.
X     */
X    FileInfo->Date = (long) time((long *) 0); /* it's a time_t on BSD */
X
X    if ((FileInfo->Stream = MakeInput(FileInfo)) == (FILE *) 0) {
X	fprintf(stderr, "%s: couldn't open filter for %s -- not indexed\n",
X						    progname, FileInfo->Name);
X	(void) efree(FileInfo->Name);
X	(void) efree((char *) FileInfo);
X	return (t_FileInfo *) 0;
X    }
X
X    return FileInfo;
X}
X
Xvoid
XDestroyFileInfo(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    if (FileInfo->Stream) {
X	if (FileInfo->FilterType >= 0 && FileInfo->FilterType < MaxFilterType){
X	    (* FilterTable[FileInfo->FilterType].close)(FileInfo->Stream);
X	}
X	FileInfo->Stream = (FILE *) 0;
X    }
X    if (FileInfo->Name) (void) efree(FileInfo->Name);
X    (void) efree((char *) FileInfo);
X}
X
Xint
XAddFile(Name)
X    char *Name;
X{
X    t_FileInfo *FileInfo;
X
X    if (!Name || !*Name) return -1;
X    if ((FileInfo = MakeFileInfo(Name)) == (t_FileInfo *) 0) return -1;
X
X    AddStream(FileInfo);
X    SaveFileInfo(FileInfo);
X    DestroyFileInfo(FileInfo);
X
X    return 0;
X}
X
XFILE *
XMakeInput(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    FILE *fp;
X    char *Buffer;
X    unsigned BufLen;
X    extern FILE *fopen(), *popen();
X
X#define FSTRING FilterTable[FileInfo->FilterType].String
X
X    if (FileInfo->FilterType > MaxFilterType) {
X	fprintf(stderr, "%s: Warning: filter type %d for %s too high (max %d)\n",
X		progname, FileInfo->FilterType, FileInfo->Name, MaxFilterType);
X	return (FILE *) 0;
X    }
X
X    if (FilterTable[FileInfo->FilterType].Type != FileInfo->FilterType) {
X	fprintf(stderr, "Fatal Filter table error, %d\n", FileInfo->FilterType);
X	exit(3);
X    }
X
X    if (FSTRING == (char *) 0) {
X	return fopen(FileInfo->Name, "r");
X    }
X
X    BufLen = strlen(FileInfo->Name) * 2 + 4 + strlen(FSTRING);
X	/* The +4 is to allow for an embedded " < " plus a \0;
X	 * we append "< Name", but also expand %s to be the Name, hence
X	 * the strlen * 2
X	 */
X    Buffer = emalloc(BufLen);
X
X    (void) sprintf(Buffer, FSTRING, FileInfo->Name);
X    (void) strcat(Buffer, " < ");
X    (void) strcat(Buffer, FileInfo->Name);
X
X    fp = popen(Buffer, "r");
X    (void) efree(Buffer);
X    return fp;
X}
X
Xstatic long BytesRead = 0L;
Xstatic int InWord = 0;
X
X/* Character input */
X
X/* RealGetChar -- decide what to do with a single quote seen inside a word.
X * Reads the following character from FileInfo->Stream into LastChar and
X * adds one to BytesRead.  Returns '\'' if that character is a word
X * character other than another quote, and ' ' otherwise.
X * Presumably called from the GetChar() macro -- see the ASSERT below.
X */
X#ifdef __GNU__
Xinline
X#endif
Xint
XRealGetChar(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    int Result;
X
X    /* ASSERT: InWord && _chForLee == '\'' */
X    LastChar = getc(FileInfo->Stream);
X
X    /* A quote followed by more of the word is kept; one that turns out
X     * to be at the end of the word is deleted by returning a space.
X     */
X    Result = (WithinWord(LastChar) && LastChar != '\'') ? '\'' : ' ';
X
X    BytesRead++;
X    return Result;
X}
X
X/* ReadWord -- return the next word to be indexed from FileInfo->Stream,
X * or 0 at end of input.
X *
X * Characters are read with GetChar().  Words shorter than MinWordLength
X * and words for which TooCommon() is true are not returned; ReadWord()
X * calls itself to fetch the next word instead.  A skipped short word
X * containing a letter causes WPF_LASTHADLETTERS to be set on the next
X * word returned; a skipped common word causes both WPF_LASTWASCOMMON and
X * WPF_LASTHADLETTERS to be set.
X *
X * The result points at one of two static t_WordInfo structures (each
X * with its own static word buffer) that are used alternately, so it
X * must not be freed and is overwritten by the second call after the one
X * that returned it.  Position information is kept in static variables
X * that are reset whenever FileInfo->FID changes.
X */
Xt_WordInfo *
XReadWord(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    /* use two static storage areas so we can be called twice in a row.
X     * This is necessary to implement the WPF_LASTINBLOCK flag.
X     */
X    static t_WordInfo This, That;
X    static int ThisOrThat = 0;
X    t_WordInfo *WordInfo;
X    static char Buffer[MaxWordLength + 1];
X    int ch;
X    register char *q = Buffer;
X    static int WordInBlock;
X    static t_FID LastFid = 0L;
X    static long LastPos = 0L;
X    static int SawCommon = 0;
X    static int SawLetters = 0;
X    static int BlockInFile = 0L;
X    static unsigned long LastBlock;
X    unsigned long Start;
X
X    WordInfo = (ThisOrThat ? &This : &That);
X
X    /* A different FID means a new file: forget the old positions */
X    if (FileInfo->FID != LastFid) {
X	LastFid = FileInfo->FID;
X	WordInBlock = (-1); /* none, yet! */
X	LastPos = BlockInFile = LastBlock = 0L;
X	BytesRead = 0L;
X	SawCommon = SawLetters = 0;
X	if (AsciiTrace) {
X	    fprintf(stderr, "Reading file \"%s\"", FileInfo->Name);
X	}
X    }
X
X    /* Skip non-word characters */
X    while ((ch = GetChar(FileInfo)) != EOF) {
X	if (StartsWord(ch)) break;
X    }
X
X    /* ASSERT: we have read at least one character */
X
X    if (ch == EOF) {
X	if (AsciiTrace) {
X	    fprintf(stderr, "\n");
X	}
X	return (t_WordInfo *) 0;
X    }
X
X    /* presumably GetChar() counts each character in BytesRead, which
X     * would make this the offset of the word's first character
X     */
X    Start = BytesRead - 1;
X
X    if (UseLineNumbers) {
X	BlockInFile = LineInFile;
X    } else {
X	BlockInFile = Start / FileBlockSize;
X    }
X
X    /* Words are numbered from zero again in each new block */
X    if (BlockInFile != LastBlock) {
X	LastBlock = BlockInFile;
X	if (AsciiTrace > 1) {
X	    fprintf(stderr, ".");
X#ifdef sun
X	    /* SunOS seems to line-buffer stderr! */
X	    fflush(stderr);
X#endif
X	}
X	WordInBlock = (-1);
X    }
X
X    /* Only the case of the first letter is recorded in the flags */
X    if (isupper(ch)) {
X	WordInfo->WordPlace.Flags = WPF_UPPERCASE;
X	ch = tolower(ch);
X    } else {
X	WordInfo->WordPlace.Flags = 0;
X    }
X
X    InWord = 1; /* For GetChar() */
X
X    /* Collect the word.  Characters beyond MaxWordLength are still read
X     * but are not stored, so over-long words are truncated.
X     */
X    do {
X	if (q - Buffer < MaxWordLength) {
X	    *q++ = ch;
X	}
X	ch = GetChar(FileInfo);
X	TOLOWER(ch);
X    } while (WithinWord(ch) || EndsWord(ch));
X
X    *q = '\0';
X    InWord = 0;
X
X    /* Too short to index: note whether it had any letters in it and
X     * go on to the next word.  (Both branches below do the same thing.)
X     */
X#ifdef __GNUC__
X    /* this is to get round a gcc bug... */
X    {
X	int i = q - Buffer;
X	WordInfo->Length = i;
X
X	if (i < MinWordLength) {
X	    register char *p;
X
X	    for (p = Buffer; p < q; p++) {
X		if (isalpha(*p)) {
X		    SawLetters = 1;
X		    break;
X		}
X	    }
X	    return ReadWord(FileInfo);
X	}
X    }
X#else
X    if ((WordInfo->Length = q - Buffer) < MinWordLength) {
X	register char *p;
X
X	for (p = Buffer; p < q; p++) {
X	    if (isalpha(*p)) {
X		SawLetters = 1;
X		break;
X	    }
X	}
X	return ReadWord(FileInfo);
X    }
X#endif
X
X    WordInfo->Word = Buffer;
X
X    (void) WordRoot(WordInfo);
X
X    /* the length is measured again after WordRoot() has been applied */
X    WordInfo->Length = strlen(WordInfo->Word);
X
X    /* Common words are not indexed, but they are still counted as
X     * words within the block.
X     */
X    if (TooCommon(WordInfo)) {
X	SawCommon++;
X	WordInBlock++;
X#ifdef ASCIITRACE
X	if (AsciiTrace > 10) {
X	    fprintf(stderr, "%s too common to index\n", WordInfo->Word);
X	}
X#endif
X	return ReadWord(FileInfo);
X    } else if (SawCommon) {
X	SawCommon = 0;
X	WordInfo->WordPlace.Flags |= (WPF_LASTWASCOMMON|WPF_LASTHADLETTERS);
X    }
X    if (SawLetters) {
X	SawLetters = 0;
X	WordInfo->WordPlace.Flags |= WPF_LASTHADLETTERS;
X    }
X
X    /* StuffBefore is the # of chars between the end of the last word and
X     * the start of this one.
X     * NOTE(review): Start is unsigned long, so the subtraction below is
X     * done unsigned and the "<= 0" test can only be true when the
X     * difference is exactly zero.
X     */
X    if (Start > 1L) {
X	if (Start - (LastPos + 1) <= 0) {
X	    WordInfo->WordPlace.StuffBefore = 1; /* save a byte in the index */
X	} else if (Start - (LastPos + 1) >= 255 ) {
X	    WordInfo->WordPlace.StuffBefore = 255;
X	} else {
X	    WordInfo->WordPlace.StuffBefore = Start - (LastPos + 1);
X	}
X    } else {
X	WordInfo->WordPlace.StuffBefore = 1; /* i.e., the default */
X    }
X
X    WordInfo->WordPlace.FID = WordInfo->FID = FileInfo->FID;
X    WordInfo->WID = (t_WID) 0;
X    WordInfo->Next = (t_WordInfo *) 0;
X    WordInfo->WordPlaces = (t_WordPlace *) 0;
X    WordInfo->WordPlacesInHere = 0;
X    WordInfo->WordPlace.WordInBlock = (++WordInBlock);
X    WordInfo->WordPlace.BlockInFile = BlockInFile;
X    WordInfo->DataBlock = (char *) 0;
X
X    WordInfo->Word[WordInfo->Length] = '\0';
X
X    {
X	/* I want to avoid using malloc() here... 
X	 * Another kludge would be to malloc sizeof(t_WordInfo) +
X	 * strlen(WordInfo->Word + 1) and to put the string at the end
X	 * of (i.e. just after) the struct.
X	 */
X	static char Word2[MaxWordLength + 1];
X	static char Word1[MaxWordLength + 1];
X	char *p = (ThisOrThat) ? Word1 : Word2;
X
X	/* Copy the word out of Buffer, which the next call will reuse */
X	(void) strncpy(p, WordInfo->Word, (int) WordInfo->Length);
X	WordInfo->Word = p;
X	WordInfo->Word[WordInfo->Length] = '\0';
X    }
X
X    /* remembered so the next call can work out its StuffBefore */
X    LastPos = BytesRead - 1;
X
X    ThisOrThat = !ThisOrThat;
X    /* toggle between 0 and 1.  Boring life, really */
X
X    if (!WordInfo->Word[0]) {
X	fprintf(stderr, "Null word in ReadWord()\n");
X    }
X    return WordInfo;
X}
X
Xvoid
XAddStream(FileInfo)
X    t_FileInfo *FileInfo;
X{
X    /* I have to mark the last word in the block.
X     * I do that by marking the previous word if it was in a differant block
X     * than the current one.
X     */
X    t_WordInfo *WordInfo;
X    t_WordInfo *LastWord = 0;
X
X    BytesRead = 0;
X
X    while ((WordInfo = ReadWord(FileInfo)) != (t_WordInfo *) 0) {
X	if (LastWord) {
X	    if (LastWord->WordPlace.BlockInFile !=
X	    				WordInfo->WordPlace.BlockInFile) {
X		LastWord->WordPlace.Flags |= WPF_LASTINBLOCK;
X	    }
X	    AddWord(LastWord);
X	}
X	LastWord = WordInfo;
X    }
X    if (LastWord) {
X	/* it's the last in the file, so it is also the last in the block */
X	LastWord->WordPlace.Flags |= WPF_LASTINBLOCK;
X	AddWord(LastWord);
X    }
X
X    if (AsciiTrace) {
X	fprintf(stderr, "Read %lu bytes from \"%s\"\n", BytesRead, FileInfo->Name);
X    }
X}
X
X/* lqaddfile has been carried through several incarnations of lq-text,
X * and hence has more than one Initial Revision in the following history.
X *
X * $Log:	lqaddfile.c,v $
X * Revision 1.14  91/03/02  21:22:39  lee
X * Added write access call.
X * 
X * Revision 1.13  91/03/02  18:53:25  lee
X * Common words are now counted, so you can now edit the common word list
X * without invalidating the index.
X * 
X * Revision 1.12  90/10/06  00:50:54  lee
X * Prepared for first beta release.
X * 
X * Revision 1.11  90/10/05  23:46:11  lee
X * Allow compilation with -UASCIITRACE
X * 
X * Revision 1.10  90/10/04  17:54:46  lee
X * fixed a typo in the usage message.
X * 
X * Revision 1.9  90/09/28  23:20:22  lee
X * Put more of GetChar into a macro and parameterised TOLOWER.
X * 
X * Revision 1.8  90/09/28  22:19:04  lee
X * Did the previous fix _properly_!
X * 
X * Revision 1.7  90/09/28  22:12:35  lee
X * Made getchar a macro, and deleted the call to CallFree...
X * 
X * Revision 1.6  90/09/20  18:46:03  lee
X * Closed up a (very small) memory leak.
X * 
X * Revision 1.5  90/09/19  20:16:41  lee
X * Fixed problems associated with indexing an empty file.
X * 
X * Revision 1.4  90/08/29  21:45:18  lee
X * Alpha release
X * 
X * Revision 1.3  90/08/09  19:17:12  lee
X * *** empty log message ***
X * 
X * Revision 1.1  90/02/27  11:05:02  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:45:13  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:14:12  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.3  89/09/17  23:02:42  lee
X * Various fixes; NumberInBlock now a short...
X * 
X * Revision 1.2  89/09/16  21:16:11  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:05:52  lee
X * Initial revision
X *
X */
@@@End of lq-text/src/lqtext/lqaddfile.c
echo end of part 06
-- 
Liam R. E. Quin,  lee at sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337



More information about the Alt.sources mailing list