lq-text Full Text Retrieval Database Part 02/13
Liam R. E. Quin
lee at sq.sq.com
Mon Mar 4 12:01:07 AEST 1991
: cut here --- cut here --
: To unbundle, sh this file
#! /bin/sh
: part 02
echo x - lq-text/src/filters/Makefile 1>&2
sed 's/^X//' >lq-text/src/filters/Makefile <<'@@@End of lq-text/src/filters/Makefile'
X# filters/Makefile -- Copyright 1990 Liam R. Quin. All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file ../COPYRIGHT for full details.
X
X# This Makefile belongs in the "src/filters" directory.
X#
X# Note that most of the actual configuration is done in ../Makefile and
X# in ../h/global.h, and not here.
X
X# $Id: Makefile,v 1.4 90/10/06 00:57:26 lee Rel $
X
X
X# This is what gets made:
XTARGETS = MailFilter NewsFilter
X# LIBFILES are the files copied into $(DESTDIR) by the "install" target.
XLIBFILES=$(TARGETS)
X# NOTE(review): EXTRA is assigned a second time under "for compiling" below.
X# In make a later assignment normally replaces an earlier one, and this line
X# also refers to $(EXTRA) on its own right-hand side, so the -D options given
X# here may never take effect -- confirm which value is intended.
XEXTRA=-DMAILFILTER='$(MAILFILTER)' -DNEWSFILTER='$(NEWSFILTER)' $(EXTRA)
X
X# SRCS is used by the saber_src and saber_obj targets; OBJS is not used by
X# any rule in this Makefile.
XSRCS = FilterMain.c FilterType.c MailFilter.c NewsFilter.c
XOBJS = FilterMain.o FilterType.o MailFilter.o NewsFilter.o
X
X# PWD is the directory name echoed by the saber_src target.
XPWD=filters
X
X# DESTDIR: where "install" copies the filters.
X# LQ:      the library that every filter is linked against.
X# MODE:    the mode given to chmod for each installed filter.
XDESTDIR=../lib
XLQ=../lib/liblq.a
XMODE=755
X
X# for compiling:
XEXTRA=-I../h
X# RANLIB is not used by any rule in this Makefile.
XRANLIB=echo
X
X# Default target: build every filter named in TARGETS.
Xall: $(TARGETS)
X
X# saber_src and saber_obj only echo the directory name; the cd/load lines
X# start with '#', so the shell treats them as comments.
X# NOTE(review): saber_obj loads $(SRCS), exactly as saber_src does -- it was
X# presumably meant to load $(OBJS); confirm before relying on it.
Xsaber_src:
X echo $(PWD)
X #cd $(PWD)
X #load $(CFLAGS) $(SRCS)
X #cd ..
X
Xsaber_obj:
X #cd $(PWD)
X #load $(CFLAGS) $(SRCS)
X #cd ..
X
X# Copy each file in LIBFILES to DESTDIR, strip it and set its mode to MODE.
Xinstall: all
X for i in $(LIBFILES); do cp "$$i" $(DESTDIR); \
X strip "$(DESTDIR)/$$i" ; \
X chmod $(MODE) "$(DESTDIR)/$$i" ; \
X done
X
X# Remove object files and other debris, but keep the built filters.
Xtidy:
X /bin/rm -f *.o core m.log tags
X
X# tidy, and then remove the built filters as well.
X# $(TEST) is not set anywhere in this Makefile.
Xclean: tidy
X /bin/rm -f $(TARGETS) $(TEST)
X
X# Regenerate the dependency list at the end of this Makefile with mkdep.
Xdepend:
X mkdep $(CFLAGS) *.c
X
X# Each filter is FilterMain.o plus one filter-specific object, linked with
X# $(MALLOC) and $(LQ).  CFilter and CDMSFilter are not listed in TARGETS, so
X# "make all" does not build them.
XCFilter: FilterMain.o CFilter.o
X $(CC) $(CFLAGS) -o CFilter FilterMain.o CFilter.o $(MALLOC) $(LQ)
X
XNewsFilter: FilterMain.o NewsFilter.o
X $(CC) $(CFLAGS) -o NewsFilter FilterMain.o NewsFilter.o $(MALLOC) $(LQ)
X
XMailFilter: FilterMain.o MailFilter.o
X $(CC) $(CFLAGS) -o MailFilter FilterMain.o MailFilter.o $(MALLOC) $(LQ)
X
XCDMSFilter: FilterMain.o CDMSFilter.o
X $(CC) $(CFLAGS) -o CDMSFilter FilterMain.o CDMSFilter.o $(MALLOC) $(LQ)
X
X#
X# $Log: Makefile,v $
X# Revision 1.4 90/10/06 00:57:26 lee
X# Prepared for first beta release.
X#
X# Revision 1.3 90/10/03 21:14:45 lee
X# Added MAILFILTER stuff.
X#
X# Revision 1.2 90/09/28 21:54:43 lee
X# No longer uses OWNER.
X#
X# Revision 1.1 90/08/09 19:17:58 lee
X# Initial revision
X
X# DO NOT PUT ANYTHING AFTER THIS LINE
X# DO NOT DELETE THIS LINE -- mkdep uses it.
X# DO NOT PUT ANYTHING AFTER THIS LINE, IT WILL GO AWAY.
X
XFilterMain.o: FilterMain.c
XMailFilter.o: MailFilter.c /usr/include/malloc.h
XMailFilter.o: ../h/wordrules.h ../h/emalloc.h
XNewsFilter.o: NewsFilter.c
XNewsFilter.o: ../h/wordrules.h ../h/emalloc.h
XTroffFilter.o: TroffFilter.c
XTroffFilter.o: ../h/wordrules.h ../h/emalloc.h
X
X# IF YOU PUT ANYTHING HERE IT WILL GO AWAY
@@@End of lq-text/src/filters/Makefile
echo x - lq-text/src/filters/NewsFilter.c 1>&2
sed 's/^X//' >lq-text/src/filters/NewsFilter.c <<'@@@End of lq-text/src/filters/NewsFilter.c'
X/* NewsFilter.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: NewsFilter.c,v 1.5 90/10/06 00:57:27 lee Rel1-10 $
X */
X
X/* Filter for usenet articles.
X * Throw away all of the header except the lines that start with one of
X * the field names listed in KeepThese[] below:
X * From
X * Keywords
X * Organi[sz]ation
X * Subject
X * Summary
X *
X * Probably ought to keep Message-ID, but I can't store it anyway!
X *
X * See FilterMain and wordrules.h for more info.
X *
X */
X
X#ifdef SYSV
X extern int _filbuf(), _flsbuf(); /* for lint! */
X#endif
X#include <stdio.h>
X#include <malloc.h>
X#include <ctype.h>
X#include "wordrules.h"
X
X#include "emalloc.h"
X
X#define STREQ(boy, girl) ((*(boy) == *(girl)) && !strcmp(boy, girl))
X
X/** C Library functions that need to be declared: **/
X#ifndef tolower
X extern int tolower();
X#endif
Xextern int strcmp();
X/** Functions in this file that need to be declared **/
Xint GetChar();
Xvoid Header(), Body();
X/** **/
X
Xextern char *progname;
Xvoid Filter();
X
Xchar *KeepThese[] = { /* these must be sorted on the first character */
X "from",
X "keywords",
X "summary",
X "subject",
X "organisation",
X "organization",
X 0
X};
X
Xint icstreq(s1, s2) /* case insensitive strcmp */
X char *s1, *s2;
X{
X register char ch1, ch2;
X
X while (*s1 && *s2) {
X if (*s1 != *s2) {
X if (isupper(*s1)) {
X ch1 = tolower(*s1);
X ch2 = (*s2);
X } else if (isupper(*s2)) {
X /* Note that we only have to test one character for case! */
X ch1 = (*s1);
X ch2 = tolower(*s2);
X } else {
X return 0; /* they are different */
X }
X if (ch1 != ch2) return 0; /* the strings differ */
X }
X s1++; s2++;
X }
X if (!*s1 && !*s2) {
X return 1; /* they are the same */
X }
X return 0; /* they are different */
X}
X
Xint
XIsWanted(String)
X char *String;
X{
X char **p;
X int ch = String[0];
X
X if (isupper(ch)) ch = tolower(ch);
X
X for (p = KeepThese; *p && **p; p++) {
X if (ch < **p) return 0; /* gone too far */
X else if (icstreq(String, *p)) return 1;
X }
X return 0;
X}
X
X/* Filter -- filter one article: first the header, with Header(), and then
X * everything after it, with Body().  Both are given the same stream and
X * the same Name.
X */
Xvoid
XFilter(InputFile, Name)
X FILE *InputFile;
X char *Name;
X{
X Header(InputFile, Name);
X Body(InputFile, Name);
X}
X
X/* Where Header() has got to within the current header line, relative to
X * the first word on that line (the word that is looked up with IsWanted()).
X */
Xtypedef enum {
X F_NotSeenAnythingYet, /* no character of the first word seen yet */
X F_InTheFirstWord, /* collecting the characters of the first word */
X F_AfterTheFirstWord /* the first word has been looked up and written */
X} t_FirstWord;
X
X/* Non-zero while the character most recently classified by Header() or
X * Body() was part of a word; GetChar() also reads it to decide what to do
X * with a single quote.
X */
Xint InWord = 0;
X
Xvoid
XHeader(InputFile, Name)
X FILE *InputFile;
X char *Name;
X{
X int AtStartOfLine = 1;
X int IgnoreLine = 1; /* initialised for lint and gcc -W really... */
X t_FirstWord FirstWord = F_NotSeenAnythingYet;
X int ch;
X static int BufLen;
X static char *Buffer = 0;
X int AtStartOfWord;
X register char *q;
X
X if (Buffer == 0) {
X BufLen = 24;
X Buffer = emalloc(BufLen);
X }
X
X q = Buffer;
X InWord = 0;
X
X while ((ch = GetChar(InputFile)) != EOF) {
X if (ch == '\n') {
X if (AtStartOfLine) { /* a blank line */
X putchar('\n');
X return;
X }
X }
X
X InWord = InWord ? WithinWord(ch) : StartsWord(ch);
X
X switch (FirstWord) {
X case F_NotSeenAnythingYet:
X if (InWord) {
X FirstWord = F_InTheFirstWord;
X if (q - Buffer >= BufLen - 1) {
X int where = q - Buffer;
X
X BufLen += 24;
X Buffer = erealloc(Buffer, BufLen);
X q = &Buffer[where];
X }
X *q++ = ch;
X } else {
X putchar(' ');
X }
X break;
X case F_InTheFirstWord:
X if (InWord) {
X if (q - Buffer >= BufLen - 1) {
X int where = q - Buffer;
X
X BufLen += 24;
X Buffer = erealloc(Buffer, BufLen += 24);
X q = &Buffer[where];
X }
X *q++ = ch;
X break;
X } else { /* reached the end of the first word on the line */
X *q = '\0';
X /* See if it's a keyword */
X if ((IgnoreLine = !IsWanted(Buffer)) != 0) {
X /* Turn the word into one that won't get indexed,
X * so that word counmts are unaffected:
X * We use qxxxxxxx (any number of x's) for this.
X */
X for (q = Buffer; *q; q++) {
X putchar((q == Buffer) ? 'q' : 'x');
X }
X putchar (ch == '\n' ? '\n' : ' ');
X } else {
X printf("%s%c", Buffer, ch == '\n' ? ch : ' ');
X }
X FirstWord = F_AfterTheFirstWord;
X }
X break;
X default:
X if ((AtStartOfLine = (ch == '\n'))) {
X IgnoreLine = 0;
X q = Buffer;
X FirstWord = F_NotSeenAnythingYet;
X AtStartOfWord = 1;
X }
X if (InWord && !IgnoreLine) {
X putchar(ch);
X } else {
X if (AtStartOfWord && InWord) {
X putchar('q');
X AtStartOfWord = 0;
X } else if (InWord) {
X putchar('x');
X } else if (isspace(ch)) {
X putchar(ch);
X } else {
X putchar(' ');
X }
X }
X if (!InWord) AtStartOfWord = 1;
X }
X if ((AtStartOfLine = (ch == '\n'))) {
X IgnoreLine = 0;
X q = Buffer;
X FirstWord = F_NotSeenAnythingYet;
X AtStartOfWord = 1;
X }
X }
X if (ch == EOF) {
X fprintf(stderr, "%s: warning: Mail folder %s has no message body\n",
X progname, Name);
X }
X}
X
Xvoid
XBody(InputFile, Name)
X FILE *InputFile;
X char *Name;
X{
X int ch;
X
X while ((ch = GetChar(InputFile)) != EOF) {
X if (InWord = InWord ? WithinWord(ch) : StartsWord(ch)) {
X putchar(ch);
X } else {
X putchar((ch == '\n') ? '\n' : ' ');
X }
X }
X}
X
X#ifdef __GNU__
Xinline
X#endif
Xint
XGetChar(fd)
X FILE *fd;
X{
X static int LastChar = 0;
X
X if (LastChar) {
X int ch = LastChar;
X LastChar = 0;
X return ch;
X }
X
X /* Only return a single quote if it is surrounded by letters */
X if ((LastChar = getc(fd)) == '\'') {
X LastChar = getc(fd);
X if (InWord && isalpha(LastChar)) return '\'';
X else return ' ';
X } else {
X int ch = LastChar;
X LastChar = 0;
X return ch;
X }
X}
X
X/*
X * $Log: NewsFilter.c,v $
X * Revision 1.5 90/10/06 00:57:27 lee
X * Prepared for first beta release.
X *
X * Revision 1.4 90/09/20 16:36:59 lee
X * Fixed icstrcmp() and IsWanted() so that the unwanted parts of headers
X * get deleted again.... (oops!)
X *
X * Revision 1.3 90/09/19 21:19:50 lee
X * Now supports turning unindexed stuff into qxxxxx-words.
X *
X * Revision 1.2 90/08/29 21:56:58 lee
X * Alpha release.
X *
X * Revision 1.1 90/08/09 19:17:57 lee
X * Initial revision
X *
X * Revision 1.2 89/09/16 21:16:01 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:05:48 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/filters/NewsFilter.c
echo x - lq-text/src/h/Liamdbm.h 1>&2
sed 's/^X//' >lq-text/src/h/Liamdbm.h <<'@@@End of lq-text/src/h/Liamdbm.h'
X/* Liamdbm.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: Liamdbm.h,v 1.2 90/10/06 02:18:14 lee Rel1-10 $
X *
X * This is used with gdbm. I have not linked with gdbm, and, if you
X * wish to do so, you must be careful not to violate any copyright
X * notices... (sigh)
X *
X * The version of gdbm for which I had a manual is rather old and had no
X * ndbm compatibility.
X */
X
X/* Map the ndbm-style names (dbm_open, dbm_fetch, ...) onto the gdbm
X * functions, so that code written for ndbm can be compiled against gdbm.
X */
X#include "gdbm.h"
Xextern datum gdbm_fetch();
Xextern datum gdbm_firstkey();
Xextern datum gdbm_nextkey();
X
X/* NOTE(review): presumably just a stand-in so that "DBM *" can be
X * declared -- confirm against the gdbm version actually in use.
X */
Xtypedef char DBM;
X
X/* The "mode" argument is dropped: gdbm_store is called with three arguments */
X#define dbm_store(db, key, data, mode) gdbm_store(db, key, data)
X/* gdbm_open is stupider than ndbm_open.... */
X/* The third argument (m) is dropped; 512 and 0 are passed to gdbm_open as
X * its second and fourth arguments.
X */
X#define dbm_open(FileName, Mode, m) gdbm_open(FileName, 512, Mode, 0)
X#define dbm_fetch gdbm_fetch
X#define dbm_close gdbm_close
X#define dbm_firstkey gdbm_firstkey
X#define dbm_nextkey gdbm_nextkey
X
X/*
X * $Log: Liamdbm.h,v $
X * Revision 1.2 90/10/06 02:18:14 lee
X * Prepared for first beta release.
X *
X *
X */
@@@End of lq-text/src/h/Liamdbm.h
echo x - lq-text/src/h/Revision.h 1>&2
sed 's/^X//' >lq-text/src/h/Revision.h <<'@@@End of lq-text/src/h/Revision.h'
X/* This header file gets updated with every distributed change to any source
X * file anywhere in the lq-text package.
X * A short description of the change is added to the Log here, too.
X * Lee.
X */
X
X#define LQTEXTREVISION "Release 1.10"
X
X/* $Revision: 1.10 $
X *
X * Revision 1.6 90/10/04 17:12:45 lee
X * lqtext now compiles and mostly works under BSD.
X * Fixes bug in phrase matching -- PhraseMatchLevel now works on one-word
X * phrases.
X *
X * Revision 1.5 90/09/28 22:19:36 lee
X * Made GetChar() a macro in lqaddfile -- speed improvement...
X *
X * Revision 1.4 90/09/20 16:37:35 lee
X * Fixed Mail and News filters so that they throw away the unwanted header
X * parts correctly.
X *
X * Revision 1.3 90/09/20 12:51:24 lee
X * Major sdbm initialisation bug fixed.
X *
X * Revision 1.2 90/09/20 11:52:35 lee
X * Fixed the filters so that lqshow highlights the right word (the qxx fix)
X *
X * Revision 1.1 90/09/20 11:52:18 lee
X * Initial revision
X *
X *
X */
@@@End of lq-text/src/h/Revision.h
echo x - lq-text/src/h/blkheader.h 1>&2
sed 's/^X//' >lq-text/src/h/blkheader.h <<'@@@End of lq-text/src/h/blkheader.h'
X/* blkheader.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X *
X * (was called blockheader.h, but this was too long on SysV for RCS)
X */
X
X/* describe the physical WordPlace database...
X *
X * $Header: /usr/src/cmd/lq-text/src/h/RCS/blkheader.h,v 1.2 90/03/20 20:57:46 lee Rel1-10 $
X *
X * $Log: blkheader.h,v $
X * Revision 1.2 90/03/20 20:57:46 lee
X * removed WID from the block. This reduces checking, but should also
X * noticeably reduce the size of the database.
X *
X * Revision 1.1 90/03/20 20:54:44 lee
X * Initial revision
X *
X */
X
X/* The header of each block -- I can't use sReadNumber, because I don't know
X * the size of NextOffset until I get to the end, and it's too late by then!
X *
X * I should really store the block offset, and not the byte offset. This
X * would save a whole byte -- I could use 3 bytes for the NextBlock!
X */
X/* The start of every block: a fixed-size link followed by the data.
X * Data is declared with one element only so that its address can be
X * taken; the data itself carries on past the end of the structure.
X */
Xtypedef struct {
X unsigned long NextOffset; /* a byte offset */
X char Data[1]; /* the address of this is where the numbers start... */
X} t_BlockHeader;
@@@End of lq-text/src/h/blkheader.h
echo x - lq-text/src/h/emalloc.h 1>&2
sed 's/^X//' >lq-text/src/h/emalloc.h <<'@@@End of lq-text/src/h/emalloc.h'
X/* emalloc.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* emalloc.h -- header file for emalloc.c, Liam Quin's malloc() wrapper
X *
X * $Id: emalloc.h,v 1.5 91/03/02 19:40:04 lee Rel1-10 $
X *
X * $Log: emalloc.h,v $
X * Revision 1.5 91/03/02 19:40:04 lee
X * Simpler version of malloc defines if MALLOCTRACE unused...
X *
X * Revision 1.4 91/03/02 18:31:21 lee
X * Simpler call to malloc wrappers if MALLOCTRACE undefined.
X *
X * Revision 1.3 90/10/06 02:18:26 lee
X * Prepared for first beta release.
X *
X * Revision 1.2 90/08/29 21:57:44 lee
X * removed most of the testing code
X *
X * Revision 1.1 90/08/09 19:14:48 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:45:20 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X *
X */
X
X/* Set by InitScr() and cleared by EndWin(), below. */
Xextern int _LiamIsInCurses;
X
X/* InitScr() calls initscr() and saves the result in _LiamIsInCurses.
X * EndWin() calls endwin(), but only if _LiamIsInCurses is set, and clears
X * the flag first; if the flag is not set, its value is 0.
X */
X#define InitScr() (_LiamIsInCurses = initscr())
X#define EndWin() (_LiamIsInCurses ? (_LiamIsInCurses = 0), endwin() : 0)
X
X/* The functions that do the work; call them through the macros below. */
Xextern char *_emalloc(), *_erealloc(), *_ecalloc();
Xextern void _efree();
X
X/* With MALLOCTRACE defined, every call also passes the name of the source
X * file and the line number of the caller; without it, the e* names are
X * simply other names for the _e* functions.
X */
X#ifdef MALLOCTRACE
X#define emalloc(u) _emalloc(u, __FILE__, __LINE__)
X#define erealloc(s, u) _erealloc(s, u, __FILE__, __LINE__)
X#define ecalloc(n, siz) _ecalloc(n, siz, __FILE__, __LINE__)
X#define efree(s) _efree(s, __FILE__, __LINE__)
X#else
X#define emalloc _emalloc
X#define erealloc _erealloc
X#define ecalloc _ecalloc
X#define efree _efree
X#endif
@@@End of lq-text/src/h/emalloc.h
echo x - lq-text/src/h/fileinfo.h 1>&2
sed 's/^X//' >lq-text/src/h/fileinfo.h <<'@@@End of lq-text/src/h/fileinfo.h'
X/* fileinfo.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* Internal structure used by NX-Text to represent an indexed file */
X
X/* Needs: sys/types.h */
X
X/* $Id: fileinfo.h,v 1.2 90/10/06 02:18:27 lee Rel1-10 $
X *
X * $Log: fileinfo.h,v $
X * Revision 1.2 90/10/06 02:18:27 lee
X * Prepared for first beta release.
X *
X * Revision 1.1 90/08/09 19:14:57 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:45:57 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:14:29 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.2 89/09/16 21:15:19 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:00:34 lee
X * Initial revision
X *
X *
X */
X
X/* A File Identifier (see the FID member of t_FileInfo) */
Xtypedef unsigned long t_FID;
X
X/* What is known about one file.
X * This needs time_t and FILE to have been declared already: sys/types.h
X * is mentioned at the top of this file, and stdio.h is presumably needed
X * as well, for FILE.
X */
Xtypedef struct {
X char *Name;
X t_FID FID; /* File Identifier */
X int FilterType; /* command to ASCIIify, 0 unknown, 1 none */
X time_t Date; /* when the file was last indexed */
X FILE *Stream; /* NOTE(review): presumably the open file, if any */
X} t_FileInfo;
X
X/* FindFile(name): a name that starts with '/' is returned as it stands;
X * any other name is passed to _FindFile().  Note that the argument is
X * evaluated twice.
X */
X#define FindFile(name) ((*(name) == '/') ? (name) : _FindFile(name))
Xextern char *_FindFile();
@@@End of lq-text/src/h/fileinfo.h
echo x - lq-text/src/h/filter.h 1>&2
sed 's/^X//' >lq-text/src/h/filter.h <<'@@@End of lq-text/src/h/filter.h'
X/* filter.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* filter.h -- define filter table for NX-Text, Liam Quin's text retrieval
X * program.
X * This table is built from a file like a simplified /etc/magic, normally
X * stored in /usr/local/lib/nx-text/lib/filtertable
X * but you can set this either here or in the Makefile.
X *
X * NEEDS: stdio.h
X *
X * $Id: filter.h,v 1.6 91/03/02 18:45:04 lee Rel1-10 $
X *
X * $Log: filter.h,v $
X * Revision 1.6 91/03/02 18:45:04 lee
X * Spell MAILFILTER correctly in the ifdef...
X *
X * Revision 1.5 90/10/13 03:11:31 lee
X * Now defines filters for easier stand-alone testing of stuff...
X *
X * Revision 1.4 90/10/06 02:18:28 lee
X * Prepared for first beta release.
X *
X * Revision 1.3 90/09/28 23:03:16 lee
X * Now use MAILFILTER and NEWSFILTER...
X *
X * Revision 1.2 90/08/29 21:57:57 lee
X * removed most of the testing code
X *
X * Revision 1.1 90/08/09 19:15:01 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:46:04 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:14:33 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X *
X */
X
X/* Filter type numbers.  Each one appears as the Type of one entry in
X * FilterTable, below, and MaxFilterType is defined as the largest.
X */
X#define FTYPE_NEWS 1
X#define FTYPE_MAIL 2
X#define FTYPE_CDMS 3
X#define FTYPE_MOSTLYASCII 4
X#define FTYPE_C_SOURCE 5
X
X/* The Type field in each array entry is so that I can do some very simple
X * checking...
X */
Xextern int fclose(), pclose();
X/* One entry of the filter table: a filter type, the function that closes
X * a stream opened for that type, and a string (the name of the filter
X * program for every entry but the first).
X */
Xstruct s_FilterTable {
X int Type;
X int (* close)(); /* how to close the darned stream */
X char *String;
X};
X/* The table is defined only by a file that defines FILTERDEF before it
X * includes this header; every other file just gets the declaration.
X */
X#ifndef FILTERDEF
Xextern struct s_FilterTable FilterTable[];
X#else
Xstruct s_FilterTable FilterTable[] = {
X { 0, fclose, 0 }, /* use fopen() */
X /* NEWSFILTER and MAILFILTER can be defined before this point, e.g. by
X  * the compiler command line, to override the default names.
X  */
X#ifndef NEWSFILTER
X# define NEWSFILTER "NewsFilter"
X#endif
X { FTYPE_NEWS, pclose, NEWSFILTER },
X#ifndef MAILFILTER
X# define MAILFILTER "MailFilter"
X#endif
X { FTYPE_MAIL, pclose, MAILFILTER },
X#ifdef FTYPE_CDMS /* CrystalWriter from Syntactics... */
X { FTYPE_CDMS, pclose, "CDMSFilter" },
X#endif
X#ifdef FTYPE_NTROFF
X { FTYPE_NTROFF, pclose, "lqderoff" }, /* not yet released, sorry */
X#endif
X { FTYPE_MOSTLYASCII, pclose, "AsciiFilter" },
X#ifdef FTYPE_C_SOURCE
X { FTYPE_C_SOURCE, pclose, "CFilter" }, /* leave me last! */
X#endif
X /* If you add more, you MUST update MaxFilterType */
X { 0, 0, 0 } /* marks the end of the table */
X};
X#endif
X#define MaxFilterType FTYPE_C_SOURCE
@@@End of lq-text/src/h/filter.h
echo x - lq-text/src/h/globals.h 1>&2
sed 's/^X//' >lq-text/src/h/globals.h <<'@@@End of lq-text/src/h/globals.h'
X/* globals.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X *
X * $Id: globals.h,v 1.6 91/02/20 19:26:53 lee Rel1-10 $
X *
X * (see Log at end of this file for change history. Keep this up to date
X * using rcs if you have it...)
X */
X
X/* globals.h -- declarations of globally accessible variables, and also
X * of configurable parameters.
X *
X * Some of the configuration options might be given in ../Makefile, so
X * you must check in there too.
X *
X * Everything that includes this file must be linked with Defaults.c
X */
X
X/*
X * DOCPATH gives the list of directories in which to search in order
X * to find files to retrieve and to index. The default can be wired
X * in here, or can be simply "." (in which case relative pathnames will
X * be from wherever one invokes the commands, and absolute pathnames
X * will be absolute). For example,
X * #define DFLTDOCPATH "/usr/man:."
X * In any case, it can be overridden by a DOCPATH line in the configuration
X * file for a given database (README in the database directory), and also
X * by an environment variable DOCPATH (the latter taking precedence over
X * the former).
X *
X * Use ((char *) 0) to disable the default -- in this case, you always have
X * to give one, either with the $DOCPATH variable or in the database file.
X *
X */
X#ifndef DFLTDOCPATH
X# define DFLTDOCPATH ((char *) 0)
X#endif
X
X/* LQTEXTDIR: if the programs can't find the directory to use -- i.e.,
X * there was no -d option and $(LQTEXTDIR) is unset, we either
X * look in UNDERHOME (if that was defined here) or in wherever LQTEXTDIR
X * is defined to point.
X */
X#ifndef LQTEXTDIR
X# define LQTEXTDIR "/usr/spool/lqtextdir"
X#endif
X
X/* If UNDERHOME is set, look there for a directory -- e.g.
X * #define UNDERHOME "sockdrawer"
X * would make lqtext programs look for a directory something like
X * /users/liam/sockdrawer
X * (where /users/liam is my login directory)
X */
X#ifndef UNDERHOME
X# define UNDERHOME "LQTEXTDIR"
X#endif
X
X/* The name of a configuration file found in the database directory:
X */
X#define CONFIGFILE "README"
X
X/* If the config file doesn't give a filename for a list of common
X * words, we look for one called DFLTCOMMONFILE (and don't mind if we
X * don't find it). Use "/dev/null" or ((char *) 0) if you want to
X * disable the default.
X * It's case sensitive, of course.
X */
X#define DFLTCOMMONFILE "CommonWords"
X
X#ifndef PAGER
X/* The default pager to use if the user doesn't set $PAGER. This is only
X * used in lqshow, the browser. Good things to try are
X * more, "less -Ce", and (generally only on System V) "pg -ns".
X * Specify an absolute path if possible. It's often a lot faster, and
X * it's somewhat safer...
X */
X# ifdef BSD
X# define PAGER "/usr/ucb/more"
X# else
X# define PAGER "/usr/bin/pg -ns"
X# endif
X#endif
X
X#ifndef DBMCREAT
X/* If you are using dbm or gdbm (?), you will need to create the dbm files
X * by hand yourself. Defining DBMCREAT as 0 makes the software do this
X * automatically, with a very slight performance penalty.
X *
X * ndbm and sdbm can use O_CREAT, so set it to 1 here for them.
X * You will also have to look at ../Makefile, ../PORTING, smalldb.h and
X * ../lqlib/smalldb.h, making whatever changes are needed.
X */
X# define DBMCREAT 1 /* 1 for ndbm, 0 for dbm */
X#endif
X
X#ifdef sparc
X# define NEEDALIGN
X#endif
X
X#ifdef mips /* e.g. SGI machines */
X# define NEEDALIGN
X#endif
X
X/* NEEDALIGN is for C compilers that require C structures to start at
X * word boundaries. You need this on sparc and sgi machines...
X */
X
X/***
X *** If you want to change anything beyond here...
X ***
X *** well, you can.
X *** After all, it's your copy.
X ***
X *** But don't come running back to me if it doesn't work!
X *** At least not until you have tried
X *** + understanding what the problem is;
X *** + looking at the source to see why;
X *** + fixing the problem;
X *** + taking off your shoes and socks and grinning for a while.
X ***
X *** Liam.
X ***
X ***/
X
X/* The following let you reconfigure the names of the files that form
X * part of the database, but there is no point in doing so unless you
X * are porting to some strange system that has absurd filename restrictions!
X */
X#ifndef WORDINDEX
X# define WORDINDEX "wordlist"
X /* This is a dbm file, so you'll get two files, one with ".pag"
X * stuck on the end and one with ".dir" on the end.
X * It contains an entry for every word in the database, enabling
X * the software to go from a word to an integer (well, a t_WID)
X * very quickly.
X * It tends to be a little over one tenth of the size of the DATABASE.
X */
X#endif
X#ifndef WIDINDEXFILE
X# define WIDINDEXFILE "WIDIndex"
X /* WIDINDEXFILE contains each word in the database, together with some
X * information and the first few bytes of data.
X * It contains WIDBLOCKSIZE bytes for every word, but this has to
X * be at least MAXWORDLEN + 10 bytes long (see WordInfo.c).
X */
X#endif
X#ifndef DATABASE
X# define DATABASE "data"
X /* For those words whose data doesn't fit into the first WIDBLOCKSIZE
X * bytes, space is allocated in this file in BLOCKSIZE chunks. Make
X * BLOCKSIZE small, or you will waste a lot of space -- on the other
X * hand, there's a 4-byte-per-block overhead at the moment.
X * This file gets very b i g indeed.
X */
X#endif
X#ifndef FILEINDEX
X# define FILEINDEX "FileList"
X /* This is a list of every file in the database, again in dbm format,
X * so there are actually two files (a .pag and a .dir) involved.
X * If your files are short, it will quickly grow to a tenth of the size
X * of the database.
X * It stores the filename, and some other information.
X */
X#endif
X#ifndef FIDFILE
X# define FIDFILE "FIDFile"
X /* This contains the largest currently used file number... you can
X * look at it to see how many files have been indexed.
X * It is only a few bytes long.
X */
X#endif
X#ifndef WIDFILE
X# define WIDFILE "WIDFile"
X /* This contains the largest currently used word number... you can
X * look at it to see how many unique words have been seen.
X * It is only a few bytes long.
X */
X#endif
X
X#ifndef WIDBLOCKSIZE
X# define WIDBLOCKSIZE 32
X/* WIDBLOCKSIZE absolutely must be large enough to fit at least one byte
X * of actual data, or all hell will break loose.
X * (actually that could be fixed...).
X * In any case, it has to contain (apart from the >= 1 byte of data):
X * + the length count (1 byte) and the word itself (no null on the end)
X * + the block number in the database (1..5 bytes)
X * + the number of matches (1..5 bytes)
X *
X * It helps efficiency very, very slightly if these are a power of two
X * bytes, as then they never cross Unix block boundaries.
X *
X */
X#endif
X
X#ifndef BLOCKSIZE
X#define BLOCKSIZE 64
X/* BLOCKSIZE is the size of blocks in the data file. There are several
X * tradeoffs:
X * + there is a 4-bytes-per-block overhead for list pointers, so it's
X * a good idea to make them large
X * + there's a bit of work involved in fetching the blocks, so things go
X * faster if they're larger...
X * + many blocks are not full, so it's a good idea to make them small.
X * On average, a little over (BLOCKSIZE - 4) / 2 bytes are wasted for
X * every word chain.
X * + since many of the blocks are not full, it's a good idea to make them
X * small, minimising the amount of extra data that gets copied around by
X * the Unix kernel. If the blocks are smaller it'll go faster...
X *
X * It helps efficiency very, very slightly if these are a power of two
X * bytes, as then they never cross Unix block boundaries.
X *
X */
X#endif
X
X/**** Some useful macros */
X
X/* STREQ(a,b) is much faster than strcmp() in the (common) case that the
X * first character of the strings differ.
X * It is due (as far as I know) to Henry Spencer, at the University of
X * Toronto Zoology Dept.,
X * utzoo!henry
X */
X#ifndef STREQ
X# define STREQ(henry,utzoo) (*(henry) == *(utzoo) && !strcmp(henry, utzoo))
X#endif
X
X/* Inline functions are functions that get expanded inline during
X * compilation -- sort of like macros with real local arguments.
X * Not all compilers support them.
X */
X#ifdef __GNUC__
X# define INLINE inline
X#else
X# define INLINE /* not supported */
X#endif
X
X/* DECL(name, type, value) declares a global variable.  In a file that
X * defines DefineThem before it includes this header, DECL expands to a
X * definition with value as its initialiser; everywhere else it expands
X * to an extern declaration, and value is not used.
X */
X#ifdef DefineThem
X# define DECL(name, type, value) type name = value
X# define EXTERN /* just define them please */
X#else
X# define EXTERN extern /* declare but do not define */
X# define DECL(name, type, value) EXTERN type name
X#endif
X
X/****/
X
X/* Now declare (or define) things: */
X
Xextern char *progname; /* from progname.c, for error messages */
X/* Each of these starts out as the configurable default of the same name
X * that is #defined earlier in this file.
X */
XDECL(CommonWordFile, char *, DFLTCOMMONFILE);
XDECL(DatabaseDir, char *, LQTEXTDIR);
XDECL(FileIndex, char *, FILEINDEX);
XDECL(WordIndex, char *, WORDINDEX);
XDECL(DataBase, char *, DATABASE);
XDECL(FidFile, char *, FIDFILE);
XDECL(WidFile, char *, WIDFILE);
XDECL(WidIndexFile, char *, WIDINDEXFILE);
XDECL(DocPath, char *, DFLTDOCPATH);
X
X/*
X * $Log: globals.h,v $
X * Revision 1.6 91/02/20 19:26:53 lee
X * Added NEEDALIGN on mips systems
X * (thanks to Mark Moraes, moraes at cs.toronto.edu)
X *
X * Revision 1.5 90/10/07 20:41:20 lee
X * Added NEEDALIGN for fussy architectures.
X *
X * Revision 1.4 90/10/06 02:21:21 lee
X * Prepared for first beta release.
X *
X * Revision 1.3 90/10/03 21:31:54 lee
X * Added definition of PAGER, which has moved here from lqshow.c
X *
X * Revision 1.2 90/08/09 19:15:03 lee
X * after BSD lint and saber-C
X *
X * Revision 1.1 90/03/23 17:32:11 lee
X * Initial revision
X *
X *
X */
@@@End of lq-text/src/h/globals.h
echo x - lq-text/src/h/numbers.h 1>&2
sed 's/^X//' >lq-text/src/h/numbers.h <<'@@@End of lq-text/src/h/numbers.h'
X/* numbers.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* ReadNumber and WriteNumber take/return a long, using a compression
X * algorithm to reduce the amount of data taken.
X *
X * The versions whose names are prefixed with an s use (char *) pointers instead.
X *
X * $Id: numbers.h,v 1.3 90/10/06 02:18:30 lee Rel1-10 $
X *
X */
X
X/* INLINE is a macro that has to be defined before this header is
X * included (globals.h defines it).
X * NOTE(review): the f versions presumably work on a stdio stream and the
X * s versions on a string -- confirm against the definitions.
X */
Xextern INLINE unsigned long fReadNumber();
Xextern INLINE unsigned long sReadNumber();
X
Xextern INLINE void fWriteNumber();
Xextern INLINE void sWriteNumber();
X
X/*
X * $Log: numbers.h,v $
X * Revision 1.3 90/10/06 02:18:30 lee
X * Prepared for first beta release.
X *
X * Revision 1.2 90/08/09 19:15:42 lee
X * after BSD lint and saber-C
X *
X * Revision 1.1 90/04/19 19:27:04 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:46:43 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 1.2 89/09/16 21:15:40 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:02 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/h/numbers.h
echo x - lq-text/src/h/pblock.h 1>&2
sed 's/^X//' >lq-text/src/h/pblock.h <<'@@@End of lq-text/src/h/pblock.h'
X/* pblock.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X#ifndef PBLOCK_H /* the matching endif is at the end of the file... */
X
X# define PBLOCK_H
X/* The physical Word Database...
X *
X * First, there is the WID (from 1 to 4 bytes)
X *
X * Then, there is a NEXT pointer (or 0).
X *
X * Then, there is a list of (FID, OFFSET) pairs.
X *
X * $Header: /usr/src/cmd/lq-text/src/h/RCS/pblock.h,v 1.2 90/08/09 19:15:45 lee Rel1-10 $
X *
X * $Log: pblock.h,v $
X * Revision 1.2 90/08/09 19:15:45 lee
X * after BSD lint and saber-C
X *
X * Revision 1.1 90/03/01 23:54:37 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:47:04 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:15:36 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.2 89/09/16 21:15:43 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:09 lee
X * Initial revision
X *
X *
X */
X
X/* One occurrence of a word.  t_FID must already have been declared.
X * NOTE(review): going by the names, BlockInFile and WordInBlock presumably
X * give the position of the word within the file FID -- confirm against
X * the code that fills this in.
X */
Xtypedef struct {
X t_FID FID;
X unsigned long BlockInFile;
X unsigned short WordInBlock;
X unsigned short Flags;
X unsigned char StuffBefore; /* preceding ignored garbage */
X} t_WordPlace;
X
X/* This structure is really only used by addfile; elsewhere arrays of
X * t_WordPlace are used.
X */
X
X/* An entry in a singly linked list: a word, one place at which it was
X * found, and a pointer to the next entry.
X */
Xtypedef struct s_WordPlaceList {
X char *Word;
X t_WordPlace WordPlace;
X struct s_WordPlaceList *Next;
X} t_WordPlaceList;
X
X/* Warning: One cannot use structure copy for a pblock! */
X
X/* This does *NOT* correspond to the physical disk layout -- see pblock.c */
X/* WordPlaces is declared with one element only; NumberOfWordPlaces is
X * presumably the number of entries actually present, which would be why
X * a plain structure copy cannot be used (see the warning above).
X * t_WID must already have been declared.
X */
Xtypedef struct {
X t_WID WID; /* for checking; */
X unsigned long ChainStart;
X unsigned long NumberOfWordPlaces;
X t_WordPlace WordPlaces[1]; /* made by joining lots of disk blocks... */
X} t_pblock;
X
X#endif
@@@End of lq-text/src/h/pblock.h
echo x - lq-text/src/h/phrase.h 1>&2
sed 's/^X//' >lq-text/src/h/phrase.h <<'@@@End of lq-text/src/h/phrase.h'
X/* phrase.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* LQ-Text -- Liam's Text Retrieval Package
X * Liam R. Quin, September 1989, and later...
X *
X * phrase.h -- data structures for handling entire phrases
X *
X */
X
X/* $Id: phrase.h,v 1.2 90/10/06 02:18:33 lee Rel1-10 $
X *
X */
X
X/* Represent a Phrase as a linked list of WordInfo pointers, plus a list
X * of matches.
X */
X
Xtypedef struct s_PhraseItem { /* One word of a phrase; items are chained through Next. */
X t_WordInfo *Word; /* the t_WordInfo (wordinfo.h) for this word */
X struct s_PhraseItem *Next; /* following item in the list */
X unsigned long SearchIndex; /* For phrase-matching */
X char *WordStart; /* pointer into original phrase */
X} t_PhraseItem;
X
Xtypedef enum {
X PCM_AnyCase, /* Ignore case entirely */
X PCM_HalfCase, /* Upper only matches upper; lower matches either */
X PCM_SameCase, /* Exact matching */
X} t_PhraseCaseMatch;
X
Xtypedef struct s_Match { /* One node in a linked list of word matches. */
X t_WID WID; /* word identifier (t_WID, wordinfo.h) */
X t_WordPlace *Where; /* points at a t_WordPlace (pblock.h); presumably where the word occurred -- confirm */
X struct s_Match *Next; /* next node in the list */
X} t_Match;
X
Xtypedef struct s_MatchList { /* Linked list whose nodes each refer to a t_Match chain. */
X t_Match *Match; /* NOTE(review): presumably the words of one matched occurrence of the phrase -- confirm */
X struct s_MatchList *Next; /* next node in the list */
X} t_MatchList;
X
X
Xtypedef struct s_Phrase { /* A phrase to look up, together with its words and any matches found. */
X t_PhraseItem *Words; /* list of words and pblocks */
X char *OriginalString; /* as supplied by the user */
X char *ModifiedString; /* after deleting short/unindexed words */
X unsigned long NumberOfMatches; /* presumably the number of entries on Matches -- confirm */
X t_MatchList *Matches; /* list of matches (t_MatchList) */
X struct s_Phrase *Next; /* for use when we're in a list of phrases... */
X unsigned short HasUnknownWords; /* NOTE(review): looks like a boolean flag -- confirm where it is set */
X} t_Phrase;
X
X/* This is for FilleList() */
Xtypedef struct s_Answer { /* Node in a singly linked list of strings. */
X char *Answer; /* the string held by this node */
X struct s_Answer *Next; /* next node in the list */
X} t_Answer;
X
X/*
X * $Log: phrase.h,v $
X * Revision 1.2 90/10/06 02:18:33 lee
X * Prepared for first beta release.
X *
X * Revision 1.1 90/08/09 19:15:49 lee
X * Initial revision
X *
X * Revision 1.1 89/09/17 23:03:37 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/h/phrase.h
echo x - lq-text/src/h/smalldb.h 1>&2
sed 's/^X//' >lq-text/src/h/smalldb.h <<'@@@End of lq-text/src/h/smalldb.h'
X/* smalldb.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: smalldb.h,v 1.3 91/03/03 00:12:56 lee Exp $
X */
X
X/* You must include fcntl.h before this file. */
X
X#ifdef ndbm /* Backend choice 1: ndbm, taken from a system header. */
X# include <ndbm.h>
X# define FoundDbmOK /* records that a backend was chosen; tested further down */
X# define NDBM
X#endif
X
X#ifdef sdbm /* Backend choice 2: sdbm, from a header found on the include path. */
X# include "sdbm.h"
X# define FoundDbmOK
X# define NDBM /* it's compatible */
X#endif
X
X#ifdef ozmahash /* Backend choice 3: ozmahash. */
X# include "ozmadbm.h"
X# define FoundDbmOK
X# define NDBM /* it's compatible as well... */
X#endif
X
X#ifndef FoundDbmOK /* none of ndbm/sdbm/ozmahash was defined: fall back to Liamdbm.h */
X# include "Liamdbm.h"
X#endif
X
X#ifndef O_RDWR /* O_RDWR is used by the startdb() macro below; pull in fcntl.h if it is still missing */
X# include <fcntl.h>
X#endif
X
X#define CACHE 2 /* size of DBM cache in startdb() -- I only use two! */
X/* If you rip out the dbm cache stuff for use elsewhere, increase the 2
X * to something like 5 or so!!! Each entry uses two file pointers.
X * Lee
X */
X
X#ifndef CACHE /* never taken while CACHE is defined unconditionally just above */
X# define startdb(FilePrefix) dbm_open(FilePrefix, O_RDWR|O_CREAT, 0640)
X# define enddb(db) { if (db) dbm_close(db); } /* NOTE(review): expands to a braced block, so "enddb(db);" directly before an else will not compile */
X#endif
X
X
X#ifndef startdb /* no macro form was defined above, so startdb() is a function defined elsewhere */
XDBM *startdb();
X#endif
X
X#ifndef enddb /* likewise for enddb: pick a no-op macro or an external function */
X# ifdef CACHE
X# define enddb(db) /* nothing to do, because of the cache */
X# else
X void enddb();
X# endif /* CACHE */
X#endif /* !enddb */
X
X/*
X * $Log: smalldb.h,v $
X * Revision 1.3 91/03/03 00:12:56 lee
X * Integrated ozmahash.
X *
X * Revision 1.2 90/10/06 02:18:36 lee
X * Prepared for first beta release.
X *
X * Revision 1.1 90/08/09 19:16:00 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:47:19 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:16:01 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.2 89/09/16 21:15:45 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:12 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/h/smalldb.h
echo x - lq-text/src/h/wordindex.h 1>&2
sed 's/^X//' >lq-text/src/h/wordindex.h <<'@@@End of lq-text/src/h/wordindex.h'
X/* wordindex.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* (this file is currently empty, but might return...) */
X
X/*
X * $Id: wordindex.h,v 1.2 90/10/06 02:18:38 lee Rel1-10 $
X *
X * $Log: wordindex.h,v $
X * Revision 1.2 90/10/06 02:18:38 lee
X * Prepared for first beta release.
X *
X * Revision 1.1 90/08/09 19:16:02 lee
X * Initial revision
X *
X * Revision 2.1 89/10/02 01:16:06 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.2 89/09/16 21:15:47 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:13 lee
X * Initial revision
X *
X *
X */
@@@End of lq-text/src/h/wordindex.h
echo x - lq-text/src/h/wordinfo.h 1>&2
sed 's/^X//' >lq-text/src/h/wordinfo.h <<'@@@End of lq-text/src/h/wordinfo.h'
X/* wordinfo.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/*
X * $Id: wordinfo.h,v 1.3 90/10/06 02:21:30 lee Rel1-10 $
X */
X
Xtypedef unsigned long t_WID; /* Word Identifier type; declared before pblock.h is included because pblock.h uses t_WID */
X
X#ifndef PBLOCK_H /* pblock.h defines PBLOCK_H, so this avoids including it a second time */
X# include "pblock.h"
X#endif
X
X#ifndef WIDBLOCKSIZE /* default value; a definition made before this header takes precedence */
X#define WIDBLOCKSIZE 32
X#endif
X
Xextern char *WidIndexFile; /* Default.c */
X
X/* this is a hack for speed: */
X#define GetNextWID SpoofGetNextWID /* after this line every use of GetNextWID names SpoofGetNextWID instead */
X
X/** A t_WordInfo describes a single word, in terms of
X ** where it came from
X ** how to find its database entries
X ** how to find the in-core database entries (a copy of the above)
X **/
X
X/* There would be a performance benefit if this struct was smaller.
X * It was foolish of me to use WordInfo for so many different things in
X * addfile, and now I pay the price.
X * Addfile may end up calling malloc for 10,000 of these things...
X *
X * There should be:
X * t_WordPlace (exists, pblock.h)
X * for recording a specific occurrence of a given word in a given file
X * t_WordInfo (definition follows... look down...)
X * for recording information about a WID's entry in the database
X * t_WordPlaceList
X * for addfile to make a list of word places...
X * t_pblock (exists, see pblock.h)
X * for containing the list of WordPlaces found in the database for a
X * given word, or for putting them there. Uses arrays rather than
X * lists to squeeze a few extra milliseconds. Some hope :-( :-)
X *
X * t_WordPlaceList will almost certainly happen in the next major edit phase...
X * t_WordInfo will then be somewhat smaller.
X * All of the entries marked with a leading comment (below) should
X * be elsewhere (and some of them were, in the Grand Design!).
X *
X */
Xtypedef struct s_WordInfo { /* Everything known about one word: its text, its WID, and its index data. */
X char *Word; /* the word itself */
X t_WID WID; /* My Word Identifier */
X unsigned long NumberOfWordPlaces; /* total */
X t_FID FID; /* where we got it from */
X unsigned long Offset; /* word entry position in the data base */
X struct s_WordInfo *Next; /* for making lists of WordInfo structs */
X char *DataBlock; /* for writing me out to the index */
X char *WordPlaceStart; /* NOTE(review): presumably points into DataBlock where the word places begin -- confirm */
X t_WordPlace *WordPlaces; /* first few pairs */
X t_WordPlace WordPlace; /* For addfile -- this is due to go!!!! */
X /* shorts are at the end to obviate alignment padding... */
X unsigned long WordPlacesInHere; /* NOTE(review): presumably how many entries WordPlaces holds, as opposed to the total above -- confirm */
X unsigned short Length; /* Word length; reduce the need for strlen */
X#if 0 /* disabled: Flags is not currently a member of this struct */
X unsigned char Flags;
X /* Flags serve two purposes:
X * the LSB says whether the entry is sorted.
X * the remainder are a logical AND of all entries in a sorted
X * block. NOTE: if the block is unsorted, the other bits should
X * still be up to date.
X */
X#endif
X} t_WordInfo;
X
X/*
X * $Log: wordinfo.h,v $
X * Revision 1.3 90/10/06 02:21:30 lee
X * Prepared for first beta release.
X *
X * Revision 1.2 90/08/09 19:16:04 lee
X * after BSD lint and saber-C
X *
X * Revision 2.2 89/10/08 20:47:27 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:16:15 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.3 89/09/17 23:04:52 lee
X * Various fixes; NumberInBlock now a short...
X *
X * Revision 1.2 89/09/16 21:15:49 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:16 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/h/wordinfo.h
echo x - lq-text/src/h/wordrules.h 1>&2
sed 's/^X//' >lq-text/src/h/wordrules.h <<'@@@End of lq-text/src/h/wordrules.h'
X/* wordrules.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: wordrules.h,v 1.2 90/10/06 02:18:39 lee Rel1-10 $
X *
X */
X
X/* Rules for determining what an indexable word looks like;
X * These are implemented by the various filters, as well as by
X * the indexing software itself. This means that the filters
X * don't need to keep track of word lengths, as addfile will do this,
X * but that they should not emit non-word stuff if they can help it,
X * turning it into the equivalent amount (in bytes) of white-space
X * instead.
X * They should also turn words they don't want indexed into "qxxx",
X * with the right number of x's (e.g. "bare" --> "qxxx").
X */
X
X/* A "word" is a letter followed by any combination of
X * letters, digits or '_'. An embedded (not trailing) ' is also allowed
X * (_ is allowed so that one can index programming languages; strictly
X * speaking, a lot of languages allow _ at the start too, but I don't
X * want to get confused by nroff output etc., which contains lines of
X * underscores)
X *
X * This scheme currently excludes numbers...
X * 31, 31.4 and 31.9e4 will all be ignored. So will 1987.
X */
X
X#define StartsWord(ch) isalpha(ch)
X#define WithinWord(ch) (isalnum(ch) || (ch == '_') || (ch == '\''))
X#define EndsWord(ch) isalnum(ch)
X
X/* Don't index words unless they are at least MinWordLength characters
X * long!
X */
X#define MinWordLength 3 /* words shorter than this are not indexed */
X#define MaxWordLength 18 /* truncate words to this */
X/* The Following is for *.WordPlace.BlockInFile. If words are constrained
X * to be 3 or more characters long, there can be at most
X * (FileBlockSize / 4) of them in a block (since words must be separated
X * by at least one character).
X * Hence, 7 bits, which allows 0..127 giving 128 distinct values,
X * gives us a block that is 128 * (MinWordLength + 1) bytes long.
X */
X#define FileBlockSize (128 * (MinWordLength + 1)) /* evaluates to 512 while MinWordLength is 3 */
X
X/* WordPlace Flags:
X * When a plural word is found, or a possessive word, it is reduced to
X * being singular, and flags are set appropriately.
X * Also, a flag is set to say if the word started with a Capital Letter.
X * This puts Window, windows, and Window's all together, but enables them
X * to be differentiated for searching if required.
X * These flags are implemented by WordInfo and addfile, not by the various
X * filters, but the filters must preserve capitalisation of the first letter
X * in each word, and pass through apostrophes within words (like this's).
X */
X
X#define WPF_WASPLURAL 0001 /* The word... ended in s  (all WPF_ values are octal, one bit each) */
X#define WPF_UPPERCASE 0002 /* ...Started with a capital letter */
X#define WPF_POSSESSIVE 0004 /* ...ended in 's */
X#define WPF_ENDEDINING 0010 /* ...ended in ing (the note below says this is currently unused) */
X#define WPF_LASTWASCOMMON 0020 /* the previous word was common */
X#define WPF_LASTHADLETTERS 0040 /* we skipped some letters to get here */
X#define WPF_HASSTUFFBEFORE 0100 /* Other than 1 byte of garbage before */
X#define WPF_LASTINBLOCK 0200 /* I'm the last word in this block; 0200 is the eighth and highest bit used */
X
X/* new note (jan 90):
X * You can't currently have both plural and possessive in the most common case
X * of the boys' muddy feet (for example), as the trailing ' gets deleted.
X * this doesn't matter, but perhaps that combination should be reserved for
X * had-another-standard-ending??? e.g. -ed or -ing, that isn't often followed by
X * -s or -'s...
X *
X * Also, ENDEDINING (ended in "ing") is currently unused entirely.
X * Perhaps if it is set, the plural and possessive bits should index which of
X * four endings was found, although this would preclude special-casing of the
X * s's combination. Probably better that way.
X *
X * I should very much like to have another flag or two, perhaps embedded in
X * one of the other fields. This might be feasible if there is a pre-scan
X * when the index is written to determine the most common (modal) flags and
X * distance (currently I assume 1) and to omit these whenever they are the default.
X * In this case, the fact that every occurrence of Jesus starts with a capital
X * letter (and ends in -s, *blush*), can still lead to most of the flags being
X * omitted.
X *
X * The next revision will separate the list of FIDs from the rest of the information,
X * in which case the embedding of the flags becomes a little trickier. This
X * belongs in the TODO file now, sorry.
X *
X * Liam Quin, January 22nd 1990, at home in Warrington, England (ugh)
X *
X */
X
X/*
X * $Log: wordrules.h,v $
X * Revision 1.2 90/10/06 02:18:39 lee
X * Prepared for first beta release.
X *
X * Revision 1.1 90/08/09 19:16:05 lee
X * Initial revision
X *
X * Revision 2.2 89/10/08 20:47:35 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:16:19 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.2 89/09/16 21:15:52 lee
X * First demonstratable version.
X *
X * Revision 1.1 89/09/07 21:06:17 lee
X * Initial revision
X *
X */
@@@End of lq-text/src/h/wordrules.h
echo end of part 02
--
Liam R. E. Quin, lee at sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337
More information about the Alt.sources
mailing list