lq-text Full Text Retrieval Database Part 04/13
Liam R. E. Quin
lee at sq.sq.com
Mon Mar 4 12:03:16 AEST 1991
: cut here --- cut here --
: To unbundle, sh this file
#! /bin/sh
: part 04
echo x - lq-text/src/liblqtext/Phrase.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/Phrase.c <<'@@@End of lq-text/src/liblqtext/Phrase.c'
X/* Phrase.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/*
X * Deal with (WID, FID, Offfset) triples
X * Liam Quin, September 1989
X *
X * $Id: Phrase.c,v 1.11 91/02/22 18:22:59 lee Rel1-10 $
X *
X * $Log: Phrase.c,v $
X * Revision 1.11 91/02/22 18:22:59 lee
X * Improved a trace message.
X *
X * Revision 1.10 90/10/06 00:11:57 lee
X * Prepared for first beta release.
X *
X * Revision 1.9 90/10/04 17:10:43 lee
X * Case matching now works on one-word phrases.
X *
X * Revision 1.8 90/10/03 20:47:06 lee
X * changed an int to an unsigned long
X *
X * Revision 1.7 90/08/29 21:46:39 lee
X * Alpha release.
X *
X * Revision 1.6 90/03/29 19:59:05 lee
X * Now passes gcc -Wall
X *
X * Revision 1.5 90/03/19 00:02:23 lee
X * Simplified phrase matching greatly by adding new routine, and also
X * improved checking of word flags and generation of ModifiedString,
X * the canonical phrase, in String2Phrase.
X *
X * Revision 1.4 90/03/16 22:43:15 lee
X * After Richard's consummate help...
X * fixed two severe bugs in matching, and started to simplify
X * MakeMatches().
X *
X * Revision 1.3 90/03/14 21:02:14 lee
X * Before Richard...
X *
X * Revision 1.2 90/03/12 22:39:30 lee
X * prepared for general release.
X *
X * Revision 1.1 89/09/17 23:01:34 lee
X * Initial revision
X *
X *
X */
X
X/** Unix system calls that need to be declared: **/
Xextern void exit();
X/** Unix/C Library Functions: **/
Xextern unsigned int sleep();
X#ifndef tolower
X extern int tolower();
X#endif
Xextern int strlen();
Xextern char *strcpy();
X/** lqtext functions: **/
Xextern int TooCommon();
Xextern char *UnFlag();
X/** **/
X
X#include "globals.h" /* defines and declarations for database filenames */
X
X#include <stdio.h> /* stderr, also for fileinfo.h */
X#include <fcntl.h>
X#include <malloc.h>
X#include <sys/types.h>
X#include <ctype.h>
X
X#include "fileinfo.h" /* for wordinfo.h */
X#include "wordinfo.h"
X#include "pblock.h"
X#include "phrase.h"
X#include "wordrules.h"
X
X#include "emalloc.h"
X
X#ifndef STREQ
X# define STREQ(boy,girl) ((*(boy) == *(girl)) && (!strcmp((boy),(girl))))
X#endif
X
X#ifndef new
X# define new(type) ((type *) emalloc(sizeof (type)))
X#endif
X
X#ifndef MAXPHRASELEN
X# define MAXPHRASELEN 2000
X#endif
X
Xextern int AsciiTrace;
X
Xt_Phrase *
XString2Phrase(String)
X char *String;
X{
X extern t_WordInfo *WID2WordInfo();
X extern t_WID Word2WID();
X extern char *WordRoot();
X
X t_Phrase *Result;
X t_PhraseItem **ThisWord;
X /* (* 3 because in the worst case, "a a a" expands to "[a] [a] [a]") */
X register char *p;
X register char *q;
X char *LastStart = 0;
X char *PrevLastEnd = (char *) 0;
X int InWord = 0;
X int Flags = 0;
X int FoundLetters = 0;
X
X if (AsciiTrace > 50) {
X fprintf(stderr, "String2Phrase(%s)\n", String);
X }
X Result = (t_Phrase *) emalloc(sizeof(t_Phrase));
X Result->Next = (t_Phrase *) 0;
X p = Result->ModifiedString = emalloc(strlen(String) * 3 + 1);
X
X Result->HasUnknownWords = 0;
X
X *(ThisWord = &Result->Words) = (t_PhraseItem *) 0;
X
X /* March along the supplied phrase, looking for keywords.
X * surround unindexed or short words with [brackets].
X * Also converts to lower case and strips plurals.
X */
X for (q = String; /*LOTSOFTIMES*/; q++) {
X
X if (AsciiTrace > 50) {
X fputc(*q, stderr);
X }
X
X if (!InWord && !StartsWord(*q)) {
X if (!*q) {
X break;
X } else {
X if (!LastStart) continue;
X }
X }
X
X if (!InWord) {
X LastStart = q;
X if (StartsWord(*q)) {
X InWord = 1;
X }
X continue;
X } else if (isalpha(*q)) {
X /* in a word and found letters, so remember in case we skip
X * this word...
X */
X FoundLetters = 1;
X }
X
X /* ASSERT: inword == 1 */
X
X if (*q == '\'') {
X if (!WithinWord(q[1])) {
X InWord = 0;
X }
X }
X
X if (!*q || !WithinWord(*q)) {
X InWord = 0;
X }
X
X
X if (LastStart && !InWord) {
X int Length = q - LastStart;
X int UsedABracket = 0;
X
X if (p > Result->ModifiedString) *p++ = ' ';
X
X /* we have reached the end of a word, is it long enough? */
X if (!FoundLetters) {
X *p++ = '[';
X UsedABracket = 1;
X } else if (Length < MinWordLength) {
X *p++ = '[';
X UsedABracket = 1;
X if (FoundLetters) {
X Flags |= WPF_LASTHADLETTERS;
X FoundLetters = 0;
X }
X } else {
X t_WID WID;
X t_WordInfo *W;
X char SaveEnd = (*q);
X t_WordInfo TryRoot;
X register char *p2;
X char RootBuffer[MaxWordLength + 1];
X char *R = RootBuffer;
X
X /* Add the word to the chain, too: */
X *q = '\0';
X
X FoundLetters = 0; /* unnecessary now (?) */
X TryRoot.Length = Length;
X TryRoot.WordPlace.Flags = Flags;
X if (isupper(*LastStart)) {
X TryRoot.WordPlace.Flags |= WPF_UPPERCASE;
X }
X Flags = 0;
X
X for (p2 = LastStart; *p2; p2++) {
X *R++ = isupper(*p2) ? tolower(*p2) : *p2;
X }
X *R = '\0';
X TryRoot.Word = RootBuffer;
X
X R = WordRoot(&TryRoot);
X
X if (TooCommon(&TryRoot)) {
X *p++ = '[';
X *p++ = '*';
X UsedABracket = 1;
X Flags |= WPF_LASTWASCOMMON;
X if (AsciiTrace > 10) {
X fprintf(stderr, " Common(%s) ", TryRoot.Word);
X }
X } else if (!(WID = Word2WID(TryRoot.Word, TryRoot.Length)) ||
X (W = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
X *p++ = '[';
X *p++ = WID ? '@' : '?';
X UsedABracket = 1;
X if (AsciiTrace > 10) {
X fprintf(stderr, " Unknown(%s) ", TryRoot.Word);
X }
X Result->HasUnknownWords++;
X } else {
X if ((*ThisWord = new(t_PhraseItem)) == (t_PhraseItem *) 0) {
X fprintf(stderr,
X "Not enough memory for PHRASE \"%s\"", String);
X return (t_Phrase *) 0;
X }
X W->WordPlace.Flags |= TryRoot.WordPlace.Flags;
X if (PrevLastEnd == (char *) 0) {
X W->WordPlace.StuffBefore = 0;
X } else {
X W->WordPlace.StuffBefore = LastStart - PrevLastEnd;
X }
X PrevLastEnd = &q[1];
X
X if (AsciiTrace) {
X fprintf(stderr, "Word %s --> %s, %lu matches\n",
X LastStart,
X UnFlag(W, W->WordPlace.Flags),
X W->NumberOfWordPlaces);
X }
X /* point to the new space */
X (*ThisWord)->Word = W;
X (*ThisWord)->WordStart = LastStart;
X (*ThisWord)->Next = (t_PhraseItem *) 0;
X (*ThisWord)->SearchIndex = 0L;
X ThisWord = &(*ThisWord)->Next;
X
X /** (void) strcpy(p, R); **/
X /** p += TryRoot.Length; **/
X
X (void) strcpy(p, LastStart);
X p += q - LastStart; /* q points one beyond the end */
X
X LastStart = q;
X
X }
X *q = SaveEnd;
X }
X while (LastStart < q) {
X *p++ = *LastStart++;
X }
X if (UsedABracket) {
X *p++ = ']';
X }
X LastStart = 0;
X } /* if */
X if (!*q) break;
X } /* for */
X *p= '\0';
X
X if (ThisWord == &Result->Words) {
X /* There were no words in the phrase! */
X return (t_Phrase *) 0;
X }
X
X Result->OriginalString = emalloc(q - String + 2);
X (void) strcpy(Result->OriginalString, String);
X
X Result->NumberOfMatches = 0;
X Result->Matches = (t_MatchList *) 0;
X if (AsciiTrace > 1) {
X fprintf(stderr, "phrase \"%s\",\n", Result->OriginalString, String);
X fprintf(stderr, "Canonical form \"%s\"\n", Result->ModifiedString);
X }
X return Result;
X}
X
X#define MaxDistance 20
X
Xt_Answer *
XGetFiles(Phrase)
X t_Phrase *Phrase;
X{
X char *MakeOneDescription();
X
X t_Answer *Result = 0;
X t_Answer **RP = &Result;
X t_MatchList *MP;
X t_FID LastFID;
X unsigned long ThisFIDNumberOfMatches = 0L;
X
X if (!Phrase || !Phrase->Matches) {
X return Result;
X }
X
X LastFID = Phrase->Matches->Match->Where->FID;
X
X for (MP = Phrase->Matches; MP; MP = MP->Next) {
X if (MP->Match->Where->FID != LastFID) {
X char *p;
X
X p = MakeOneDescription(LastFID, ThisFIDNumberOfMatches);
X
X if ((*RP = new(t_Answer)) == (t_Answer *) 0) {
X return Result;
X }
X (*RP)->Answer = p;
X RP = &(*RP)->Next;
X *RP = (t_Answer *) 0;
X ThisFIDNumberOfMatches = 0L;
X } else {
X ++ThisFIDNumberOfMatches;
X }
X
X LastFID = MP->Match->Where->FID;
X }
X
X if (ThisFIDNumberOfMatches) {
X char *p = MakeOneDescription(LastFID, ThisFIDNumberOfMatches);
X
X if ((*RP = new(t_Answer)) == (t_Answer *) 0) {
X return Result;
X }
X (*RP)->Answer = p;
X RP = &(*RP)->Next;
X *RP = (t_Answer *) 0;
X ThisFIDNumberOfMatches = 0L;
X }
X
X return Result;
X}
X
Xchar *
XMakeOneDescription(FID, NumberOfMatches)
X t_FID FID;
X unsigned long NumberOfMatches;
X{
X extern char *ctime();
X extern t_FileInfo *GetFileInfo();
X
X char *Date;
X char *p;
X t_FileInfo *FileInfo;
X char NumBuf[20];
X
X if (!FID) return (char *) 0;
X
X if (!(FileInfo = GetFileInfo(FID))) return (char *) 0;
X
X Date = ctime(&(FileInfo->Date));
X /** Tue Oct 3 00:57:11 BST 1989 **/
X Date[10] = '\0';
X
X (void) sprintf(NumBuf, "%lu", NumberOfMatches);
X
X p = emalloc((unsigned) (strlen(FileInfo->Name) + strlen(NumBuf) + 11));
X (void) sprintf(p, "%-.5s %s %s", NumBuf, Date, FileInfo->Name);
X efree(FileInfo->Name);
X efree(FileInfo);
X return p;
X}
X
Xvoid
XResetPhraseMatch(Phrase)
X t_Phrase *Phrase;
X{
X t_PhraseItem *Word;
X
X if (!Phrase || !Phrase->Words) return;
X
X for (Word = Phrase->Words; Word; Word = Word->Next) {
X Word->SearchIndex = 0;
X }
X Phrase->NumberOfMatches = 0;
X}
X
X/* Default is to check case, etc. only if given in the input phrase.
X * This is an enum from phrase.h, and only used in MakeMatches().
X */
X
Xextern t_PhraseCaseMatch PhraseMatchLevel;
X
Xlong
XMakeMatches(Phrase)
X t_Phrase *Phrase;
X{
X /* Each word has a pointer (SearchIndex) to the last Word Place
X * that was examined. This enables an O(NumberOfWords) search instead
X * of O(NumberOfWords * NumberOfWords) search.
X */
X static int ContinuesMatch();
X
X unsigned long PIFB; /* PlaceInFirstBlock */
X t_MatchList **MLPP = &(Phrase->Matches);
X t_Match **MPP;
X t_Match **OldMatch;
X t_WordPlace *pp;
X t_PhraseItem *Word;
X long Result = 0L;
X long LastResult = (-1L); /* to detect new matches */
X t_PhraseItem *LeastWord;
X int HowGood;
X
X if (!Phrase) {
X return 0L;
X }
X
X ResetPhraseMatch(Phrase);
X /* Each iteration over this list either produces a match or rejects a
X * possible phrase starting place.
X */
X
X if (AsciiTrace > 1) {
X fprintf(stderr, "Match(%s)\n", Phrase->ModifiedString);
X }
X
X /* A phrase with garbage words can't match anything */
X if (Phrase->HasUnknownWords && PhraseMatchLevel != PCM_AnyCase) {
X return 0L;
X }
X
X /* Ensure that the matches for the first word have been read */
X if (Phrase->Words->Word->WordPlacesInHere <
X Phrase->Words->Word->NumberOfWordPlaces) {
X extern t_WordPlace *GetWordPlaces();
X t_WordInfo *W = Phrase->Words->Word; /* less indirection! */
X
X if (W->WordPlaces) {
X (void) efree(W->WordPlaces);
X }
X
X W->WordPlaces = GetWordPlaces(
X W->WID,
X W->WordPlaceStart,
X (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock),
X W->Offset,
X W->NumberOfWordPlaces
X );
X W->WordPlacesInHere = W->NumberOfWordPlaces;
X }
X
X /* Find the word in the phrase with least matches: */
X LeastWord = Phrase->Words;
X for (Word = Phrase->Words; Word; Word = Word->Next) {
X if (Word->Word->NumberOfWordPlaces <
X LeastWord->Word->NumberOfWordPlaces) {
X LeastWord = Word;
X }
X }
X
X /* For each match in the first word in the phrase: */
X for (PIFB = 0; PIFB < Phrase->Words->Word->NumberOfWordPlaces; PIFB++) {
X t_WordPlace *LastFOP = (t_WordPlace *) 0;
X
X /* The idea is that the next two loops are are likely to reduce
X * considerably the number of places we have to consider in the
X * case that the first word in the phrase has a lot of matches
X * and there is a subsequent word with relatively few matches.
X * Experiments suggest that this is fairly common.
X *
X * This is still a nearly (i.e. slightly-better-than) linear
X * algorithm w.r.t the total number of matches in all of the
X * words added up. Note that I alter LeastWord->SearchIndex in
X * one of the two loops that follow, so when WordPlaces from that
X * word are considered, we don't have to look at any twice.
X *
X * In order to do better, one would have to be able to avoid
X * looking at some or (better!) most of the WordPlaces.
X *
X * For example, not fetching so many from disk:
X * if we didn't do the fetches until we needed to, and we gave
X * GetWordPlaces a minimum FID to look for, we might be able
X * to reduce things by (say) 15%.
X * If all of the FIDS were stored separately, we would not
X * have to look at the (Block, Word, Flags, StuffBefore) stuff at
X * all, and that would be much faster. One way to do that might be
X * to store the list of FIDs with the word (as now), and perhaps
X * some flags and the count of words/fid, and to store the rest
X * in a per-file data structure.
X *
X * That would be a major, major hack...
X * ... sigh.o
X *
X */
X
X while (LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID <
X Phrase->Words->Word->WordPlaces[PIFB].FID) {
X if (++(LeastWord->SearchIndex) >=
X LeastWord->Word->NumberOfWordPlaces) {
X goto GiveUp;
X }
X }
X
X while (Phrase->Words->Word->WordPlaces[PIFB].FID <
X LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID) {
X if (++PIFB >= Phrase->Words->Word->NumberOfWordPlaces) {
X goto GiveUp;
X }
X }
X
X /* The following comment tells Sabre_C not to moan about "if (0)" */
X /*SUPPRESS558*/
X if (0) {
XGiveUp:
X break;
X }
X /* end of attempted speed improvement */
X
X /* Optimistically allocate a new match: */
X if (1 || Result != LastResult) {
X *MLPP = (t_MatchList *) emalloc(sizeof(t_MatchList));
X (*MLPP)->Match = (t_Match *) 0;
X OldMatch = MPP = &((*MLPP)->Match);
X MLPP = &(*MLPP)->Next;
X *MLPP = (t_MatchList *) 0;
X }
X LastResult = Result;
X
X pp = &Phrase->Words->Word->WordPlaces[Phrase->Words->SearchIndex = PIFB];
X /* When we have a partially completed match,
X * FOP (declared below) will point to the WordPlace currently
X * being considered to see if it extends the partial match;
X * LastFOP points to the previous WordPlace in the match.
X */
X
X /* For each word in the phrase: */
X for (Word = Phrase->Words; Word; Word = Word->Next) {
X int GotOne = 0;
X
X /* Ensure that the matches word have been read */
X if (Word->Word->WordPlacesInHere <
X Word->Word->NumberOfWordPlaces) {
X extern t_WordPlace *GetWordPlaces();
X t_WordInfo *W = Word->Word; /* less indirection! */
X
X if (W->WordPlaces) {
X (void) efree(W->WordPlaces);
X }
X W->WordPlaces = GetWordPlaces(
X W->WID,
X W->WordPlaceStart,
X (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock),
X W->Offset,
X W->NumberOfWordPlaces
X );
X W->WordPlacesInHere = W->NumberOfWordPlaces;
X }
X
X /* For each occurrence of that word: */
X for (; Word->SearchIndex < Word->Word->NumberOfWordPlaces;
X ++Word->SearchIndex) {
X register t_WordPlace *FOP =
X &Word->Word->WordPlaces[Word->SearchIndex];
X
X#if 0
X /* Speedup -- binary search to find next candidate...
X * this is commented out because it actually seems to
X * make things run slower!
X */
X {
X int low = Word->SearchIndex;
X int high = Word->Word->NumberOfWordPlaces - 1;
X t_WordPlace *Places = Word->Word->WordPlaces;
X int guess = (high + low) / 2;
X
X while (low < high) {
X if (Places[guess].FID < pp->FID) {
X /* not gone far enough */
X low = guess + 1;
X } else {
X high = guess;
X }
X guess = (high + low) / 2;
X }
X if (guess != Word->SearchIndex) {
X Word->SearchIndex = guess;
X FOP = &Word->Word->WordPlaces[Word->SearchIndex];
X }
X }
X#endif
X
X if (!LastFOP) {
X LastFOP = FOP;
X }
X
X /** So:
X ** | int PIFB = each match in the first word in the phrase
X ** | t_WordPlace *pp = each match in the phrase
X ** | t_PhraseItem *Word = each word in the phrase
X ** | unsigned SearchIndex = each match of that word
X ** | t_WordPlace *FOP = each occurrence of that word
X **
X ** Hence, we are comparing pp and FOP, hoping that each time
X ** round the (Word) loop we will advance FOP.
X ** Once we have decided that FOP and pp relate to the
X ** same file and that FOP is no earlier than pp in the
X ** file, we must then check that FOP is advancing the
X ** chain by comparing it to the previous element in the
X ** list (LastFOP).
X **
X ** When we break from this inner list, we must either have
X ** eliminated this particular (PIFB) as starting a match-
X ** chain, or have decided that we have extended the
X ** current match chain (by setting GotOne).
X **/
X
X
X if (LastFOP == FOP) {
X HowGood = CheckFlags(Word->Word, FOP);
X } else {
X HowGood = ContinuesMatch(Word->Word, pp, LastFOP, FOP);
X }
X
X switch (HowGood) {
X case 0:
X /* G O T C H A !!!! */
X /* extend the HitList, since it's OK so far. */
X
X *MPP = (t_Match *) emalloc(sizeof(t_Match));
X (*MPP)->WID = Word->Word->WID;
X (*MPP)->Where = FOP;
X (*MPP)->Next = (t_Match *) 0;
X MPP = &(*MPP)->Next;
X GotOne++;
X break;
X case 1: /* gone too far */
X if (AsciiTrace > 10) {
X t_WordInfo WW;
X
X WW = *(Word->Word);
X
X if (LastFOP == FOP) {
X /* UnFlag() returns a pointer to a static buffer,
X * so I have to use two printf() calls here.
X */
X fprintf(stderr, "Reject(%s (%d) != ",
X UnFlag(&WW, WW.WordPlace.Flags),
X WW.WordPlace.Flags);
X fprintf(stderr, "%s (%d)) [flags]\n",
X UnFlag(&WW, FOP->Flags), FOP->Flags);
X } else {
X fprintf(stderr, "Reject(%s) -- too far\n",
X UnFlag(&WW, WW.WordPlace.Flags));
X }
X }
X break;
X case -1:
X continue; /* not there yet */
X default:
X fprintf(stderr, "\n\rInternal Error %s: %d\n", __FILE__,
X __LINE__ - 1);
X (void) sleep(4); /* for curses stuff... */
X exit(1);
X }
X
X /* Remember where we got up to... so that we can extend
X * the list when we start looking at the next word.
X */
X LastFOP = FOP;
X
X if (AsciiTrace >= 4) {
X t_WordInfo WW;
X
X WW = *(Word->Word);
X /* UnFlag() returns a pointer to a static buffer */
X fprintf(stderr, "Partial match %s",
X UnFlag(&WW, Word->Word->WordPlace.Flags));
X fprintf(stderr, "(Word (%s,%lu,%u) in file %lu)\n",
X UnFlag(&WW, FOP->Flags),
X FOP->BlockInFile, FOP->WordInBlock,
X FOP->FID
X );
X }
X /* If we got to here, we extended the list, which is fine;
X * otherwise, if we hit a continue, we try to carry on
X * looking at matches of this word, and if we hit a break
X * before we set "GotOne", we give up on this match
X * altogether.
X */
X break;
X } /* For each occurrence of that word: */
X
X if (!GotOne) {
X t_Match *MP;
X /* This word isn't here, so neither is the phrase found
X * in this file starting here.
X */
X
X for (MP = (*OldMatch); MP != (t_Match *) 0; /*void*/) {
X t_Match *Next = MP->Next;
X
X efree((char *) MP);
X MP = Next;
X }
X
X *OldMatch = (t_Match *) 0;
X break;
X } else {
X /* If we've reached the end of the phrase, i.e. if
X * Word->Next is zero, we have successfully added a new
X * phrase!
X */
X if (Word->Next == (t_PhraseItem *) 0) {
X if (AsciiTrace > 10) {
X fprintf(stderr, "Result now %d\n", Result + 1);
X }
X Result++;
X }
X }
X
X } /* end for (each word in the phrase) */
X } /* end (for each FID/Offset pair in the first word */
X return Phrase->NumberOfMatches = Result;
X}
X
X
Xstatic int
XContinuesMatch(QueryWord, First, Prev, New)
X t_WordInfo *QueryWord;
X t_WordPlace *First;
X t_WordPlace *Prev;
X t_WordPlace *New;
X{
X /* Return Value is
X * -1 --- if New occurs before Prev (and thus isn't part of the match)
X * 0 --- if it's the next word in the match
X * +1 --- if we've gone past it
X * Note: you can use these values in a switch() if you want.
X */
X
X /* First check we are looking at the right file:
X * Have we gone far enough?
X */
X if (New->FID < First->FID) {
X return -1; /* not far enough */
X } else if (New->FID > First->FID) {
X return 1; /* too far */
X } else if (Prev == New) {
X return 0;
X }
X
X /* Hey everybody, they're the same!
X * That means that this might be a candidate for a MATCH!!!!
X */
X
X /* if (SimplyAnywhereWillDo) { OK; break; } */
X
X /* Clearly later words in the phrase can't be in earlier
X * blocks...
X */
X if (New->BlockInFile < First->BlockInFile) {
X /* Although we are in the right file, we have not
X * yet reached the correct offset.
X */
X return -1;
X }
X
X /* If we get to here,
X * . we are in the right file
X * . we are at least as far into the file as the start
X * of the phrase
X */
X
X /* Now check that we are a reasonable distance past
X * the preceding word (checking that we are not on the first
X * match in the list, of course):
X */
X if (New->BlockInFile < Prev->BlockInFile) {
X /* not gone far enough */
X return -1;
X }
X if (New->BlockInFile > Prev->BlockInFile + 1) {
X /* If they are more than one block apart, I
X * don't believe them to be part of a phrase!
X */
X return 1;
X }
X if (New->BlockInFile == Prev->BlockInFile) {
X /* If they are in the same block, one must be
X * exactly one word beyond the other. I don't
X * think they can ever be the same, unless there
X * is a serious bug somewhere!
X */
X if (New->WordInBlock <= Prev->WordInBlock) {
X /* too early in the block */
X return -1;
X }
X switch (PhraseMatchLevel) {
X case PCM_AnyCase:
X if (New->WordInBlock > Prev->WordInBlock + 4) {
X return 1; /* gone too far */
X /* We allow a few words slop in this case, though... */
X }
X break; /* clearly OK */
X case PCM_SameCase:
X case PCM_HalfCase:
X default:
X if (New->WordInBlock > Prev->WordInBlock + 1) {
X return 1; /* gone too far */
X }
X }
X } else {
X /* they are in adjacent blocks */
X if (New->WordInBlock > 0 ||
X !(Prev->Flags & WPF_LASTINBLOCK)) {
X /* there is another word between them, so
X * we have gone too far.
X * I went to a lot of effort in addfile.c to
X * mantain that flag, just for this!
X */
X return 1;
X }
X }
X /* So they are adjacent words.
X * Now, I wonder if they are plausible distances
X * apart, and whether the common words skipped are
X * the same?
X * Also, what about other flag details?
X */
X
X /* NOTDONE */
X
X /* Now we check that the word matches the given word
X * -- in other words, that possessive/plural/case
X * is correct if required. Do this later as it is
X * relatively expensive I expect, and we will not
X * usually care about case.
X *
X * Since the word is in the right place, if it fails here there
X * is no point in looking at the next word in this block!
X */
X
X return CheckFlags(QueryWord, New);
X}
X
XCheckFlags(QueryWord, New)
X t_WordInfo *QueryWord;
X t_WordPlace *New;
X{
X /* First check case */
X switch (PhraseMatchLevel) {
X
X default: /* defensive! */
X fprintf(stderr, "\n\rinternal error %d %s\n", __LINE__, __FILE__);
X (void) sleep(4);
X break;
X case PCM_AnyCase:
X break; /* clearly OK */
X case PCM_SameCase:
X if ((QueryWord->WordPlace.Flags & (WPF_UPPERCASE|WPF_POSSESSIVE)) !=
X (New->Flags & (WPF_UPPERCASE|WPF_POSSESSIVE))) {
X /* The cases are different, no good */
X return 1; /* give up on this match */
X }
X if (QueryWord->WordPlace.StuffBefore > 0) {
X int Difference;
X
X Difference = QueryWord->WordPlace.StuffBefore - New->StuffBefore;
X if (Difference < -2 || Difference > 4) {
X return 1; /* give up on this match */
X }
X }
X /* Now, what about skipped common words? */
X if ((New->Flags & WPF_LASTWASCOMMON) !=
X (QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON)) {
X return 1; /* give up on this match */
X }
X
X /* plurals: this should be separate */
X if ((QueryWord->WordPlace.Flags & WPF_WASPLURAL) &&
X !(New->Flags & WPF_WASPLURAL)) {
X return 1; /* give up on this match */
X }
X
X /* Only do this test if we are being awfully strict.
X * Remember also that the first word in the phrase will
X * not usually have this set.
X */
X if ((QueryWord->WordPlace.Flags & WPF_LASTHADLETTERS) &&
X !(New->Flags & WPF_LASTHADLETTERS)) {
X return 1; /* give up on this match */
X }
X break;
X case PCM_HalfCase:
X /* In this case, we are lax about things, but if the
X * user typed plural/possessive/capital, we only
X * match one with the same attribute.
X */
X if ((QueryWord->WordPlace.Flags & WPF_UPPERCASE) &&
X !(New->Flags & WPF_UPPERCASE)) {
X if (AsciiTrace > 4) {
X fprintf(stderr, "Reject [uppercase]\n");
X }
X return 1; /* give up on this match */
X }
X
X /* plurals: this should be separate */
X if ((New->Flags & WPF_WASPLURAL) &&
X !(QueryWord->WordPlace.Flags & WPF_WASPLURAL)) {
X if (AsciiTrace > 4) {
X fprintf(stderr, "Reject [plural]\n");
X }
X return 1; /* give up on this match */
X }
X
X /* Now, what about skipped common words? */
X if ((QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON) &&
X !(New->Flags & WPF_LASTWASCOMMON)) {
X if (AsciiTrace > 4) {
X fprintf(stderr, "Reject [last was common]\n");
X }
X return 1; /* give up on this match */
X }
X /* Stuff before, if given, must be present: */
X if (QueryWord->WordPlace.StuffBefore > 1) {
X if (New->StuffBefore < QueryWord->WordPlace.StuffBefore - 1) {
X if (AsciiTrace > 4) {
X fprintf(stderr, "Reject [Stuff Before %d != Q%d]\n",
X QueryWord->WordPlace.StuffBefore,
X New->StuffBefore);
X }
X return 1;
X } /* don't care if there is too much there, though */
X }
X if ((QueryWord->WordPlace.Flags & WPF_POSSESSIVE) &&
X !(New->Flags & WPF_POSSESSIVE)) {
X if (AsciiTrace > 4) {
X fprintf(stderr, "Reject [user flag]\n");
X }
X return 1; /* give up on this match */
X }
X break;
X }
X
X /* If we got here...
X *
X */
X
X return 0; /* It's all OK! */
X}
@@@End of lq-text/src/liblqtext/Phrase.c
echo x - lq-text/src/liblqtext/Root.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/Root.c <<'@@@End of lq-text/src/liblqtext/Root.c'
X/* Root.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/*
X * $Id: Root.c,v 2.7 91/03/03 00:13:36 lee Rel1-10 $
X *
X * $Log: Root.c,v $
X * Revision 2.7 91/03/03 00:13:36 lee
X * cosmetic changes.
X *
X * Revision 2.6 90/10/06 00:11:59 lee
X * Prepared for first beta release.
X *
X * Revision 2.5 90/08/29 21:46:42 lee
X * Alpha release.
X *
X * Revision 2.4 90/08/09 19:16:29 lee
X * BSD lint and fixes...
X *
X * Revision 2.3 90/03/29 23:00:04 lee
X * Now passes gcc -Wall
X *
X * Revision 2.2 89/10/08 20:44:56 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:13:07 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X *
X */
X
X#include "globals.h" /* defines and declarations for database filenames */
X
X#include <sys/types.h>
X#include <fcntl.h> /* for my header files, sorry */
X#include <stdio.h>
X#include <malloc.h>
X#include <ctype.h>
X
X#include "fileinfo.h"
X#include "wordinfo.h"
X#include "wordrules.h"
X#include "emalloc.h"
X
X/** Unix system calls that need to be declared: **/
X /* (none) */
X/** C Library functions that nees to be declared: **/
Xextern void perror();
Xextern int strcmp();
Xextern int strlen();
Xextern char *strcpy();
Xextern char *strcat();
X#ifndef tolower
X extern int toupper();
X#endif
X
X/** lqtext functions that need to be declared: **/
X/** Functions from this file that need to be declared: **/
Xvoid InsertCommonWord();
X/** **/
X
X/** Useful macros **/
X#define new(type) ((type *) emalloc(sizeof(type)))
X /* so you can say
X * struct foo *x = enew(struct foo)
X */
X
X#define STRCMP(s1,s2) ((*(s1) == *(s2)) ? strcmp(s1, s2) : *(s1) - *(s2))
X /* faster then strcmp in the (common) case where the
X * strings differ at the first character.
X * From an idea by Henry Spencer (utzoo!henry)
X */
X
X/** **/
X
Xextern int AsciiTrace;
X
X/* This routine is only sensible for English (although it could be
X * modified...), but that does not matter.
X */
Xchar *
XWordRoot(WordInfo)
X t_WordInfo *WordInfo;
X{
X char *Word;
X
X if (!WordInfo) return "@#!!";
X
X Word = WordInfo->Word;
X
X if (!Word) {
X return "oh dear";
X }
X
X if (!*Word) {
X return Word;
X }
X
X /** delete trailing <'s> and mark posessive */
X while (WordInfo->Length >= 3 && Word[WordInfo->Length - 1] == 's' &&
X Word[WordInfo->Length - 2] == '\'') {
X WordInfo->Length -= 2;
X Word[WordInfo->Length] = '\0';
X WordInfo->WordPlace.Flags |= WPF_POSSESSIVE;
X }
X
X /** delete trailing plural suffix and mark plural */
X
X /* It's important to realise that the purpose of this routine is not
X * in any way to reduce a word to an etymological root. In other words,
X * no attempt is made to differentiate between plurals and present
X * participles, or words that simply happen to end in `s'.
X * Hence, elephants, blunderbus, hostess, runs and tomatoes are all
X * candidates. Of course, one would like to do as well as one can!
X * Again, the object isn't to derive the correct singular, but instead
X * to be fairly fast, and, above all, to ensure that any transformations
X * are reversible!
X *
X * The result is that I can store dog and dogs in the same Wordinfo
X * chain. In the case that either word is unusual, there is a space
X * saving of (on average) 30 or so bytes. More usefully, if you ask
X * for `Window', I will automatically find `Windows' as well.
X *
X * so...
X * XXXo, XXXss, XXXsh, XXXch, XXXx --> +es
X * except: pianos, dynamos, photos
X * XXCy --> XXCies [ C consonant]
X * XXVy --> XXVys [ V vowel ]
X * f or fe --> ves (12 cases only)
X * vowel change:
X * foot/feet (why bother with these? -- use a thesaurus!)
X * need to keep penny/pence separate
X * See Thomson & Martinet, section 8ff (I think)
X */
X if (WordInfo->Length > 2 && Word[WordInfo->Length - 1] == 's') {
X WordInfo->WordPlace.Flags |= WPF_WASPLURAL; /* WRONG */
X switch (Word[WordInfo->Length - 2]) {
X case 'e':
X if (WordInfo->Length >= 3) switch (Word[WordInfo->Length - 3]) {
X case 'i': /* xxcies --> xxxy */
X if (WordInfo->Length > 3) {
X Word[WordInfo->Length - 3] = 'y';
X WordInfo->Length -= 2;
X } else { /* ies not a plural, but lies is :-) */
X WordInfo->Length--; /* just the s */
X }
X break;
X case 's':
X case 'h':
X case 'x':
X case 'o': /* xxxoes --> xxx */
X WordInfo->Length -= 2;
X break;
X default: /* xxxes -> xxxe */
X WordInfo->Length -= 1;
X break;
X } else { /* too short */
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X }
X break;
X case 'y': /* xxxvys --> xxxvy */
X switch (Word[WordInfo->Length - 2]) { /* e.g. holidays */
X case 'a': /* flays */
X case 'e': /* beys */
X case 'i': /* ??iys?? */
X case 'o': /* boys */
X case 'u': /* guys */
X WordInfo->Length--; /* just remove the s */
X break;
X default: /*not a plural, e.g. Unixsys, happy */
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X break;
X }
X break;
X case 's': /* trailing ss doesn't mark a plural! */
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X break;
X case 'u':
X /* ONE bus, thus, omnibus; TWO gnus, TWO emus */
X /* So it doesn't work for gnus and emus right now! */
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X break;
X case 'i': /* not a plural.. this, his, fleur-de-lis */
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X break;
X case 'a': /* has */
X case 'o': /* cos */
X if (WordInfo->Length < 4) {
X WordInfo->WordPlace.Flags &=
X (unsigned short)~(unsigned short)WPF_WASPLURAL;
X break;
X }
X /* else fall through */
X default: /* just plain s */
X WordInfo->Length -= 1;
X break;
X }
X Word[WordInfo->Length] = '\0';
X }
X /* Should check for ii --> ius here, but that would increase the length
X * of the word and therefore will break lots of things.
X */
X return WordInfo->Word;
X}
X
Xchar *
XUnFlag(WordInfo, Flags)
X t_WordInfo *WordInfo;
X unsigned int Flags;
X{
X static char Buffer[MaxWordLength + 5]; /* 's + es + \0 */
X register char *p, *q;
X int Length;
X
X if (!WordInfo) return "(null word info)";
X if (!WordInfo->Word) return "(null word)";
X if (!WordInfo->Word[0]) return "(empty word)";
X
X p = Buffer;
X q = WordInfo->Word;
X while (*p++ = *q++)
X ;
X *p = '\0';
X
X if ((Length = p - Buffer) != WordInfo->Length) {
X /* Well, maybe I can't count */
X WordInfo->Length = Length = strlen(Buffer);
X }
X
X if (Flags & WPF_WASPLURAL) {
X if (Length >= 2) switch (Buffer[Length - 1]) {
X case 'y':
X if (Length > 2) switch (Buffer[Length - 2]) {
X case 'a':
X case 'e':
X case 'i':
X case 'o':
X case 'u':
X Buffer[Length++] = 's'; /* e.g. days */
X break;
X default:
X strcpy(&Buffer[Length - 1], "ies"); /* ladies */
X Length += 2;
X }
X break;
X case 's':
X if (Length > 2) if (Buffer[Length - 2] == 'u') {
X strcpy(&Buffer[Length - 1], "ii"); /* Genii */
X break;
X } /* else fall through... */
X case 'o':
X case 'h':
X case 'x':
X strcat(Buffer, "es");
X Length += 2;
X break;
X default:
X Buffer[Length++] = 's';
X }
X Buffer[Length] = '\0';
X }
X
X if (Flags & WPF_POSSESSIVE) {
X Buffer[Length++] = '\'';
X Buffer[Length++] = 's';
X Buffer[Length] = '\0';
X }
X
X if (Flags & WPF_UPPERCASE) {
X Buffer[0] = toupper(Buffer[0]);
X }
X
X return Buffer;
X}
X
Xtypedef struct s_WordList {
X char *Word;
X unsigned short Flags;
X struct s_WordList *Next;
X} t_WordList;
X
Xstatic t_WordList *CommonWords = 0;
X
Xint
XTooCommon(WordInfo)
X t_WordInfo *WordInfo;
X{
X register char *Word = WordInfo->Word;
X register t_WordList **WP;
X
X for (WP = &CommonWords; *WP; WP = &(*WP)->Next) {
X int i = STRCMP((*WP)->Word, Word);
X
X if (i == 0) return 1; /* yes, it's common */
X else if (i > 0) return 0;
X }
X return 0;
X}
X
Xstatic char *FileName = "Internal Error";
X/* should be set before being printed! */
X
Xint
XReadCommonWords(CommonFile)
X char *CommonFile;
X{
X extern char *fgets();
X extern int AsciiTrace;
X
X FILE *fd;
X extern int errno;
X char Buffer[200];
X t_WordInfo W;
X char *Root;
X t_WordList *WP;
X
X errno = 0;
X
X if ((fd = fopen(CommonFile, "r")) == (FILE *) 0) {
X int e = errno;
X
X fprintf(stderr, "Can't open common word list ");
X errno = e;
X perror(CommonFile);
X return -1;
X }
X
X FileName = CommonFile;
X
X while (fgets(Buffer, sizeof(Buffer), fd) != (char *) 0) {
X register char *p;
X char *Start;
X
X for (p = Buffer; *p; p++) {
X if (*p == '#') break;
X if (StartsWord(*p)) break;
X }
X
X if (*p == '#' || !*p) {
X continue;
X }
X
X Start = p;
X
X for (; *p; p++) {
X if (!WithinWord(*p)) break;
X if (*p == '\'' && !WithinWord(p[1])) break;
X }
X
X if (p - Start + 1 < MinWordLength) continue;
X
X *p = '\0'; /* delete trailing \n or whatever */
X W.WordPlace.Flags = 0;
X W.Word = Start;
X W.Length = p - Start; /* length excludes the \0 */
X
X Root = WordRoot(&W);
X InsertCommonWord(Root, W.WordPlace.Flags);
X }
X (void) fclose(fd);
X
X#if 0
X if (!CommonWords) {
X fprintf(stderr, "No common words found in file \"%s\"\n", FileName);
X exit(1);
X }
X#endif
X
X if (AsciiTrace > 1) {
X for (WP = CommonWords; WP; WP = WP->Next) {
X fprintf(stderr, "Ignore: \"%s\"\n", WP->Word);
X }
X }
X FileName = "Internal Error";
X return 0;
X}
X
Xvoid
XInsertCommonWord(Root, Flags)
X char *Root;
X unsigned int Flags;
X{
X register t_WordList **WP;
X t_WordList *W;
X
X for (WP = &CommonWords; *WP; WP = &(*WP)->Next) {
X int i = STRCMP((*WP)->Word, Root);
X
X if (i == 0) return;
X else if (i > 0) break;
X }
X /* insert it before this one! */
X W = (*WP);
X (*WP) = (t_WordList *) emalloc(sizeof(t_WordList));
X (*WP)->Word = emalloc(strlen(Root) + 1);
X (void) strcpy((*WP)->Word, Root);
X (*WP)->Flags = Flags;
X (*WP)->Next = W;
X return;
X}
@@@End of lq-text/src/liblqtext/Root.c
echo x - lq-text/src/liblqtext/WordInfo.c 1>&2
sed 's/^X//' >lq-text/src/liblqtext/WordInfo.c <<'@@@End of lq-text/src/liblqtext/WordInfo.c'
X/* WordInfo.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* WordInfo.c -- handle the database of words for NX-Text.
X *
X * NX-Text keeps a master list of all of the words that have ever been
X * seen. Currently, this is in dbm format (sdbm or ndbm).
X * For each word, there's an associated WID (a unique number), an offset
X * into the master database (see pblock.c), and possibly thesaurus info.
X *
X * $Id: WordInfo.c,v 2.11 90/10/13 03:10:07 lee Rel1-10 $
X *
X * $Log: WordInfo.c,v $
X * Revision 2.11 90/10/13 03:10:07 lee
X * Type error -- efree() needs a char *.
X *
X * Revision 2.10 90/10/06 00:12:01 lee
X * Prepared for first beta release.
X *
X * Revision 2.9 90/09/29 23:47:30 lee
X * Reduced the size of a buffer, and plugged yet another memory leak!
X *
X * Revision 2.8 90/09/10 13:38:50 lee
X * deleted declaration of sleep()
X *
X * Revision 2.7 90/08/29 21:46:48 lee
X * Alpha release.
X *
X * Revision 2.6 90/08/12 17:33:38 lee
X * malloc changes; added SlayWordInfo() and MakeWordInfo().
X *
X * Revision 2.5 90/08/09 19:16:35 lee
X * BSD lint and fixes...
X *
X * Revision 2.4 90/03/22 14:23:19 lee
X * new calls to efree();
X * Offset now stored as a block number, not a byte offset
X *
X * Revision 2.3 90/03/21 14:59:13 lee
X * Numerous changes. WID2WordInfo() no longer calles GetWordPlaces().
X *
X * Revision 2.2 89/10/08 20:45:05 lee
X * Working version of nx-text engine. Addfile and wordinfo work OK.
X *
X * Revision 2.1 89/10/02 01:13:56 lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X *
X * Revision 1.4 89/09/17 23:01:53 lee
X * Various fixes; NumberInBlock now a short...
X *
X * Revision 1.3 89/09/16 21:16:07 lee
X * First demonstratable version.
X *
X * Revision 1.2 89/09/11 00:35:03 lee
X * Some speedups, but WID not working properly...
X *
X * Revision 1.1 89/09/07 21:05:51 lee
X * Initial revision
X *
X *
X */
X
X#include "globals.h" /* defines and declarations for database filenames */
X
X#include <errno.h>
X#include <fcntl.h>
X#include <malloc.h>
X#include <signal.h>
X#include <stdio.h>
X#include <string.h>
X#include <ctype.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X#include <unistd.h>
X
X#include "fileinfo.h"
X#include "smalldb.h"
X#include "wordindex.h"
X#include "wordinfo.h"
X#include "numbers.h"
X
X#include "emalloc.h"
X
X#include "wordrules.h" /* max word length */
X
X#include "pblock.h"
X
X/** declarations: **/
X/** Unix system calls that need to be declared: **/
Xextern int open(), close(); /* (these are not the stdio fopen and fclose) */
Xextern int creat();
Xextern void exit();
Xextern long lseek(); /* watch out for this on 16 bit (286, PDP11) systems! */
Xextern int read(), write();
Xextern int stat();
Xextern unsigned alarm(/* unsigned */);
X
X/** Unix Library Calls that need to be declared: **/
X#ifndef tolower /* e.g. on SunOS */
Xextern int tolower();
X#endif
Xextern void perror();
X/* extern int sleep(); -- this is unsigned on some systems */
Xextern int lockf();
X/** lqtext Library calls that need to be declared: **/
Xextern void Deletepblock();
X
X/** Functions within this file that need to be declared: **/
Xt_WordInfo *MakeWordInfo();
Xvoid SlayWordInfo();
X
X/** **/
X
Xextern int AsciiTrace;
Xextern char *progname;
X
X#define new(type) ( ((type) *) emalloc(sizeof(type)) )
X
X/* Format when using ndbm: */
X
Xtypedef struct {
X t_WID WID;
X unsigned long Offset; /* position in the database */
X unsigned long NumberOfWordPlaces;
X char Word[1]; /* Cheat here */
X} t_WordIndexEntry;
X
X/* Replacement fomat, intended to be faster and to use much less disk space.
X * Still use dbm for Word2WID, but not for the reverse mapping.
X * Also, cache the most recently-used WordInfo entry...
X * I have not measured how much of a win this is, but a lot of the code
X * calls Word2Wid() and then WID2WordInfo().
X */
X
Xstatic int Widfd = (-1);
X
Xt_WordInfo *
XWID2WordInfo(WID)
X t_WID WID;
X{
X extern t_WordPlace *GetWordPlaces(); /* pblock.c */
X
X char Buffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */
X char *q = Buffer;
X t_WordInfo *WP;
X
X /* The above calculation is derived like this:
X *
X * The entry contains the total number of pairs (>= 1 byte),
X * the length of the string (>= 1 byte), and the string (>= 3 bytes)
X * (actually >= MinWordLength bytes, but setting this less than 3
X * would be a major disaster!)
X * Hence, there are WIDBLOCKSIZE - (2 + length) bytes left for pairs.
X * Now, each pair is at least 2 bytes, so halve the remainder. I add
X * one coz WIDBLOCKSIZE - 3 is odd, so I would otherwise lose one!
X */
X
X if (Widfd < 0) {
X if ((Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766)) < 0) {
X fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile);
X exit(1);
X }
X }
X
X if (lseek(Widfd, (long) (WID * WIDBLOCKSIZE), 0) < 0) {
X return (t_WordInfo *) 0;
X }
X
X if (read(Widfd, Buffer, WIDBLOCKSIZE) != WIDBLOCKSIZE) {
X return (t_WordInfo *) 0;
X }
X
X {
X unsigned short L;
X
X if ((L = sReadNumber(&q)) == 0) {
X (void) fprintf(stderr,
X "%s: Database corrupt, WID %lu has length zero\n",
X progname, WID);
X return (t_WordInfo *) 0;
X }
X WP = MakeWordInfo(WID, (int) L, q);
X q += L;
X }
X
X WP->Offset = (sReadNumber(&q) >> 1) * BLOCKSIZE;
X WP->NumberOfWordPlaces = sReadNumber(&q);
X
X /* Now, maybe read some WordPlace tuplets: */
X#if 1
X if (q - Buffer < WIDBLOCKSIZE) {
X WP->WordPlaces = GetWordPlaces(
X WP->WID,
X q,
X WIDBLOCKSIZE - (q - Buffer),
X WP->Offset,
X WP->NumberOfWordPlaces
X );
X WP->WordPlacesInHere = WP->NumberOfWordPlaces;
X } else {
X fprintf(stderr, "%s: Internal error, block too small for %ld (%s)\n",
X progname, WP->WID, WP->Word);
X exit(1);
X }
X
X#else
X WP->WordPlaces = (t_WordPlace *) 0;
X if (q - Buffer < WIDBLOCKSIZE) {
X WP->DataBlock = emalloc(WIDBLOCKSIZE + 5);
X (void) memcpy(WP->DataBlock, Buffer, WIDBLOCKSIZE);
X WP->WordPlaceStart = &(WP->DataBlock[q - Buffer]);
X }
X#endif
X
X /* done! */
X return WP;
X}
X
Xstatic char PairBuffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */
X
X/* Make WordInfo Block Header... */
Xvoid
XMkWIBH(WordInfo, pblock)
X t_WordInfo *WordInfo;
X t_pblock *pblock;
X{
X char *q = PairBuffer;
X
X#ifdef ASCIITRACE
X if (AsciiTrace > 15) {
X fprintf(stderr, "\tMake info block header for %s, Offset %lu==%lu\n",
X WordInfo->Word, pblock->ChainStart, WordInfo->Offset);
X }
X#endif
X
X sWriteNumber(&q, WordInfo->Length);
X (void) strncpy(q, WordInfo->Word, WordInfo->Length);
X q += WordInfo->Length;
X if (pblock) sWriteNumber(&q, (pblock->ChainStart / BLOCKSIZE) << 1);
X else sWriteNumber(&q, 0L);
X sWriteNumber(&q, WordInfo->NumberOfWordPlaces);
X
X WordInfo->WordPlaceStart = q;
X WordInfo->DataBlock = PairBuffer;
X}
X
X/* Make WordInfo Block ... */
Xint
XMkWIB(WordInfo, pblock)
X t_WordInfo *WordInfo;
X t_pblock *pblock;
X{
X extern unsigned short PutWordPlaces();
X
X /* See how many pairs from the given pblock fit into WordInfo,
X * and leave them in PairBuffer...
X */
X
X if (AsciiTrace > 3) {
X fprintf(stderr, "Make info block for %s\n", WordInfo->Word);
X }
X
X MkWIBH(WordInfo, pblock);
X
X if (pblock == (t_pblock *) 0) {
X /* No WordPlaces to put in! */
X WordInfo->WordPlacesInHere = 0;
X return 0;
X }
X
X return WordInfo->WordPlacesInHere = PutWordPlaces(
X pblock->WordPlaces,
X WordInfo->WID,
X WordInfo->WordPlaceStart,
X WIDBLOCKSIZE - (WordInfo->WordPlaceStart - PairBuffer),
X pblock->ChainStart,
X pblock->NumberOfWordPlaces);
X}
X
Xchar *
XString2SixBitString(String)
X unsigned char *String;
X{
X static unsigned char Buffer[MaxWordLength + 1];
X register unsigned char *p;
X register unsigned char *Bufp = Buffer;
X unsigned short Val;
X int BitsLeft = 0;
X
X /* BUG: we lose word-processing accents, etc. and 8-bitness if
X * we do this. Also, it slows things down very, very slightly.
X */
X
X /* Some ascii character equivalents:
X * '0' 48 060 0x30
X * 'A' 65 0101 0x41
X * '_' 95 0137 0x5f
X * 'a' 97 0141 0x61
X */
X for (p = String; *p; p++) {
X if (!isalnum(*p) && *p != '\'' && *p != '_') {
X return (char *) 0;
X }
X if (isupper(*p)) *p = tolower(*p);
X /* Store as
X * 0-9 --> 0-9 (easy!)
X * a-z --> 10...35
X * _/' --> 36/37
X * hence, I need 6 bits per character. This also leaves rather
X * a lot of bits spare (38..64, 27 or so spaces). As I fold case,
X * and don't have controls, I don't know what to do there. I
X * could store digrams. There are 38*38 = 1444 of these, but
X * some of them don't happen. Not worth the effort.
X */
X if (isdigit(*p)) {
X Val = (*p) - '0';
X } else if (isalpha(*p)) {
X Val = (*p) - 'a' + ('9' - '0');
X } else if (*p == '\'') {
X Val = ('9' - '0') + ('z' - 'a') + 1;
X } else if (*p == '_') {
X Val = ('9' - '0') + ('z' - 'a') + 2;
X } else {
X#define NEXTISEIGHT ('9' - '0') + ('z' - 'a') + 3
X Val = NEXTISEIGHT;
X }
X /* Write the first half */
X if (!(BitsLeft & 07)) { /* i.e. it's 0 or 8 */
X *Bufp = (Val << 2);
X BitsLeft = 2;
X } else {
X /* top BITSLEFT bits */
X *Bufp++ |= (Val >> (6 - BitsLeft));
X *Bufp = (unsigned) (Val << (2 + BitsLeft)); /* lose some bits */
X if ((BitsLeft -= 6) < 0) BitsLeft += 8;
X }
X }
X if (BitsLeft) {
X Bufp++;
X }
X *Bufp = 0;
X return (char *) Buffer;
X}
X
Xunsigned char *
XSixBitString2String(SixBitString)
X char *SixBitString;
X{
X static unsigned char Buffer[MaxWordLength + 2];
X register unsigned char *p = (unsigned char *) SixBitString;
X int BitsLeft = 0;
X unsigned char *Bufp = Buffer;
X
X while (*p) {
X if (!(BitsLeft & 07)) { /* i.e. it's 0 or 8 */
X *Bufp++ = (*p) >> 2;
X BitsLeft = 2;
X } else {
X /* W R O N G */
X /* bottom BITSLEFT bits */
X *Bufp = ((*p) << (6 - BitsLeft));
X /* Rest */
X *Bufp = (unsigned) ((*p) << (2 + BitsLeft)); /* lose some bits */
X if ((BitsLeft -= 6) < 0) BitsLeft += 8;
X }
X }
X return (unsigned char *) "notdone'fixme"; /* NOTDONE FIXME */
X}
X
X#ifdef TESTSIX
Xchar *progname= "testsix";
Xmain(argc, argv)
X int argc;
X char *argv[];
X{
X extern char *gets();
X
X char Line[4096];
X int Encode = 1;
X
X if (argc != 3) {
X fprintf(stderr, "bad arg count; usage: %s -[de]\n", progname);
X }
X if (STREQ(argv[1], "-e")) Encode = 1;
X else if (STREQ(argv[1], "-d")) Decode = 1;
X else {
X fprintf(stderr, "usage: %s -[d|e]\n", progname);
X exit(1);
X }
X while (gets(Line) != (char *) 0) {
X char *Result;
X
X if (Encode) {
X Result = String2SixBitString(Line);
X } else {
X if (STREQ(Line, "(cannot be encoded)")) {
X Result = "(this line was not saved)";
X } else {
X Result = SixBitString2String(line);
X }
X }
X if (Result) {
X printf("%s\n", Result);
X } else {
X printf("(cannot be encoded)\n");
X }
X }
X}
X
X#endif /*TESTSIX*/
X
X
Xt_WID
XWord2WID(Word, Length)
X char *Word;
X unsigned int Length;
X{
X DBM *db;
X datum key, data;
X char *q;
X t_WID WID;
X char Buffer[8];
X /* enough for the binary representation of a number -- see numbers.c;
X * this is _not_ sizeof(long). It's probably 5, in fact, although
X * for small numbers it's less.
X */
X
X if (Length > MaxWordLength) {
X Length = MaxWordLength; /* NOTE: no trailing \0 required. */
X }
X
X /* contact database server */
X if ((db = startdb(WordIndex)) == (DBM *) 0) {
X fprintf(stderr, "dbmopen(%s) failed\n", WordIndex);
X exit(2);
X }
X
X key.dptr = Word;
X key.dsize = Length;
X
X data = dbm_fetch(db, key);
X
X enddb(db);
X
X if (data.dsize == 0) {
X return (t_WID) 0;
X }
X
X /* do this because ReadNumber will leave q pointing beyond Buffer: */
X (void) memcpy(Buffer, data.dptr, data.dsize);
X q = Buffer;
X WID = sReadNumber(&q);
X if (q - Buffer != data.dsize) {
X fprintf(stderr, "Word2Wid failed... got %lu\n");
X return (t_WID) 0;
X }
X return WID;
X}
X
Xchar *
XWID2Word(WID)
X t_WID WID;
X{
X t_WordInfo *W;
X char *Word;
X
X if (WID == (t_WID) 0) {
X return (char *) 0;
X }
X
X if ((W = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
X return (char *) 0;
X }
X Word = W->Word;
X W->Word = (char *) 0;
X SlayWordInfo(W);
X return Word;
X}
X
Xint
XPutWordInfoIntoIndex(WordInfo, Offset)
X t_WordInfo *WordInfo;
X unsigned long Offset;
X{
X DBM *db;
X char NumBuf[sizeof(t_WID) + 1];
X char *q = NumBuf;
X datum key, data;
X int RetVal;
X
X /** First, write the WID itself, so we can go from Word to WID */
X
X key.dptr = WordInfo->Word;
X key.dsize = WordInfo->Length;
X
X sWriteNumber(&q, WordInfo->WID);
X
X data.dptr = NumBuf;
X data.dsize = q - NumBuf;
X
X /* contact database server */
X if ((db = startdb(WordIndex)) == (DBM *) 0) {
X fprintf(stderr, "dbmopen(%s) failed\n", WordIndex);
X exit(2);
X }
X
X RetVal = dbm_store(db, key, data, DBM_REPLACE);
X
X enddb(db);
X
X /** Now, ensure that we have a physical block for WordInfo. If
X ** we don't, there is something very wrong in pblock.c, our only
X ** possible caller.
X **/
X
X if (WordInfo->DataBlock == (char *) 0) {
X if (Offset) {
X fprintf(stderr, "WARNING: WordInfo corrupt for \"%s\"\n",
X WordInfo->Word);
X }
X (void) MkWIB(WordInfo, (t_pblock *) 0);
X }
X
X /** Now write the physical entry... */
X
X if (Widfd < 0) {
X if ((Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766)) < 0) {
X fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile);
X exit(1);
X }
X }
X
X if (lseek(Widfd, (long) (WordInfo->WID * WIDBLOCKSIZE), 0) < 0) {
X perror("lseek");
X exit(1);
X }
X
X if (write(Widfd, WordInfo->DataBlock, WIDBLOCKSIZE) != WIDBLOCKSIZE) {
X perror("write");
X exit(1);
X }
X
X return RetVal;
X}
X
Xint
XWID_sig()
X{
X return 0;
X}
X
Xt_WID
XGetMaxWID()
X{
X extern int errno;
X extern long atol();
X
X int fd;
X char Buffer[20]; /* large enough to sprintf() a WID */
X struct stat StatBuf;
X#if 0
X int FileKey; /* what one gets from a lock... */
X int NumberOfTriesLeft = 5;
X#endif
X
X /* ensure that the file is there */
X if (stat(WidFile, &StatBuf) == -1) {
X return 0;
X }
X
X if ((fd = open(WidFile, O_RDWR, 0)) < 0) {
X fprintf(stderr, "Warning: Can't open WID file");
X return 0;
X }
X
X#if 0
X errno = 0;
X
X /** Lock the file **/
X
X do {
X /* Set a timeout of 2 seconds */
X signal(SIGALRM, WID_sig);
X (void) alarm(3);
X if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) {
X switch (errno) {
X case EACCES: /*[sic]*/ /* another process has the lock */
X fprintf(stderr, "Please wait...\n");
X /* shouldn't happen */
X break;
X case EDEADLK:
X fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile);
X FileKey = 1;
X break;
X case EINTR:
X fprintf(stderr, "Please Wait... someone has the key...\n");
X sleep(1);
X break;
X }
X }
X if (--NumberOfTriesLeft <= 0) {
X fprintf(stderr, "Warning: can't lock ");
X perror(WidFile);
X (void) close(fd);
X return 0;
X }
X } while (FileKey < 0);
X (void) alarm(0);
X
X if (stat(WidFile, &StatBuf) == -1) {
X fprintf(stderr, "It went away!\n");
X return 0;
X }
X#endif
X
X /* Read the file */
X if (read(fd, Buffer, (unsigned int) StatBuf.st_size) < 0) {
X fprintf(stderr, "Can't read from \"%s\"\n", WidFile);
X exit(1);
X }
X
X#if 0
X /** Unlock the file **/
X if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) {
X fprintf(stderr, "Warning: might not have unlocked \"%s\"\n",
X WidFile);
X }
X#endif
X (void) close(fd);
X
X Buffer[StatBuf.st_size] = '\0';
X
X return atol(Buffer);
X}
X
X
Xt_WID LastNextWIDVal = (t_WID) 0;
X
X#undef GetNextWID
X
Xt_WID GetNextWID(), GetMaxWID();
X
XINLINE
Xt_WID
XSpoofGetNextWID()
X{
X static int SinceLastUpdate = 0;
X
X /* Call the real function sometimes, so that the database does
X * get updated in case of a crash or for other users.
X */
X if (++SinceLastUpdate > 500) {
X SinceLastUpdate = 0;
X return GetNextWID(1);
X }
X
X if (LastNextWIDVal == (t_WID) 0) {
X SinceLastUpdate = 0;
X LastNextWIDVal = GetMaxWID();
X }
X return ++LastNextWIDVal;
X}
X
Xvoid
XWriteCurrentMaxWID()
X{
X (void) GetNextWID(1);
X}
X
Xt_WID
XGetNextWID(WriteCurrent)
X int WriteCurrent; /* simply write the current MaxWID if true */
X{
X extern int errno;
X extern long atol();
X
X int fd;
X char Buffer[20];
X struct stat StatBuf;
X#if 0
X int FileKey; /* what one gets from a lock... */
X int NumberOfTriesLeft = 5;
X#endif
X t_WID Result;
X
X /** Alter the file, so other programs can see the new words...
X **/
X
X /* ensure that the file is there */
X if (stat(WidFile, &StatBuf) == -1) {
X fprintf(stderr, "Creating WID file \"%s\"\n", WidFile);
X if ((fd = creat(WidFile, 02666)) < 0) {
X fprintf(stderr, "Can't create WID file \"%s\"\n", WidFile);
X exit(1);
X }
X (void) close(fd);
X return GetNextWID(WriteCurrent);
X
X /*NOTREACHED*/
X }
X
X if ((fd = open(WidFile, O_RDWR, 0)) < 0) {
X fprintf(stderr, "Can't open WID file");
X perror(WidFile);
X exit(1);
X }
X
X#if 0
X errno = 0;
X
X /** Lock the file **/
X
X do {
X /* Set a timeout of 2 seconds */
X signal(SIGALRM, WID_sig);
X (void) alarm(3);
X if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) {
X switch (errno) {
X case EACCES: /*[sic]*/ /* another process has the lock */
X fprintf(stderr, "Please wait...\n");
X /* shouldn't happen */
X break;
X case EDEADLK:
X fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile);
X FileKey = 1;
X break;
X case EINTR:
X fprintf(stderr, "Please Wait... someone has the key...\n");
X sleep(1);
X break;
X }
X }
X if (--NumberOfTriesLeft <= 0) {
X fprintf(stderr, "Warning: can't lock the file \"%s\"\n", WidFile);
X }
X } while (FileKey < 0);
X (void) alarm(0);
X
X if (stat(WidFile, &StatBuf) == -1) {
X fprintf(stderr, "It went away!\n");
X exit(1);
X }
X#endif
X
X /* Read the file */
X
X if (read(fd, Buffer, (unsigned int) StatBuf.st_size) < 0) {
X fprintf(stderr, "Can't read from \"%s\"\n", WidFile);
X exit(1);
X }
X
X Buffer[StatBuf.st_size] = '\0';
X
X Result = atol(Buffer);
X
X /* if WriteCurrent is set, we should not increment anything */
X if (!WriteCurrent) {
X ++LastNextWIDVal;
X ++Result;
X }
X
X if (Result < LastNextWIDVal) {
X Result = LastNextWIDVal;
X }
X
X (void) sprintf(Buffer, "%lu\n", Result);
X
X /* Move to the start of the file and write the now value.
X * No need to truncate the file, because it didn't shrink!
X */
X (void) lseek(fd, 0, 0L);
X (void) write(fd, Buffer, (unsigned int) strlen(Buffer));
X
X#if 0
X /** Unlock the file **/
X
X if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) {
X fprintf(stderr, "Warning: might not have unlocked \"%s\"\n",
X WidFile);
X }
X#endif
X (void) close(fd);
X
X return Result;
X}
X
Xint
XDeleteWord(Word)
X char *Word;
X{
X extern t_pblock *Getpblock();
X
X t_WID WID;
X t_WordInfo *WordInfo;
X t_pblock *tmp;
X
X if ((WID = Word2WID(Word, strlen(Word))) == (t_WID) 0) {
X return -1; /* not there */
X }
X
X /* get info from the list */
X if ((WordInfo = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
X return -1;
X }
X
X if ((tmp = Getpblock(WordInfo)) != (t_pblock *) NULL) {
X Deletepblock(tmp);
X (void) efree(tmp);
X }
X
X /* delete the offset from the database, but retain the WID: */
X WordInfo->Offset = 0L;
X WordInfo->NumberOfWordPlaces = 0L;
X WordInfo->WordPlacesInHere = 0;
X PutWordInfoIntoIndex(WordInfo, 0L);
X SlayWordInfo(WordInfo);
X
X return 0;
X}
X
X/* Routines to create and destroy WordInfo structures */
XINLINE t_WordInfo *
XMakeWordInfo(WID, Length, Word)
X t_WID WID;
X int Length;
X char *Word; /* the word, which might not be nul-terminated */
X{
X register t_WordInfo *WP;
X WP = (t_WordInfo *) emalloc(sizeof(t_WordInfo));
X
X WP->WID = WID;
X WP->FID = (t_FID) 0;
X WP->Next = (t_WordInfo *) 0;
X WP->NumberOfWordPlaces = 0;
X WP->DataBlock = (char *) 0;
X WP->WordPlaceStart = (char *) 0;
X WP->WordPlaces = (t_WordPlace *) 0;
X WP->WordPlacesInHere = 0;
X WP->WordPlace.FID = 0; /* mark as invalid */
X WP->WordPlace.Flags = 0; /* this gets used anyway, so set it to zero! */
X
X WP->Word = emalloc(Length + 1);
X
X (void) strncpy(WP->Word, Word, Length);
X WP->Length = Length;
X WP->Word[Length] = '\0'; /* strncpy does not add a null */
X WP->Offset = 0;
X
X return WP;
X}
X
Xvoid
XSlayWordInfo(WP)
X t_WordInfo *WP;
X{
X if (!WP) return;
X if (WP->Word) efree(WP->Word);
X if (WP->WordPlaces) efree((char *)WP-> WordPlaces);
X
X WP->Next = (t_WordInfo *) 0;
X /* The above line is to force a run-time error in the common
X * (but wrong) case
X * for (w = WordList; w; w = w->Next) SlayWordInfo(w);
X */
X efree((char *) WP);
X}
@@@End of lq-text/src/liblqtext/WordInfo.c
echo end of part 04
--
Liam R. E. Quin, lee at sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337
More information about the Alt.sources
mailing list