A spelling corrector
ken at boring.UUCP
ken at boring.UUCP
Thu Aug 29 02:43:07 AEST 1985
References:
Sender: ken at mcvax.UUCP (Ken Yap)
Reply-To: ken at mcvax.UUCP (Ken Yap)
Followup-To: net.sources.bugs
Distribution: net
Organization: Amoeba Project, CWI, Amsterdam
Keywords:
#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
# README
# correct.1
# Makefile
# correct.v0.c
# word.c
# word.h
# This archive created: Wed Aug 28 10:28:39 1985
# By: Ken Yap (Amoeba Project, CWI, Amsterdam)
export PATH; PATH=/bin:$PATH
echo shar: extracting "'README'" '(811 characters)'
if test -f 'README'
then
echo shar: will not over-write existing file "'README'"
else
cat << \SHAR_EOF > 'README'
This is a simplistic spelling corrector. It takes the list of words on
the command line (or one line of standard input), applies small
perturbations to them and checks the variants against a standard
dictionary (via the spell program). The survivors are then suggested as
corrections for the presumably misspelled word.
For example:
$ correct calender arithmatic
arithmetic
calendar
This idea came from "Computer Programs for Spelling Correction",
Peterson, Springer-Verlag LNCS.
Its deficiencies are noted in the manual page. I am working on a better
version, but would be glad to hear of bug reports or improvements. I
don't promise to do anything about such reports though.
Ken
28th August 1985
Centrum voor Wiskunde en Informatica,
Kruislaan 413, 1098 SJ Amsterdam,
Netherlands.
ken at mcvax.UUCP
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'correct.1'" '(1609 characters)'
if test -f 'correct.1'
then
echo shar: will not over-write existing file "'correct.1'"
else
cat << \SHAR_EOF > 'correct.1'
.TH CORRECT 1 "2 August 1985"
.SH NAME
correct, lookup \- spelling corrector
.SH SYNOPSIS
.B correct
[
.B \-D
] [
.B \-S
] [
.B \-f
] [
.B \-s
] [
.B \-d
hlist ]
[ words ]
.PP
.B lookup
[
.B \-f
] [ words ]
.SH DESCRIPTION
.I Correct
takes the presumably misspelled words, applies small perturbations to
them and looks up the perturbations in a hashed dictionary.
If these perturbations are found
they are suggested as corrections for the misspelled word.
If no words are given on the command line,
correct reads one line from the standard input.
.PP
Under the
.B \-f
option, words are folded to lower case before processing.
.PP
Under the
.B \-s
option, sorting and duplicate filtering are suppressed.
.PP
Under the
.B \-S
option, server mode is entered.
.I Correct
is run in the background and enquiries are
sent to it by
.I lookup.
This requires the Amoeba (C) transaction library.
.PP
The
.B \-D
option turns on some debugging messages.
.PP
The hashed dictionary used may be specified by
the argument following the
.BR \-d
option.
.SH FILES
/usr/dict/hlist[ab] hashed correcting lists, American & British, default for
.B \-d
.br
/tmp/correct\(** temporary file
.br
.SH SEE ALSO
spell(1), spellout(1), deroff(1), sort(1), tee(1), sed(1)
.SH AUTHOR
Ken Yap (Centrum voor Wiskunde en Informatica, Amsterdam)
.SH BUGS
Coverage of words in the dictionary is uneven.
Absence of output may mean that the intended word
was not found rather than that the spelling was correct.
.PP
Long words often have permutations that cause spurious hits
on the dictionary.
Take the output of this program with a grain of salt.
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'Makefile'" '(523 characters)'
if test -f 'Makefile'
then
echo shar: will not over-write existing file "'Makefile'"
else
cat << \SHAR_EOF > 'Makefile'
#
# Makefile for dictionary server
#
# Ken Yap, June 1985
#
# Sources (used only by the lint target).
# NOTE(review): dict.c is not among the files shipped in this archive;
# confirm it exists before relying on "make lint".
SRC = correct.v0.c word.c dict.c
# Default hashed dictionary, handed to the compiler as a quoted C string.
DICT = \"/usr/dict/hlistb\"
CFLAGS = -O -DDEFAULT_DICT=$(DICT)

# The linked program is called "correct", so the rule carries that name;
# a rule named after a file it never creates is always out of date.
correct: correct.v0.o getopt.o word.o
	cc -o correct correct.v0.o getopt.o word.o

# Historical target name, kept so that "make correct.v0" still works.
correct.v0: correct

lookup: lookup.o getopt.o trans.o
	cc -o lookup lookup.o getopt.o trans.o

correct.v0.o: word.h
word.o: word.h

lint:
	lint -DDEFAULT_DICT=$(DICT) $(SRC)

# Run the build in the background, collecting output in nohup.out.
quietly:
	@rm -f nohup.out
	sh -ce 'nohup make &'

backup:
	tar cf ../correct.tar *.c *.h *.1 Makefile
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'correct.v0.c'" '(4367 characters)'
if test -f 'correct.v0.c'
then
echo shar: will not over-write existing file "'correct.v0.c'"
else
cat << \SHAR_EOF > 'correct.v0.c'
/*
** (C) Centrum voor Wiskunde en Informatica, 1985
**
** This software may be freely distributed and used, save
** for profit or military purposes, provided always this notice
** is retained.
**
** No warranty is made on the suitability of this software
** for any purpose whatsoever.
**
** Last modified:
**
** Ken Yap (CWI) August 1985
*/
/*
** A program to generate alternate spellings from a misspelled word
** and return those that are in the dictionary.
**
** Ken Yap, CWI, July 1985
*/
#include <sys/types.h>
#include <sys/file.h>
#include <ctype.h>
#include <stdio.h>
#include <signal.h>
#ifdef AMOEBA
#include "amoeba.h"
#endif AMOEBA
#include "word.h"
/* Hashed dictionary named on the spellout command line; -d overrides it */
static char *dictfile = DEFAULT_DICT;
/* Option flags set in main(): -S sets server, -D debug, -f fold */
static int server = 0;
static int debug = 0;
static int fold = 0;
/* Non-zero: pipe the results through "sort -u"; cleared by -s */
static int sortuniq = 1;
/* ibuf holds the words to look up, buf the alternates and the final reply */
static char ibuf[1024], buf[10240];
#ifdef AMOEBA
/* Transaction header passed to amoeba_init(), getreq() and putrep() */
header hdr;
#endif AMOEBA
/*
** Print message and exit on error
**
** cc is a status code.  A negative value means failure: msg is handed
** to perror() and the program exits with status 1.  Any other value
** returns to the caller with nothing printed.
**
** Declared void because no value is ever returned.
*/
void chkerror(cc, msg)
int cc;
char *msg;
{
	if (cc >= 0)
		return;
	perror(msg);
	exit(1);
}
/*
** Signal handler installed for SIGTERM by main(): end the program with
** a zero exit status.
**
** sig is the signal number supplied on delivery; it is not used.  The
** handler takes an int and returns void, the type signal() expects.
*/
void cleanup(sig)
int sig;
{
	exit(0);
}
/*
** Generate one word's permutations
** Reject words containing non-alphabetics
**
** Every variant that transform() produces for the operations DEL1CHAR
** through ADD1CHAR is appended to buf, each one followed by a newline,
** and the list is NUL terminated.  len is the size of buf.  Generation
** stops early, without an error, as soon as buf can no longer be
** guaranteed to hold one more variant.
**
** Returns the number of bytes stored in buf, not counting the final
** NUL, or 0 if the word was rejected or buf is too small for even one
** variant.
*/
int altgen(word, buf, len)
char *word, *buf;
int len;
{
	register int op;
	register char *p;
	int need;

	/* ctype functions need an unsigned char value, not a plain char */
	for (p = word; *p != '\0'; p++)
		if (!isalpha((unsigned char)*p))
			return (0);
	/*
	** The longest variant is one letter longer than the word.  Keep
	** room for it, for the NUL transform() puts after it, and for the
	** NUL that ends the list, so nothing is ever stored past buf.
	*/
	need = (int)(p - word) + 3;
	if (len < need)
	{
		if (len > 0)
			*buf = '\0';
		return (0);
	}
	p = buf;
	for (op = DEL1CHAR; op <= ADD1CHAR; op++)
	{
		transform(word, INIT, p);
		while (len - (int)(p - buf) >= need && transform(word, op, p))
		{
			p += strlen(p);
			*p++ = '\n';	/* replaces the variant's NUL */
		}
		if (len - (int)(p - buf) < need)
			break;		/* buffer full: give up quietly */
	}
	*p = '\0';
	return (p - buf);
}
/*
** Pick up one word from buf, returning updated position in buf
**
** Leading white space is skipped, then characters are copied to word
** up to the next white space or the end of the string.  wlen is the
** size of the word buffer: at most wlen - 1 characters are copied and
** the result is NUL terminated whenever wlen is positive.  A word that
** does not fit is split; the rest is picked up by the next call.
**
** The returned pointer addresses the first character not consumed.
*/
char *getword(buf, word, wlen)
char *buf, *word;
int wlen;
{
	/* isspace('\0') is false, so this cannot run off the string */
	while (isspace((unsigned char)*buf))
		buf++;
	/* stop one short of wlen so the terminating NUL always fits */
	while (wlen > 1 && *buf != '\0' && !isspace((unsigned char)*buf))
	{
		*word++ = *buf++;
		wlen--;
	}
	if (wlen > 0)
		*word = '\0';
	return (buf);
}
/*
** Lookup several words
**
** words is a white space separated list.  The alternates generated for
** every word are written to a temporary file, which is fed to
** "spellout -d dictfile" (piped through "sort -u" unless sortuniq is
** clear).  Whatever that command prints is collected in alternates,
** which is altlen bytes long; output beyond altlen bytes is dropped.
** The result is not NUL terminated.
**
** Returns the number of bytes stored in alternates, or -1 if the
** command could not be built or started.  Failure to create the
** temporary file ends the program through chkerror().
**
** NOTE(review): dictfile is pasted into a shell command unquoted, so it
** must not contain shell metacharacters.
*/
int lookup(words, alternates, altlen)
char *words, *alternates;
int altlen;
{
	register int l, ch;
	register char *p;
	register FILE *tempf, *cmdpipe;
	int fd;
	char word[64];
	char cmd[1024];
	/* writable copy: mkstemp() overwrites the XXXXXX in place */
	char tempfile[] = "/tmp/correctXXXXXX";
	int mkstemp();
	char *getword();
	FILE *fdopen(), *popen();

	/* mkstemp() creates and opens the file in one step */
	if ((fd = mkstemp(tempfile)) < 0)
		chkerror(-1, tempfile);
	if ((tempf = fdopen(fd, "w")) == NULL)
		chkerror(-1, tempfile);
	p = words;
	for (;;)
	{
		/* one less than the buffer size, so the NUL always fits */
		p = getword(p, word, (int)sizeof(word) - 1);
		/*
		** Stop on an empty word rather than on the end of the
		** input, so a last word with nothing after it is kept.
		*/
		if (word[0] == '\0')
			break;
		if (debug) printf("<%s>\n", word);
		l = altgen(word, alternates, altlen);
		if (l > 0)
			fwrite(alternates, sizeof(char), l, tempf);
	}
	if (fclose(tempf) == EOF)
	{
		unlink(tempfile);
		return (-1);
	}
	/* the fixed text of the command, with its NUL, is under 32 bytes */
	if (strlen(dictfile) + strlen(tempfile) + 32 > sizeof(cmd))
	{
		fprintf(stderr, "correct: dictionary name too long\n");
		unlink(tempfile);
		return (-1);
	}
	sprintf(cmd, "spellout -d %s < %s %s", dictfile, tempfile,
		sortuniq ? "| sort -u" : "");
	if ((cmdpipe = popen(cmd, "r")) == NULL)
	{
		unlink(tempfile);
		return (-1);
	}
	p = alternates;
	/* test for room before storing, never after */
	while (p - alternates < altlen && (ch = getc(cmdpipe)) != EOF)
		*p++ = ch;
	pclose(cmdpipe);
	unlink(tempfile);
	return (p - alternates);
}
#ifdef AMOEBA
/*
** Server mode: register the port "bodict", then loop forever reading a
** request into ibuf, looking the words up and sending the alternates
** back with putrep().  Never returns; failure of amoeba_init() ends the
** program through chkerror().
**
** NOTE(review): assumes getreq() never returns more than the size it is
** given, and that a zero length reply is acceptable when lookup() fails
** -- confirm against the Amoeba transaction library.
*/
void dictserver()
{
	register int n;
	int amoeba_init(), getreq(), putrep(), lookup();

	strncpy((char *)&hdr.h_port, "bodict", PORTSIZE);
	chkerror(amoeba_init(&hdr.h_port), "init");
	for (;;)
	{
		/* hold one byte of ibuf back for the terminating NUL */
		if ((n = getreq(&hdr, ibuf, sizeof(ibuf) - 1)) < 0)
		{
			perror("getreq");
			continue;
		}
		ibuf[n] = '\0';
		/* never hand a negative length to putrep() */
		if ((n = lookup(ibuf, buf, sizeof(buf))) < 0)
			n = 0;
		if (putrep(&hdr, buf, n) < 0)
			perror("putrep");
	}
}
#endif /* AMOEBA */
/*
** Fold every upper case letter of the NUL terminated string p to lower
** case, in place.  Other characters are left alone.
*/
void lower(p)
char *p;
{
	unsigned char c;

	for ( ; *p != '\0'; p++)
	{
		/* ctype functions need an unsigned char value */
		c = (unsigned char)*p;
		if (isupper(c))
			*p = tolower(c);
	}
}
/*
** Parse the options, then either run as a server (-S, only when built
** with AMOEBA) or look up the words given as arguments.  With no word
** arguments one line is read from the standard input instead.  The
** alternates found are written to file descriptor 1.
**
** Exits with status 1 on a usage error, when the arguments do not fit
** in ibuf, or when lookup() fails; returns 0 otherwise.
*/
int main(argc, argv)
int argc;
char *argv[];
{
	register int i; /* the option flag name, later the reply length */
	register char *words; /* next free byte of ibuf */
	register int n;
	char *progname;
	extern int optind; /* defined in getopt */
	extern char *optarg; /* defined in getopt */
	int getopt();

	progname = argv[0];
	while ((i = getopt (argc, argv, "DSd:fs")) != EOF)
	{
		switch (i)
		{
		case 'D': debug++; break;
		case 'S': server++; break;
		case 's': sortuniq = 0; break;
		case 'd': dictfile = optarg; break;
		case 'f': fold++; break;
		default:
			fprintf (stderr, "usage: %s [-DSfs] [-d dictfile] [words]\n", argv[0]);
			exit (1);
		}
	}
	signal(SIGTERM, cleanup);
#ifdef AMOEBA
	if (server)
		dictserver();
	else
#endif /* AMOEBA */
	{
		words = ibuf;
		for (argc -= optind, argv += optind; argc > 0; argc--, argv++)
		{
			n = strlen(*argv);
			/* the word, its separating blank and the final NUL */
			if (n + 2 > (int)sizeof(ibuf) - (int)(words - ibuf))
			{
				fprintf(stderr, "%s: too many words\n", progname);
				exit(1);
			}
			strcpy(words, *argv);
			words += n;
			*words++ = ' ';
		}
		if (words == ibuf)
		{
			/* no word arguments: take one line of input */
			if (fgets(ibuf, sizeof(ibuf), stdin) == NULL)
				ibuf[0] = '\0';
		}
		else
			*words = '\0';	/* words already points at the end */
		if (fold) lower(ibuf);
		chkerror((i = lookup(ibuf, buf, sizeof(buf))), "pipe");
		write(1, buf, i);
	}
	return (0);
}
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'word.c'" '(2399 characters)'
if test -f 'word.c'
then
echo shar: will not over-write existing file "'word.c'"
else
cat << \SHAR_EOF > 'word.c'
/*
** (C) Centrum voor Wiskunde en Informatica, 1985
**
** This software may be freely distributed and used, save
** for profit or military purposes, provided always this notice
** is retained.
**
** No warranty is made on the suitability of this software
** for any purpose whatsoever.
**
** Last modified:
**
** Ken Yap (CWI) August 1985
*/
#include <ctype.h>
#include "word.h"
/*
** Produce the next variant of word for the operation op.
**
** transform(word, INIT, result) must come first: it records the length
** of the word and resets the position and letter counters, which live
** in static storage.  Each later call with DEL1CHAR, SWAP2CHARS,
** CHG1CHAR or ADD1CHAR stores one more variant in result, NUL
** terminated, and returns 1; it returns 0 once that operation has no
** variants left.  An unknown op also returns 0.
**
** result must have room for strlen(word) + 2 bytes.  Because the state
** is static, only one word can be worked on at a time.
*/
int transform(word, op, result)
char *word, *result;
int op;
{
	register int i;
	static struct {
		int len, pos;
		char let;
	} context;

	switch (op)
	{
	case INIT:
		context.len = strlen(word);
		context.pos = 0;
		context.let = isupper((unsigned char)*word) ? 'A' : 'a';
		break;
	case DEL1CHAR:
		/* drop the character at pos */
		if (context.pos >= context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		for (i = context.pos + 1; i < context.len; i++)
			*result++ = word[i];
		context.pos++;
		break;
	case SWAP2CHARS:
		/*
		** Swapping two equal letters gives the word back, so skip
		** such pairs.  This is done before anything is stored:
		** skipping after the prefix has been copied would copy
		** the prefix a second time.
		*/
		while (context.pos < context.len - 1 &&
		    word[context.pos] == word[context.pos + 1])
			context.pos++;
		if (context.pos >= context.len - 1)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = word[i+1];
		*result++ = word[i];
		for (i = context.pos + 2; i < context.len; i++)
			*result++ = word[i];
		context.pos++;
		break;
	case CHG1CHAR:
		/* replace the character at pos by each letter in turn */
		if (context.pos >= context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = context.let;
		for (i = context.pos + 1; i < context.len; i++)
			*result++ = word[i];
		if (context.let == 'Z' || context.let == 'z')
		{
			/* alphabet used up here: restart at the next position */
			context.pos++;
			context.let = (context.pos < context.len &&
			    isupper((unsigned char)word[context.pos])) ? 'A' : 'a';
		}
		else
			context.let++;
		break;
	case ADD1CHAR:
		/* insert each letter in turn before pos; pos == len appends */
		if (context.pos > context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = context.let;
		for (i = context.pos; i < context.len; i++)
			*result++ = word[i];
		if (context.let == 'Z' || context.let == 'z')
		{
			/*
			** pos may now be len + 1, so look at word[pos]
			** only while it is still inside the word.
			*/
			context.pos++;
			context.let = (context.pos < context.len &&
			    isupper((unsigned char)word[context.pos])) ? 'A' : 'a';
		}
		else
			context.let++;
		break;
	default:
		/* unknown operation: report "no variants", never loop */
		return (0);
	}
	*result = '\0';
	return (1);
}
#ifdef TEST
/*
** Test driver: print, under a heading for each operation, every variant
** of the first argument for the operations DEL1CHAR up to SWAP2CHARS.
** Exits with status 1 when no argument is given, 0 otherwise.
*/
main(argc, argv)
int argc;
char *argv[];
{
	char variant[5120];
	char *subject;
	register int which;

	if (argc < 2)
		exit(1);
	subject = argv[1];
	which = DEL1CHAR;
	while (which <= SWAP2CHARS)
	{
		printf("Transformation #%d\n", which);
		/* restart the generator, then drain it */
		for (transform(subject, INIT, variant);
		    transform(subject, which, variant); )
			printf("%s\n", variant);
		which++;
	}
	exit(0);
}
#endif TEST
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'word.h'" '(94 characters)'
if test -f 'word.h'
then
echo shar: will not over-write existing file "'word.h'"
else
cat << \SHAR_EOF > 'word.h'
/*
** Operation codes for transform().  altgen() steps through the values
** DEL1CHAR to ADD1CHAR in order, so those four must stay consecutive.
*/
/* reset the saved length, position and letter for a new pass */
#define INIT 0
/* variants with one character deleted */
#define DEL1CHAR 1
/* variants with two adjacent characters exchanged */
#define SWAP2CHARS 2
/* variants with one character replaced by another letter */
#define CHG1CHAR 3
/* variants with one letter inserted */
#define ADD1CHAR 4
SHAR_EOF
fi # end of overwriting check
# End of shell archive
exit 0
--
UUCP: ..!{seismo,okstate,garfield,decvax,philabs}!mcvax!ken Voice: Ken!
Mail: Centrum voor Wiskunde en Informatica, Kruislaan 413, 1098 SJ, Amsterdam.
More information about the Comp.sources.unix
mailing list