"fgrep" doesn't always match everything it should
Guy Harris
guy at sun.uucp
Fri Apr 18 15:59:56 AEST 1986
The S5 "fgrep" has a bug in it. If you have a file with
shxp
hers
his
xk
in it, and you feed that as a pattern file to the S5 "fgrep", it will NOT
match on "shxk"!
Here's a fix. It also includes Doug Gwyn's recent fixes, and some other
changes:
Checks for I/O errors reading files.
Better error messages - uses "perror".
Runs much faster - doesn't call a subroutine to do character
matching (I posted this fix a while ago).
Disallows multiple "-f" flags. (Should also disallow multiple
"-e" flags as well - I just noticed this, so it's not in
this "diff" listing.)
Allows "-e" flag when reading standard input.
*** fgrep.c.orig Thu Apr 17 21:27:30 1986
--- fgrep.c.fixed Thu Apr 17 21:32:27 1986
***************
*** 9,14
*/
#include <stdio.h>
#define MAXSIZ 6000
#define QSIZE 400
--- 9,15 -----
*/
#include <stdio.h>
+ #include <ctype.h>
#define MAXSIZ 6000
#define QSIZE 400
***************
*** 30,35
int nsucc;
long tln;
FILE *wordf;
char *argptr;
extern char *optarg;
extern int optind;
--- 31,37 -----
int nsucc;
long tln;
FILE *wordf;
+ char *wordfname;
char *argptr;
extern char *optarg;
extern int optind;
***************
*** 34,39
extern char *optarg;
extern int optind;
main(argc, argv)
char **argv;
{
--- 36,45 -----
extern char *optarg;
extern int optind;
+ /* The following macro was inserted to allow for the "-i" option */
+
+ #define same(a, b) ((a) == (b) || iflag && ((a) ^ (b)) == ' ' && letter(a) == letter(b))
+
main(argc, argv)
char **argv;
{
***************
*** 63,68
continue;
case 'f':
fflag++;
wordf = fopen(optarg, "r");
if (wordf==NULL) {
--- 69,78 -----
continue;
case 'f':
+ if (fflag) {
+ fprintf(stderr, "fgrep: Only one \"-f\" option allowed\n");
+ exit(2);
+ }
fflag++;
wordfname = optarg;
wordf = fopen(wordfname, "r");
***************
*** 64,70
case 'f':
fflag++;
! wordf = fopen(optarg, "r");
if (wordf==NULL) {
fprintf(stderr, "fgrep: can't open %s\n", optarg);
exit(2);
--- 74,81 -----
exit(2);
}
fflag++;
! wordfname = optarg;
! wordf = fopen(wordfname, "r");
if (wordf==NULL) {
fprintf(stderr, "fgrep: ");
perror(wordfname);
***************
*** 66,72
fflag++;
wordf = fopen(optarg, "r");
if (wordf==NULL) {
! fprintf(stderr, "fgrep: can't open %s\n", optarg);
exit(2);
}
continue;
--- 77,84 -----
wordfname = optarg;
wordf = fopen(wordfname, "r");
if (wordf==NULL) {
! fprintf(stderr, "fgrep: ");
! perror(wordfname);
exit(2);
}
continue;
***************
*** 92,98
}
argc -= optind;
! if (errflg || ((argc <= 0) && !fflag)) {
printf("usage: fgrep %s\n",usage);
exit(2);
}
--- 104,110 -----
}
argc -= optind;
! if (errflg || ((argc <= 0) && !fflag && !eflag)) {
printf("usage: fgrep %s\n",usage);
exit(2);
}
***************
*** 128,134
char *nlp;
if (file) {
if ((fptr = fopen(file, "r")) == NULL) {
! fprintf(stderr, "fgrep: can't open %s\n", file);
retcode = 2;
return;
}
--- 140,147 -----
char *nlp;
if (file) {
if ((fptr = fopen(file, "r")) == NULL) {
! fprintf(stderr, "fgrep: ");
! perror(file);
retcode = 2;
return;
}
***************
*** 187,192
blkno += ccount;
}
}
if ( (vflag && (failed == 0 || xflag == 0)) || (vflag == 0 && xflag && failed) )
goto nomatch;
succeed: nsucc = 1;
--- 200,207 -----
blkno += ccount;
}
}
+ if (ccount <= 0 && ferror(fptr))
+ goto done;
if ( (vflag && (failed == 0 || xflag == 0)) || (vflag == 0 && xflag && failed) )
goto nomatch;
succeed: nsucc = 1;
***************
*** 198,204
}
else {
if (nfile > 1) printf("%s:", file);
! if (bflag) printf("%d:", (blkno-ccount-1)/BUFSIZ);
if (nflag) printf("%ld:", lnum);
if (p <= nlp) {
while (nlp < &buf[2*BUFSIZ]) putchar(*nlp++);
--- 213,219 -----
}
else {
if (nfile > 1) printf("%s:", file);
! if (bflag) printf("%ld:", (blkno-ccount-1)/BUFSIZ);
if (nflag) printf("%ld:", lnum);
if (p <= nlp) {
while (nlp < &buf[2*BUFSIZ]) putchar(*nlp++);
***************
*** 221,226
failed = 0;
}
}
fclose(fptr);
if (cflag) {
if (nfile > 1)
--- 236,249 -----
failed = 0;
}
}
+ done:
+ if (ccount <= 0 && ferror(fptr)) {
+ fprintf(stderr, "fgrep: Read error on ");
+ perror(file ? file : "standard input");
+ retcode = 2;
+ fclose(fptr);
+ return;
+ }
fclose(fptr);
if (cflag) {
if (nfile > 1)
***************
*** 232,241
getargc()
{
register c;
! if (wordf)
! return(getc(wordf));
! if ((c = *argptr++) == '\0')
! return(EOF);
return(c);
}
--- 255,274 -----
getargc()
{
register c;
! if (wordf) {
! c = getc(wordf);
! if (c == EOF) {
! if (ferror(wordf)) {
! fprintf(stderr, "fgrep: Read error on ");
! perror(wordfname);
! exit(1);
! }
! fclose(wordf);
! }
! } else {
! if ((c = *argptr++) == '\0')
! return(EOF);
! }
return(c);
}
***************
*** 303,309
}
overflo() {
! fprintf(stderr, "wordlist too large\n");
exit(2);
}
cfail() {
--- 336,342 -----
}
overflo() {
! fprintf(stderr, "fgrep: wordlist too large\n");
exit(2);
}
cfail() {
***************
*** 310,315
struct words *queue[QSIZE];
struct words **front, **rear;
struct words *state;
register char c;
register struct words *s;
s = w;
--- 343,349 -----
struct words *queue[QSIZE];
struct words **front, **rear;
struct words *state;
+ int bstart;
register char c;
register struct words *s;
s = w;
***************
*** 328,333
front = queue;
else front++;
cloop: if ((c = s->inp) != 0) {
*rear = (q = s->nst);
if (front < rear)
if (rear >= &queue[QSIZE-1])
--- 362,368 -----
front = queue;
else front++;
cloop: if ((c = s->inp) != 0) {
+ bstart = 0;
*rear = (q = s->nst);
if (front < rear)
if (rear >= &queue[QSIZE-1])
***************
*** 337,343
else
if (++rear == front) overflo();
state = s->fail;
! floop: if (state == 0) state = w;
if (state->inp == c) {
qloop: q->fail = state->nst;
if ((state->nst)->out == 1) q->out = 1;
--- 372,381 -----
else
if (++rear == front) overflo();
state = s->fail;
! floop: if (state == 0) {
! state = w;
! bstart = 1;
! }
if (state->inp == c) {
qloop: q->fail = state->nst;
if ((state->nst)->out == 1) q->out = 1;
***************
*** 345,350
}
else if ((state = state->link) != 0)
goto floop;
}
if ((s = s->link) != 0)
goto cloop;
--- 383,392 -----
}
else if ((state = state->link) != 0)
goto floop;
+ else if(bstart == 0){
+ state = 0;
+ goto floop;
+ }
}
if ((s = s->link) != 0)
goto cloop;
***************
*** 351,357
}
}
! /* The following functions were inserted to allow for the "-i" option */
same(a, b)
register int a, b;
--- 393,399 -----
}
}
! /* The following function was inserted to allow for the "-i" option */
letter(c)
register int c;
***************
*** 353,364
/* The following functions were inserted to allow for the "-i" option */
- same(a, b)
- register int a, b;
- {
- return (a == b || iflag && (a ^ b) == ' ' && letter(a) == letter(b));
- }
-
letter(c)
register int c;
{
--- 395,400 -----
/* The following function was inserted to allow for the "-i" option */
letter(c)
register int c;
{
***************
*** 362,370
letter(c)
register int c;
{
! if (c >= 'a' && c <= 'z')
! return (c);
! if (c >= 'A' && c <= 'z')
! return (c + 'a' - 'A');
return(0);
}
--- 398,406 -----
letter(c)
register int c;
{
! if (islower(c))
! return(c);
! if (isupper(c))
! return(_tolower(c));
return(0);
}
The use of a macro for character comparison, as well as this bug fix, came
from the 4.2BSD version of "fgrep".
I think this bug is also in the V7 "fgrep"; the section of code in the S5
"fgrep" where this bug lies is similar to the V7 one and lacks the changes
from 4.2.
Here's an edited "diff" listing of the interesting differences between the
V7 and 4.2 "fgrep", in case you're curious:
24a12
> #include <ctype.h>
37c25
< int bflag, cflag, fflag, lflag, nflag, vflag, xflag;
---
> int bflag, cflag, fflag, lflag, nflag, vflag, xflag, yflag;
93a83,86
> case 'i': /* Berkeley */
> case 'y': /* Btl */
> yflag++;
> continue;
95c88
< fprintf(stderr, "egrep: unknown flag\n");
---
> fprintf(stderr, "fgrep: unknown flag\n");
104c97
< fprintf(stderr, "egrep: can't open %s\n", *argv);
---
> fprintf(stderr, "fgrep: can't open %s\n", *argv);
125a119,120
> # define ccomp(a,b) (yflag ? lca(a)==lca(b) : a==b)
> # define lca(x) (isupper(x) ? tolower(x) : x)
161c158
< if (c->inp == *p) {
---
> if (ccomp(c->inp, *p)) {
174c171
< if (c->inp == *p) {
---
> if (ccomp(c->inp , *p)) {
319a317
> int bstart;
337a336
> bstart = 0;
347c346,349
< floop: if (state == 0) state = w;
---
> floop: if (state == 0) {
> state = w;
> bstart = 1;
> }
349c351
< q->fail = state->nst;
---
> qloop: q->fail = state->nst;
351c353
< continue;
---
> if ((q = q->link) != 0) goto qloop;
354a357,360
> else if(bstart == 0){
> state = 0;
> goto floop;
> }
The interesting things to note are:
1) The "ignore case" flags - "-i" and "-y" - are not in the
V7 version, even though the Berkeley code claims "-y"
is a BTL flag.
The S5 version does have such a flag - but, despite what
the comments might make you expect, it's "-i", not "-y"!
2) The error messages in the V7 "fgrep" call it "egrep", not
"fgrep".
3) The bug in question is fixed by the stuff at the end that
involves the variable "bstart". (I *think* the problem is that
the "fgrep" implementation of the Aho/Corasick algorithms -
"Efficient String Matching: An Aid to Bibliographic Search",
Alfred V. Aho and Margaret J. Corasick, *Communications of the
ACM, June 1975, Volume 18, Number 6, pp. 333 - 340 - was lacking
the part of Algorithm 3 reading
while g(state, a) = fail do state := f(state)
Since "fgrep" doesn't care what particular pattern was
matched, and doesn't bother doing any further string matching
once it's found one match on a line, I think that this part
isn't completely necessary; however, the stuff added by the
bug fix is necessary.)
4) Either this fix, or the one involving "qloop", or both, were
in some update to V7 which I remember seeing. This update
also had "fsck", some fixes to bugs mentioned by Dennis Ritchie
in a talk which was commented on in some issue of ";login"
(one such bug, I remember, was that some error bit for some
DEC tape driver was incorrectly #defined in the driver), new
versions of "f77" and "awk" and some other programs (the
new versions ended up in 4BSD, but didn't completely make it
into System N until S5R2 or so).
This bit of UNIX trivia not brought to you by Abbott Horn (or is it Abbott
and Horn?), makers of Trivial Pursuit.
Another note, on a standard I/O "feature"/bug:
While testing this, I noticed that if you ran "fgrep" and just typed "shkx"
followed by control-D, it waited for another control-D before exiting. I
built it with the 4.2BSD standard I/O library, and it exited as soon as you
typed ^D.
There was a lot of flaming about the so-called "sticky EOF" feature of the
4.2BSD standard I/O a while ago. The 4.2BSD standard I/O returns an EOF on
all attempts to read from a stream which has already gotten an EOF (i.e.,
any stream where the _IOEOF bit is set, so that "feof(stream)" returns
TRUE). This change caused programs which read something from standard input
until EOF, and then start reading more data from standard input without
doing a "clearerr(stdin)", to break. These programs were assuming that
standard input was a terminal (unless they did an "fseek" after the first
EOF, there is no way that subsequent reads from a regular file wouldn't get
EOF, assuming somebody wasn't writing to the file they were reading from).
I believe the change was made in 4.2BSD to fix "fread"s which saw an EOF
before they got all the data requested. This change also fixes the problem
described above, as well as making the behavior of "feof" consistent with
the behavior of reads from the file; other versions of standard I/O can have
"feof(stream)" be true, but have reads from "stream" not return an
end-of-file indication!
--
Guy Harris
{ihnp4, decvax, seismo, decwrl, ...}!sun!guy
guy at sun.arpa
More information about the Net.bugs.usg
mailing list