Pattern-matching source code
John Gordon
gordon at osiris.cso.uiuc.edu
Fri Feb 15 01:37:05 AEST 1991
Hi, Netters. Here is source code for a "grep"-like function. Use
it in your own programs. Note that there are two listings, match.c and
esc.c .
---
John Gordon
Internet: gordon at osiris.cso.uiuc.edu #include <disclaimer.h>
gordon at cerl.cecer.army.mil #include <clever_saying.h>
--------CUT HERE-----------
/*********** MATCH.C ************************ Listing 1 ***********
* Author: Allen I. Holub
* Function: A group of subroutines to find a substring represented
* by a grep-like regular expression in a second string.
* Compiler: Any ANSI Standard C compiler.
* Verified for: Turbo C++ 1.0, Microsoft C 6.0
* Memory Models: any
*
* Compile-time switches: -DMAIN to make a small grep-like main()
*
* (c) C Gazette. May be used freely as long as author and publication
* are acknowledged
*----------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#ifdef DEBUG
#define D(x) x /* If DEBUG is defined, D(printf("hello");) expands */
#else /* to printf("hello"). If DEBUG isn't defined, D(...) */
#define D(x) /* is expanded to an empty string, effectively */
#endif /* removing the printf() statement from the input. */
/* Metacharacters in the input: */
#define BOL '^' /* start-of-line anchor */
#define EOL '$' /* end-of-line anchor */
#define ANY '.' /* matches any character */
#define CCL '[' /* start a character class */
#define CCLEND ']' /* end a character class */
#define NCCL '^' /* negates character class if 1st char. */
#define CLOSURE '*' /* Kleene closure (matches 0 or more) */
#define PCLOSE '+' /* Positive closure (1 or more) */
#define OPT '?' /* Optional closure (0 or 1) */
typedef enum action /* These are put in the pattern string */ {
/* to represent metacharacters. */
M_BOL = ( 0x80 | '^' ), M_EOL = ( 0x80 | '$' ),
M_ANY = ( 0x80 | '.' ), M_CCL = ( 0x80 | '[' ),
M_OPT = ( 0x80 | '?' ), M_CLOSE = ( 0x80 | '*' ),
M_PCLOSE = ( 0x80 | '+' )
}
action;
typedef unsigned char pattern; /* pattern strings are unsigned char */
#define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */
/* action that represents a metacharacter */
#define MAXPAT 128 /* max num. of pattern elements. Remember that */
/* character classes require 17 pattern elements */
/*----------------------------------------------------------------------*/
#define MAPSIZE 16 /* need this many bytes for character class bit map */
/* Advance a pointer into the pattern template */
/* to the next pattern element, this is a +1 for */
/* all pattern elements but M_CCL, where you */
/* to skip past both the M_CCL character and the */
/* bitmap that follows that character */
#define ADVANCE(pat) (pat += (*pat == M_CCL) ? (MAPSIZE+1) : 1)
/* Bitmap functions. Set bit b in the map and */
/* test bit b to see if it was set previously */
#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) )
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
/*----------------------------------------------------------------------*/
#define E_NONE 0 /* Possible return values from pat_err.*/
#define E_ILLEGAL 1 /* Set in makepat() to indicate prob- */
#define E_NOMEM 2 /* lems that came up while making the */
#define E_PAT 3 /* pattern template. */
static int Error = E_NONE; /* error flag, like errno */
int pat_err(){ return Error; } /* returns current error status */
/*----------------------------------------------------------------------*/
static pattern * doccl (pattern *map, unsigned char *src);
extern pattern * makepat (char *exp);
extern char * match (char *look_for, char *in_this_string);
extern int pat_err (void );
extern char * matchs (char *str, pattern *pat, int ret_endp);
static int omatch (char **strp, pattern *pat, char *start);
extern char * patcmp (char *str, pattern *pat, char *start);
extern int esc (char **);
/*----------------------------------------------------------------------*/
pattern *makepat( exp )
char *exp; /* regular expression */
{
/* Make a pattern template from the string pointed to by exp.
* Stop when '\0' or '\n' is found in exp.
* Return: a pointer to the pattern template on success, NULL
* on failure (in which case the pat_err() subroutine
* will return one of the following values:
*
* E_ILLEGAL Illegal input pattern.
* E_NOMEM out of memory
* E_PAT pattern too long.
*/
pattern *pat; /* pattern template being assembled */
pattern *cur; /* pointer to current pattern element */
pattern *prev; /* pointer to previous pattern element */
pat = NULL;
Error = E_ILLEGAL;
if(!*exp || *exp=='\n' )
goto exit;
if( *exp==CLOSURE || *exp==PCLOSE || *exp==OPT )
goto exit;
Error = E_NOMEM;
if( ! (pat = malloc(MAXPAT)) ) /* get pattern buffer */
goto exit;
D( memset(pat, 0, MAXPAT); ) /* zero buffer if debugging */
prev = cur = pat;
Error = E_PAT;
while( *exp && *exp != '\n' ) {
if( cur >= &pat[MAXPAT-1] ) {
free( pat );
pat = NULL;
goto exit;
}
switch( *exp ) {
case ANY:
*cur = M_ANY;
prev = cur++;
++exp;
break;
case BOL:
*cur = (cur == pat) ? M_BOL : *exp;
prev = cur++;
++exp;
break;
case EOL:
*cur = (!exp[1] || exp[1]=='\n') ? M_EOL : *exp;
prev = cur++;
++exp;
break;
case CCL:
if( ((cur - pat) + MAPSIZE) >= MAXPAT ) {
free( pat );
pat = NULL;
goto exit; /* not enough room for bit map */
}
prev = cur;
*cur++ = M_CCL;
exp = doccl( cur, exp );
cur += MAPSIZE ;
break;
case OPT: case CLOSURE: case PCLOSE:
switch( *prev ) {
case M_BOL: case M_EOL:
case M_OPT:
case M_PCLOSE: case M_CLOSE:
free( pat );
pat = NULL;
goto exit;
}
memmove( prev+1, prev, cur-prev );
*prev = (*exp == OPT) ? M_OPT :
(*exp == PCLOSE) ? M_PCLOSE : M_CLOSE ;
++cur;
++exp;
break;
default:
prev = cur;
*cur++ = esc( &exp );
break;
}
}
*cur = '\0';
Error = E_NONE;
exit: return( pat );
}
/*----------------------------------------------------------------------*/
static pattern *doccl( map, src )
pattern *src, *map;
{
/* Set bits in the map corresponding to characters specified in
* the src character class. */
int first, last, negative;
pattern *start = src;
++src; /* skip past the [ */
if( negative = (*src == NCCL) ) /* check for negative ccl */
++src;
start = src; /* start of characters in class */
memset(map, 0, MAPSIZE); /* bitmap initially empty */
while( *src && *src != CCLEND ) {
if( *src != '-') {
first = esc(&src); /* Use temp. to avoid macro */
SETBIT( first, map ); /* side effects. */
}
else if( src == start ) {
SETBIT( '-', map ); /* literal dash at start or end */
++src;
}
else {
++src; /* skip to end-of-sequence char */
if( *src < src[-2] ) {
first = *src;
last = src[-2];
}
else {
first = src[-2];
last = *src;
}
while( ++first <= last )
SETBIT( first, map );
src++;
}
}
if( *src == CCLEND )
++src; /* Skip CCLEND */
if( negative )
for( first = MAPSIZE; --first >= 0 ;)
*map++ ^= ~0; /* invert all bits */
return src;
}
/*---------------------------------------------------------------*/
char *match( look_for, in_this_string )
char *look_for;
char *in_this_string;
{
/* Return a pointer to the characters matching look_for
* if it's in the string, else return NULL. You can use
* the pat_err() subroutine to distinguish a bad regular
* expression from a string-not-found condition [NULL is
* returned from match() in both cases].
*/
pattern *pat;
char *rval = NULL;
if( pat = makepat(look_for) ) /* make the pattern */ {
rval = matchs( in_this_string, pat, 0 ); /* see if it's there */
free( pat );
}
return rval;
}
/*----------------------------------------------------------------------*/
char *matchs( str, pat, ret_endp )
char *str;
pattern *pat;
int ret_endp;
{
/* Uses patcmp() to look for a match of pat anywhere in str using
* a brute-force algorithm. str is a character string while pat is
* a pattern template made by makepat(). Returns:
* 1. NULL if no match was found.
* 2. A pointer the last character satisfying the match if
* ret_endp is true.
* 3. A pointer to the beginning of the matched string
* if ret_endp is false.
*/
char *start;
char *end = NULL;
if( !pat ) /* This lets you do: matchs(str,makepat(...),?);*/
return NULL; /* without horrible consequences if makepat() */
/* fails. */
if( !*str ) {
if((*pat == M_EOL) ||
(*pat == M_BOL && (!pat[1] || pat[1]==M_EOL)))
end = str;
}
else {
start = str; /* Do a brute-force substring search, com- */
while( *str ) /* paring a pattern against the input string */ {
if( !(end = patcmp(str, pat, start)) )
str++;
else /* successful match */ {
if( !ret_endp )
end = str ;
break;
}
}
}
return( end );
}
/*----------------------------------------------------------------------*/
char *patcmp( str, pat, start ) /* formerly amatch() */
char *str, *start;
pattern *pat;
{
/* Like strcmp, but compares str against pat. Each element of str
* is compared with the template until either a mis-match is
* found or the end of the template is reached. In the former
* case a 0 is returned; in the latter, a pointer into str
* (pointing to the last character in the matched pattern)
* is returned. Strstart points at the first character in the
* string, which might not be the same thing as line if the
* search started in the middle of the string.
*/
char *bocl, /* beginning of closure string. */
*end; /* return value: end-of-string pointer. */
if( !pat ) /* make sure pattern is valid */
return( NULL );
while( *pat ) {
if( *pat == M_OPT ) {
/* Zero or one matches. It doesn't matter if omatch
* fails---it will advance str past the character on
* success, though. Always advance the pattern past
* both the M_OPT and the operand.
*/
omatch( &str, ++pat, start );
ADVANCE(pat);
}
else if( !(*pat == M_CLOSE || *pat == M_PCLOSE) ) {
/* Do a simple match. Note that omatch() fails if there's
* still something in pat but we're at end of string.
*/
if( !omatch( &str, pat, start ) )
return NULL;
ADVANCE(pat);
}
else /* Process a Kleene or positive closure */ {
if( *pat++ == M_PCLOSE ) /* one match required */
if( !omatch(&str, pat, start ) )
return NULL;
/* Match as many as possible, zero is okay */
bocl = str;
while ( *str && omatch(&str, pat, start) )
;
/* 'str' now points to the character that made
* made us fail. Try to process the rest of the
* string. If the character following the closure
* could have been in the closure (as in the pattern
* "[a-z]*t") the final 't' will be sucked up in the
* while loop. So, if the match fails, back up a notch
* and try to match the rest of the string again,
* repeating this process recursively until we get back
* to the beginning of the closure. The recursion
* goes, at most, one levels deep.
*/
if( *ADVANCE(pat) ) {
for(; bocl <= str; --str )
if( end = patcmp(str, pat, start) )
break;
return end;
}
break;
}
}
/* omatch() advances str to point at the next
* character to be matched. So str points at the
* character following the last character matched
* when you reach the end of the template. The
* exceptions are templates containing only a
* BOLN or EOLN token. In these cases omatch doesn't
* advance. Since we must return a pointer to the last
* matched character, decrement str to make it point at
* the end of the matched string, making sure that the
* decrement hasn't gone past the beginning of the string.
*
* Note that $ is a position, not a character, but in the case
* of a pattern ^$, a pointer to the end of line character is
* returned. In ^xyz$, a pointer to the z is returned.
*
* The --str is done outside the return statement because
* max() is often a macro that has side-effects.
*/
--str;
return( max(start, str) );
}
/*----------------------------------------------------------------------*/
static int omatch( strp, pat, start )
char **strp;
pattern *pat;
char *start;
{
/* Match one pattern element, pointed at by pat, against the
* character at **strp. Return 0 on a failure, 1 on success.
* *strp is advanced to skip over the matched character on a
* successful match. Closure is handled one level up by patcmp().
*
* "start" points at the character at the left edge of the
* line. This might not be the same thing as *strp if the
* search is starting in the middle of the string. An end-of-
* line anchor matches '\n' or '\0'.
*/
int advance = -1; /* amount to advance *strp, -1 == error */
switch( *pat ) {
case M_BOL: /* First char in string? */
if( *strp == start ) /* Only one star here. */
advance = 0;
break;
case M_ANY: /* . = anything but newline */
if( **strp != '\n' )
advance = 1;
break;
case M_EOL:
if( **strp == '\n' || **strp == '\0' )
advance = 0;
break;
case M_CCL:
if( TSTBIT( **strp, pat + 1 ) )
advance = 1;
break;
default: /* literal match */
if( **strp == *pat )
advance = 1;
break;
}
if( advance > 0 )
*strp += advance;
return( advance + 1 );
}
/*----------------------------------------------------------------------*/
#ifdef MAIN
main( int argc, char **argv )
{
static pattern *pat;
static FILE *inp;
static char inp_buf[ 1024 ];
if( argc < 2 || argv[1][0]=='-' ) {
fprintf(stderr, "Usage: match reg_exp filename\n");
fprintf(stderr, "Usage: match reg_exp < filename\n");
exit( 1 );
}
if( !(pat = makepat( argv[1] )) ) {
fprintf(stderr, "Can't make expression template\n");
exit( 1 );
}
if( argc == 2 )
inp = stdin;
else if( !(inp = fopen(argv[2],"r")) ) {
perror( argv[2] );
exit( 1 );
}
while( fgets( inp_buf, sizeof(inp_buf), inp ) )
if( matchs(inp_buf, pat, 0) )
fputs( inp_buf, stdout );
exit( 0 );
}
#endif
/*********** Listing 2 ****************** ESC.C *******************
* Program: ESC.C
* Author: Allen I. Holub
* Function: A subroutine to parse escape sequences
* Compiler: Any.
* Memory Models: Any
*
* (c) 1990 C Gazette. Use freely if authorship acknowledged.
*----------------------------------------------------------------------
*/
#include <stdio.h>
#include <ctype.h>
static int hex2bin ( int c );
static int oct2bin ( int c );
/*------------------------------------------------------------*/
#define ISHEXDIGIT(x) (isdigit(x) \
|| ('a'<=(x) && (x)<='f') \
|| ('A'<=(x) && (x)<='F') )
#define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
static int hex2bin(c)
int c;
{
/* Convert the hex digit represented by 'c' to an int. 'c'
* must be one of: 0123456789abcdefABCDEF
*/
return (isdigit(c) ? (c)-'0': ((toupper(c))-'A')+10) & 0xf;
}
static int oct2bin(c)
int c;
{
/* Convert the hex digit represented by 'c' to an int. 'c'
* must be a digit in the range '0'-'7'.
*/
return ( ((c)-'0') & 0x7 );
}
/*------------------------------------------------------------*/
int esc(s)
char **s;
{
/* Map escape sequences into their equivalent symbols. Return
* the equivalent ASCII character. *s is advanced past the
* escape sequence. If no escape sequence is present, the
* current character is returned and the string is advanced by
* one. The following are recognized:
*
* \b backspace
* \f formfeed
* \n newline
* \r carriage return
* \s space
* \t tab
* \e ASCII ESC character ('\033')
* \DDD number formed of 1-3 octal digits
* \xDDD number formed of 1-3 hex digits
* \^C C = any letter. Control code
*/
int rval;
if( **s != '\\' )
rval = *( (*s)++ );
else {
++(*s); /* Skip the \ */
switch( toupper(**s) ) {
case '\0': rval = '\\'; break;
case 'B': rval = '\b' ; break;
case 'F': rval = '\f' ; break;
case 'N': rval = '\n' ; break;
case 'R': rval = '\r' ; break;
case 'S': rval = ' ' ; break;
case 'T': rval = '\t' ; break;
case 'E': rval = '\033'; break;
case '^':
rval = *++(*s) ;
rval = toupper(rval) - '@' ;
break;
case 'X':
rval = 0;
++(*s);
if( ISHEXDIGIT(**s) ) {
rval = hex2bin( *(*s)++ );
}
if( ISHEXDIGIT(**s) ) {
rval <<= 4;
rval |= hex2bin( *(*s)++ );
}
if( ISHEXDIGIT(**s) ) {
rval <<= 4;
rval |= hex2bin( *(*s)++ );
}
--(*s);
break;
default:
if( !ISOCTDIGIT(**s) )
rval = **s;
else {
++(*s);
rval = oct2bin( *(*s)++ );
if( ISOCTDIGIT(**s) ) {
rval <<= 3;
rval |= oct2bin( *(*s)++ );
}
if( ISOCTDIGIT(**s) ) {
rval <<= 3;
rval |= oct2bin( *(*s)++ );
}
--(*s);
}
break;
}
++(*s);
}
return rval;
}
More information about the Comp.lang.c
mailing list