You, too, can look at strings.

Erik Naggum enag at ifi.uio.no
Sat Feb 23 09:36:38 AEST 1991


Hi, Cameron!

In article <1991Feb20.150204.3815 at lgc.com> you write:

>   7.  One reader wrote that he'd send a finite-state machine which
>       models C syntax as soon as he found his copy.  I haven't heard
>       from him since.  I'll pass it along when it arrives.

Hmmm, it must've lost in the fight for survival across the net, which
may explain why I never heard from you, again, either.  :-)

Here's the code, with added table lookup optimization and support for those
who wish to scan the preprocessor directives since the first issue.  This was
some work to get right, so I haven't placed it in the public domain.  Some
readers of these newsgroups may frown upon that, but if you don't like it the
way I did it, or my reasons for copyrighting my work, roll your own.  I'd
appreciate credit if you glean principles of operation from me.  Otherwise,
people are free to use this for non-commercial purposes.  (If you wish to use
this in production code, please drop me a line.)

This is all original code.

The intent is that if the C compiler finds a string, this will find
it, too.

----------------------------------------------------------------------
/*
 *	State machine to trivially parse C code and find strings.
 *
 *	Copyright Erik Naggum, 1991.	Contact: <erik at naggum.uu.no>
 *
 *	Permission to use this code for any non-commercial purpose is
 *	hereby granted, provided that the above copyright notice and
 *	this permission clause are included unchanged.  Please inform
 *	the author if you plan to use this code in an article or
 *	otherwise publish it.  If you change the code, indicate which
 *	sections are yours and which are mine in an unambiguous manner.
 */

/*
 * This code is written with portability in mind, following POSIX.1 [1]
 * and the ANSI C standard [2].  No system dependent features are used.
 *
 * [1] IEEE 1003.1-1990, or ISO 9945-1:1990
 * [2] ANSI X3.159(1989), or ISO 9989:1990
 */

/*
 * Usage: <name> [file ...]
 */

#include <stdio.h>

/*
 * NOTE:
 *	If you wish to find strings also in the preprocessor lines,
 *	define the PREPROC_STRINGS macro when compiling.  The code is
 *	intended to be used on preprocessed output, as ANSI C macros
 *	may contain ## (pasting) and # (stringifying) operators, which
 *	are both hard to handle without extensive support for the
 *	preprocessor, much like the preprocessor itself...
 *
 *	It's decidedly non-trivial to make this decision run-time.
 */

/*
 * Names of states.  Order is not significant.
 */
typedef enum {
	Code,
#if !defined (PREPROC_STRINGS)
	New_Line,
	Preprocessor,
#endif
	String,
	String_Escaped,
	Char,
	Char_Escaped,
	Enter_Comment,
	Comment,
	Exit_Comment,
	Line_Input,
	Line_Escape,
} state;

struct transition;

/*
 * Prototype declarations.
 */
int enter_string (char);
int exit_string (char);
int pass (char);
int passesc (char);
int rescan (char);
int write_char (char);
void engine (FILE *);
void process(struct transition *, state *, char);

/*
 * Magic code which matches any character when scanning transition table
 */
#define ANY -1

/*
 * Distance value for Code state (with and without PREPROC_STRINGS defined)
 */
#if !defined(PREPROC_STRINGS)
#   define CODE_DIST 5
#else
#   define CODE_DIST 4
#endif

/*
 * NOTES:
 * (1) FSA may be optimized by sorting states by frequency of occurrence
 * (2) All states must have a match for all characters (use ANY to fill)
 * (3) Don't change the `rescan' dispositions
 */
struct transition {
	state this_state;		/* state in which condition applies */
	int condition;			/* transition condition (match) */
	state next_state;		/* new state if condition true */
	int (*disposition)(char);	/* action before next state */
	int distance;			/* optimized table search support */
} code_transitions[] = {
	Code,		'"',	String,		enter_string,	CODE_DIST,
	Code,		'\'',	Char,		NULL,		0,
	Code,		'/',	Enter_Comment,	NULL,		0,
#if !defined(PREPROC_STRINGS)
	Code,		'\n',	New_Line,	NULL,		0,
#endif
	Code,		ANY,	Code,		NULL,		0,
	Comment,	'*',	Exit_Comment,	NULL,		2,
	Comment,	ANY,	Comment,	NULL,		0,
#if !defined (PREPROC_STRINGS)
	New_Line,	'#',	Preprocessor,	NULL,		2,
	New_Line,	ANY,	Code,		rescan,		0,
	Preprocessor,	'\n',	Code,		NULL,		2,
	Preprocessor,	ANY,	Preprocessor,	NULL,		0,
#endif	
	String,		'"',	Code,		exit_string,	4,
	String,		'\n',	Code,		exit_string,	0,
	String,		'\\',	String_Escaped,	write_char,	0,
	String,		ANY,	String,		write_char,	0,
	String_Escaped,	ANY,	String,		write_char,	1,
	Char,		'\'',	Code,		NULL,		4,
	Char,		'\\',	Char_Escaped,	NULL,		0,
	Char,		'\n',	Code,		NULL,		0,
	Char,		ANY,	Char,		NULL,		0,
	Char_Escaped,	ANY,	Code,		NULL,		1,
	Enter_Comment,	'*',	Comment,	NULL,		2,
	Enter_Comment,	ANY,	Code,		rescan,		0,
	Exit_Comment,	'/',	Code,		NULL,		2,
	Exit_Comment,	ANY,	Comment,	NULL,		0,
};

/*
 * Separate FSA for the input line handling, to avoid multiplicity of
 * states in the code_transitions.  Also reflects ANSI C translation
 * phase 2.  (Phase 1 is assumed to be empty.)
 */
struct transition line_transitions[] = {
	Line_Input,	'\\',	Line_Escape,	NULL,		2,
	Line_Input,	ANY,	Line_Input,	pass,		0,
	Line_Escape,	'\n',	Line_Input,	NULL,		2,
	Line_Escape,	ANY,	Line_Input,	passesc,	0,
};

static state line_state;
static state code_state;
static char *input_file;

/*
 * Process arguments (file names) and options (none at present)
 */
void main (int argc, char *argv[])
{
	int argp;

	/*
	 * Not very elegant way to force standard input in the absence of
	 * arguments.  Suggestions for improvements are welcome.
	 */
	static char *no_files[] = { "", "-", NULL };
	if (argc <= 1)
		argv = no_files;

	for (argp = 1; argv[argp]; argp++) {
		FILE *input;
		input_file = argv[argp];
		if (strcmp (input_file, "-") == 0) {
			engine (stdin);
			continue;
		}
		input = fopen (input_file, "r");
		if (input == NULL) {
			perror (input_file);
			continue;
		}
		engine (input);
		fclose (input);
	}
	exit (0);
}
		
/*
 * Initialize FSA and process input file
 */
void engine (FILE *input)
{
	int c;

	line_state = Line_Input;
#if !defined(PREPROC_STRINGS)
	code_state = New_Line;
#else
	code_state = Code;
#endif
	while ((c = fgetc(input)) != EOF)
		process (line_transitions, &line_state, c);
}

/*
 * Line handler - pass character to code handler
 */
int pass (char c)
{
	process (code_transitions, &code_state, c);
	return 0;
}

/*
 * Line handler - pass escape when not followed by newline,
 * then rescan the input character.
 */
int passesc (char c)
{
	process (code_transitions, &code_state, '\\');
	return 1;
}

/*
 * Cause current character to be reused in new state
 */
int rescan (char c)
{
	return 1;
}

/*
 * We have entered a string -- print file name
 */
int enter_string (char c)
{
	printf ("%s:\t%c", input_file, c);
	return 0;
}

/*
 * The string is terminated -- finish string processing
 */
int exit_string (char c)
{
	if (c == '\n')
		printf ("\" (unterminated)\n");
	else
		printf ("%c\n", c);
	return 0;
}

/*
 * In string -- print each character out
 */
int write_char (char c)
{
	putchar (c);
	return 0;
}

/*
 * The heart of the Finite State Automaton
 *
 * Note the rescan support.  This code would have been cleaner without
 * it, but other code would suffer.  Better to have it one place, only.
 *
 */
void process (struct transition *transitions, state *in_state, char c)
{
	struct transition *work;
	
	do {
		work = transitions;
		
		while (work -> this_state != *in_state)
			work += work -> distance;

		while (work -> condition != c && work -> condition != ANY)
			++ work;
		*in_state = work -> next_state;
		if (work -> disposition == NULL)
			break;
	} while (work -> disposition (c));
}
----------------------------------------------------------------------

Enjoy!  Bug fixes, if any, will be appreciated.

--
[Erik Naggum]					     <enag at ifi.uio.no>
Naggum Software, Oslo, Norway			   <erik at naggum.uu.no>
-- 
Send compilers articles to compilers at iecc.cambridge.ma.us or
{ima | spdcc | world}!iecc!compilers.  Meta-mail to compilers-request.



More information about the Comp.unix.questions mailing list