Want C syntax in lex
Henry Spencer
henry at utzoo.uucp
Fri Mar 31 07:20:11 AEST 1989
In article <1989Mar29.224649.5766 at utzoo.uucp> I wrote:
>I did a lex description of C tokens a couple of years ago which I can post
>if people are interested. It's not up-to-date...
Enough people have already expressed interest for me to post it. I did a
small update on it at the same time, so it is reasonably current. Read
the comment at the top before getting too confident, though. Note also
that it implements *exactly* ANSI C and makes no attempt at clean error
recovery. I personally don't consider it a useful base for major software
work -- you just cannot analyze C properly without a full preprocessor --
but it is useful for things like statistics gathering.
----------
%{
/*
* ctokens - print tokens of a C or C++ program
*
* Full ANSI C (draft of 31 Oct 1988) except: no trigraphs; copes with
* backslash-newline stripping only inside strings; imperfect understanding
* of the context-dependent rule that makes <bletch.h> a single token
* inside a #include. The only C++ issues are the "::" operator and "//"
* comments.
*
* There are some limitations inherent in not doing preprocessing. In
* ANSI C, characters that look illegal at first glance can disappear
* from the source during preprocessing, either by being #ifdefed out
* or by vanishing into a string. This code does not consider that.
* Preprocessor numbers can also do strange things, again not considered.
*
* There are also some implementation-dependent decisions in areas like
* the exact syntax of header names; we don't try to be smart about this.
*
* Except for newlines, any white-space character is printed as "\t".
* It would be more sensible to make the white-space expression [ \t\v\f]+
* instead of just [ \t\v\f], but some old lexes have problems with that.
*
* Note that this program uses one (sigh) undocumented feature of Unix lex:
* the ability to override the choice of input stream by assigning to yyin.
* Avoiding this requires reimplementing lex's input functions, which is a
* pain because getc/ungetc isn't good enough.
*
* $Log$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
/* STREQ: string equality; compares the first characters before calling strcmp. */
#define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0)
#ifndef lint
static char RCSid[] = "$Header$";
#endif
int debug = 0; /* incremented once for each -d option seen by main */
char *progname; /* set by main from argv[0]; printed by error() and the usage message */
extern void error(), exit();
/* mkprogname is an external function when UTZOOERR is defined, otherwise it yields its argument. */
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define mkprogname(a) (a)
#endif
/* PRINTIT: print the text of the current token on a line of its own. */
#define PRINTIT printf("%s\n", yytext)
int cflag = 0; /* C only: set by -C; makes the "::" and "//" rules REJECT */
/*
 * stuff for stupid context-dependent #include <name>
 *
 * `state' records how much of "# include" has just been seen, so that the
 * header-name rule accepts <name> as one token only in state SAWINC.
 */
#define SAWNL 0 /* a newline was the last thing to set the state (also the initial value) */
#define SAWNUM 1 /* "#" was seen while in SAWNL */
#define SAWINC 2 /* the identifier "include" was seen while in SAWNUM */
#define OTHER 3 /* any other identifier was seen, or a header name was just printed */
int state = SAWNL;
/* PS: state-tracing hook; the printing version is kept here but disabled. */
/* #define PS printf("state %d\n", state) */
#define PS /* */
/*
 * The lex definitions that follow this C section are used by the number
 * rule: EXP is an exponent part, FS a floating suffix (f, l, F or L), and
 * IS an integer suffix (u and/or l, either order, either case).
 */
%}
EXP ([eE][+-]?[0-9]+)
FS [flFL]
IS ([uU][lL]?|[lL][uU]?)
%%
[_a-zA-Z][_a-zA-Z0-9]*	{
			/*
			 * identifier (keywords included): print it, then note
			 * whether it was the word include directly after a #
			 */
			PRINTIT;
			state = (state == SAWNUM && strcmp(yytext, "include") == 0)
				? SAWINC : OTHER;
			PS;
		}
[0-9]+"."[0-9]*{EXP}?{FS}? |
"."[0-9]+{EXP}?{FS}? |
[0-9]+{EXP}{FS}? |
[1-9][0-9]*{IS}? |
0[0-7]*{IS}? |
0[xX][0-9a-fA-F]+{IS}? { PRINTIT; /* number: the first three patterns are floating forms, then decimal, octal and hex integers; all six share this action */ }
L?\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]+))+\' {
PRINTIT; /* character constant, with optional L prefix; one or more plain characters or escapes (simple, octal, or hex introduced by x or X) */
}
L?\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]+))*\"	{
			/*
			 * string literal: print it on one line, leaving out
			 * every backslash-newline pair found inside it
			 */
			register char *cp = yytext;

			while (*cp != '\0') {
				if (cp[0] == '\\' && cp[1] == '\n') {
					cp += 2;	/* drop both characters */
					continue;
				}
				putchar(*cp);
				cp++;
			}
			putchar('\n');
		}
"#"		{
			/*
			 * a # moves the include tracking forward only while
			 * the state is still SAWNL; it is printed either way
			 */
			PRINTIT;
			if (state == SAWNL)
				state = SAWNUM;
			PS;
		}
"<"[^>\n]*">"	{
			/*
			 * possible header name: printed as a single token
			 * only in state SAWINC, otherwise handed back with
			 * REJECT so that other rules match the text
			 */
			PS;
			if (state == SAWINC)
				PRINTIT;
			else {
				REJECT;
			}
			state = OTHER;
		}
[-()&*+~!/%<>^|,.=;:{}?] |
"[" |
"]" |
"->" |
"++" |
"--" |
"<<" |
">>" |
"<=" |
">=" |
"==" |
"!=" |
"&&" |
"||" |
"##" |
"..." |
[-*/%+&^|]"=" |
"<<=" |
">>="		{
			/* operators and punctuation: every pattern above lands here */
			PRINTIT;
		}
"::"		{
			/* scope operator: printed unless -C was given, in which case it is rejected */
			if (!cflag)
				PRINTIT;
			else {
				REJECT;
			}
		}
\n		{
			/* end of line: reset the include tracking and show the newline as \n */
			state = SAWNL;
			PS;
			printf("\\n\n");
		}
[ \t\v\f]	printf("\\t\n");
"/*"		{
			/*
			 * Block comment.  The text is thrown away; the output
			 * is the opening marker, a \n for each of the first ten
			 * newlines inside (then three dots), and the closing
			 * marker.
			 *
			 * The previous character is remembered instead of
			 * pushing a character back after each star, so the
			 * end-of-input value is never handed to unput.  End of
			 * input is accepted as either 0 or EOF because lex
			 * implementations differ on what input() returns
			 * there; checking only for 0 can loop forever.
			 */
			register int ch;
			register int prev = '\0';
			register int nnl = 0;

			printf("/* ");
			for (;;) {
				ch = input();
				if (ch == '\0' || ch == EOF) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);	/* failure, not success */
				}
				if (prev == '*' && ch == '/')
					break;
				if (ch == '\n') {
					nnl++;
					if (nnl <= 10)
						printf("\\n");
					if (nnl == 10)
						printf("...");
				}
				prev = ch;
			}
			printf(" */\n");
		}
"//"		{
			/*
			 * Line comment, unless -C was given, in which case the
			 * text is rejected and rescanned by other rules.  Only
			 * the marker is printed; the rest of the line is
			 * skipped and the newline is pushed back so that the
			 * newline rule still sees it.  End of input is checked
			 * as 0 or EOF for the same reason as in the block
			 * comment rule, and is reported with a failing status.
			 */
			register int ch;

			if (cflag) {
				REJECT;
			} else {
				printf("//\n");
				while ((ch = input()) != '\n')
					if (ch == '\0' || ch == EOF) {
						fprintf(stderr, "unterminated comment!\n");
						exit(1);
					}
				unput(ch);
			}
		}
.		{
			/* any character that no other rule accepts */
			printf("%c ???\n", yytext[0]);
		}
%%
/*
 - main - parse arguments and handle options
 *
 * Options: -C sets cflag (C only, no C++); -d increments debug.
 * Each remaining argument is opened and passed to process() in turn,
 * with "-" standing for standard input; when there are no file
 * arguments, standard input is processed.  Directories are refused.
 * Exits with status 2 after a usage error and 0 on normal completion.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern int getopt();	/* no header included by this file declares it */
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	/* getopt is specified to return -1 once the options are exhausted */
	while ((c = getopt(argc, argv, "dC")) != -1) {
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	}
	if (errflg) {
		/* list every option that is actually accepted, -d included */
		fprintf(stderr, "usage: %s [-dC] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc) {
		process(stdin, "stdin");
	} else {
		for (; optind < argc; optind++) {
			if (STREQ(argv[optind], "-")) {
				process(stdin, "-");
				continue;
			}
			in = efopen(argv[optind], "r");
			if (fstat(fileno(in), &statbuf) < 0)
				error("can't fstat `%s'", argv[optind]);
			if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
				error("`%s' is directory!", argv[optind]);
			process(in, argv[optind]);
			(void) fclose(in);
		}
	}
	exit(0);
	/* NOTREACHED */
	return 0;
}
/*
* process - process input file
*/
void
process(in, inname)
FILE *in;
char *inname;
{
yyin = in;
(void) yylex();
}
/*
 - efopen - fopen with error check
 *
 * Opens `name' with the given `mode' and returns the stream.  A failed
 * open is reported through error() rather than returned to the caller.
 */
FILE *
efopen(name, mode)
char *name;
char *mode;
{
	FILE *fp = fopen(name, mode);

	if (fp != NULL)
		return(fp);

	error("can't open `%s'", name);
	return(NULL);
}
/*
 - error - report trouble
 *
 * Prints the program name and a colon on stderr, then the message made
 * from the printf-style format `s1' with `s2' as its one argument, then
 * a newline, and exits with status 1.
 */
void				/* does not return */
error(s1, s2)
char *s1;
char *s2;
{
	(void) fprintf(stderr, "%s: ", progname);
	(void) fprintf(stderr, s1, s2);
	(void) fputc('\n', stderr);
	exit(1);
}
----------
--
Welcome to Mars! Your | Henry Spencer at U of Toronto Zoology
passport and visa, comrade? | uunet!attcan!utzoo!henry henry at zoo.toronto.edu
More information about the Comp.lang.c
mailing list