Newsgroups: comp.sources.unix From: wendt@cs.colostate.edu (alan l wendt) Subject: v28i044: grok - match regular expressions and correct errors, V1.0, Part01/01 Message-id: <1.770288657.8654@gw.home.vix.com> Sender: unix-sources-moderator@gw.home.vix.com Approved: vixie@gw.home.vix.com Submitted-By: wendt@cs.colostate.edu (alan l wendt) Posting-Number: Volume 28, Issue 44 Archive-Name: grok/part01 Grok (get regular expression and make OK) is like grep except that it corrects its input to instances of the regular expression, using a minimum-cost correction. Grok uses flex code. Its only argument is a filename containing character-to-character correction costs, a maximum acceptable cost for the string, and a regular expression. The code has been tested on 286 Xenix 2.3.2 (large model) and DEC 3100 Ultrix 4.2. This is version 1.0. Contributors: Alan Wendt, Vern Paxson, Eugene Myers # This is a shell archive. # Remove everything above and including the cut line. # Then run the rest of the file through sh. #----cut here-----cut here-----cut here-----cut here----# #!/bin/sh # shar: Shell Archiver # Run the following text with /bin/sh to create: # Makefile # README # ccl.c # flexdef.h # grok.1 # misc.c # ncnb.def # nfa.c # parse.h # parse.y # scan.l # sym.c # test.in # test.out # This archive created: Tue Nov 17 09:30:20 1992 cat << \SHAR_EOF > Makefile # # DEC 3100 Ultrix # LDFLAGS = CFLAGS = # # Xenix 286 2.3.2 large model # #LDFLAGS = -M2l -g -F 4000 #CFLAGS = -I$I -DUNIX=1 -LARGE -DUSG=1 -DSV .c.o: ; -$(CC) $(LDFLAGS) $(CFLAGS) -c $*.c # correction scan.c: scan.l flex -I scan.l mv lex.yy.c scan.c REPAIRFILES = nfa.o ccl.o parse.o sym.o misc.o scan.o parse.h parse.c : parse.y yacc -d parse.y @mv y.tab.c parse.c @mv y.tab.h parse.h grok: $(REPAIRFILES) cc $(LDFLAGS) -o grok $(REPAIRFILES) clean: rm *.o *.log scan.c parse.c test: grok grok ncnb.def < test.in > test.out.tmp diff test.out test.out.tmp SHAR_EOF cat << \SHAR_EOF > README Grok (get regular expression and make OK) is like grep except that it corrects its input to instances of the regular expression, using a minimum-cost correction. Grok uses flex code. Its only argument is a filename containing character-to-character correction costs, a maximum acceptable cost for the string, and a regular expression. The code has been tested on 286 Xenix 2.3.2 (large model) and DEC 3100 Ultrix 4.2. Contributors: Alan Wendt, Vern Paxson, Eugene Myers SHAR_EOF cat << \SHAR_EOF > ccl.c /* ccl - routines for character classes */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ #include "flexdef.h" /* ccladd - add a single character to a ccl * * synopsis * int cclp; * char ch; * ccladd( cclp, ch ); */ ccltest( cclp, ch ) struct Ccl *cclp; char ch; { /* SUPPRESS 53 */ return cclp->c[(ch & (CSIZE - 1)) >> 3] & (1 << (ch & 7)); } ccladd( cclp, ch ) struct Ccl *cclp; char ch; { /* SUPPRESS 53 */ cclp->c[(ch & (CSIZE - 1)) >> 3] |= 1 << (ch & 7); } /* cclinit - make an empty ccl * * synopsis * int cclinit(); * cclinit( &newccl ); */ void cclinit( ccl ) struct Ccl *ccl; { bzero(ccl->c, sizeof(*ccl)); } /* cclnegate - negate a ccl * * synopsis * int cclp; * cclnegate( ccl ); */ cclnegate( cclp ) struct Ccl *cclp; { int i; for (i = 0; i < sizeof(*cclp); i++) { cclp->c[i] ^= -1; } } /* BUGGY IF CCL IS EMPTY!!! */ int ccl2nfa( cclp ) struct Ccl *cclp; { int i, j; int result = 0; for (i = 0; i < CSIZE; i++) { j = cclp->c[i >> 3]; if (j & (1 << (i & 7))) { if (result == 0) { result = mkstate( i ); } else { result = mkor( result, mkstate( i ) ); } } } return result; } SHAR_EOF cat << \SHAR_EOF > flexdef.h #include #ifdef SV #include #define bzero(s, n) memset((char *)(s), '\000', (unsigned)(n)) #else #include #endif char *sprintf(); /* keep lint happy */ #define min(x,y) (x < y ? x : y) #define max(x,y) (x > y ? x : y) #define true 1 #define false 0 /* NIL must be 0. If not, its special meaning when making equivalence classes * (it marks the representative of a given e.c.) will be unidentifiable */ #define NIL 0 #define NO_TRANSITION NIL #define INFINITY -1 /* for x{5,} constructions */ /* variables used in the flex input routines: * datapos - characters on current output line * dataline - number of contiguous lines of data in current data * statement. Used to generate readable -f output * skelfile - fd of the skeleton file * yyin - input file * temp_action_file - temporary file to hold actions * action_file_name - name of the temporary file * infilename - name of input file * linenum - current input line number */ extern int linenum; /* size of input alphabet - should be size of ASCII set */ #define CSIZE 128 struct Ccl { char c[CSIZE/8]; }; /* variables for flags: * syntaxerror - true if a syntax error has been found */ extern int syntaxerror; #define SYM_EPSILON 0 /* to mark transitions on the symbol epsilon */ /* variables for nfa machine data: * current_mns - current maximum on number of NFA states * firstst - first physical state of a fragment * lastst - last physical state of fragment * finalst - last logical state of fragment * transchar - transition character that got us into this state * accptnum - accepting number * lastnfa - last nfa state number created * edgeptr - pointer to first Edge out of this state * first logical state of a fragment is its machine number */ #define MAXIMUM_MNS 30000 struct state { struct Edge *edgeptr; int firstst; int lastst; int finalst; int ninedges; int noutedges; int temp; unsigned char transchar; } *states[15000]; extern int current_mns; extern int lastnfa, firstnfa; extern int ccl2nfa(); #if 0 /* * transfrom - from edge state number * transto - to state number */ #define INITIAL_MNE 2000 /* default maximum number of nfa edges */ #define MNE_INCREMENT 1000 /* amount to bump above by if it's not enough */ #define MAXIMUM_MNE 31999 extern int current_mne; /* current max # of edges */ extern int current_ne; /* current # of edges */ #endif struct Edge { struct Edge *next, *prior; /* links all edges from a given state */ int from, to; /* from & to state numbers */ }; /* * num_reallocs - number of times it was necessary to realloc() a group * of arrays */ extern int num_reallocs; extern char *malloc(), *realloc(), *sbrk(); #define allocate_integer_array(size) \ (int *) allocate_array( size, sizeof( int ) ) #define reallocate_integer_array(array,size) \ (int *) reallocate_array( (char *) array, size, sizeof( int ) ) char *allocate_array(), *reallocate_array(); struct hash_entry { struct hash_entry *prev, *next; char *name; char *str_val; int int_val; } ; typedef struct hash_entry *hash_table[]; #define NAME_TABLE_HASH_SIZE 101 #define START_COND_HASH_SIZE 101 /* maximum line length we'll have to deal with */ #define MAXLINE BUFSIZ #define allocate_character_array(size) allocate_array( size, sizeof( char ) ) #define reallocate_character_array(array,size) \ reallocate_array( array, size, sizeof( char ) ) /* * nmstr - last NAME scanned by the scanner */ extern char nmstr[MAXLINE]; /* used to communicate between scanner and parser. The type should really * be YYSTYPE, but we can't easily get our hands on it. */ extern int yylval; char delete_cost[128]; char insert_cost[128]; char correct_cost[128][128]; int tolerance; /* maximum tolerable error */ SHAR_EOF cat << \SHAR_EOF > grok.1 .\" .\" @(#)grok.1 6.5 (CSU) 10/11/91 .\" .TH GROK 1 "November 11, 1991 .SH NAME grok \- Match and repair errors in regular expressions .SH SYNOPSIS .in +\w'\fBgrok \fR'u .ti -\w'\fBgrok \fR'u \fBgrok \fR \fIdefinition-file\fR .SH DESCRIPTION .I Grok matches instances of \fIlex\fR-style regular expressions from the input and prints them. In addition, it tries to repair input lines to make them instances of the regular expression, using single-character insertions, deletions, and substitutions. Different single-character edits can be assigned different costs, and \fIgrok\fR will find a repair of lowest total cost. The algorithm used to find lowest-cost repairs takes time proportional to the product of the length of the input and the cost of the repair. A maximum repair cost tolerance must be supplied. \fIgrok\fR ignores input lines that cannot be repaired within the tolerance. The match is \fIanchored\fR, unlike the matching performed by \fIgrep\fR. The \fIdefinition-file\fR contains entries to define character repair costs, followed by a pair of percent signs alone on a line, followed by a single regular expression. For example, the following defines repair for lines scanned from a bank statement: .ne 4 .nf .ft L % correct anything to anything else for 3 %correct . . 3 % a number of common numeric misreads. 4 is often read as (+ % but we cannot correct string to string. %correct "Z" "2" 1 %correct "z" "2" 1 %correct "S" "5" 1 %correct "s" "5" 1 %correct "l" "1" 1 %correct "i" "1" 1 %correct "I" "1" 1 %correct "o" "0" 1 %correct "O" "0" 1 %correct "r" "5" 1 %correct "?" "9" 1 %correct "+" "4" 1 % default insertion cost is 2 %insert . 2 % charge extra for numbers to avoid hallucinating numbers. %insert [0-9] 3 % delete anything for 2. correction cost <= insert + delete. %delete . 2 %delete [0-9] 3 % the maximum tolerable error cost %tolerance 20 %% ([ ]{20,30}[+ ]([ ]{5,7}[0-9]{5}|"NO ITEM # ")[ ]{3,4}[0-9]{9} [ ]{5,7}[0-1][0-9]-[0-3][0-9][ ]{6,13}(""|[1-9][0-9]{0,2}|[1-9] [0-9]{0,2}","[0-9]{3})\.[0-9]{2}([ ]{5,10}[+ ]([ ]{5,7}[0-9]{5}| "NO ITEM # ")[ ]{3,4}[0-9]{9}[ ]{5,7}[0-1][0-9]-[0-3][0-9][ ]{6,13} (""|[1-9][0-9]{0,2}|[1-9][0-9]{0,2}","[0-9]{3})\.[0-9]{2})?)| (" ENCLOSED NUMBER NUMBER".*) .ft R .fi \fB%correct\fR, \fB%insert\fR, and \fB%delete\fR lines supply repair costs to replace each character with each other, as well as for insertions and deletions. Single-character regular expressions are accepted, and later costs overwrite earlier ones. The regular expression has been broken into several lines for display. It defines the expected structure of important lines on one style of bank statement. \fBGrok\fR outputs the repaired lines. .SH BUGS No provision is made for breaking long defining expressions across lines. The NFA optimization is too slow. .SH AUTHOR Alan Wendt. Contributors include Eugene Myers (repair algorithm), Vern Paxson (\fBflex\fR code). SHAR_EOF cat << \SHAR_EOF > misc.c /* misc - miscellaneous flex routines */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ #include #include "flexdef.h" char *malloc(), *realloc(); /* allocate_array - allocate memory for an integer array of the given size */ char *allocate_array( size, element_size ) int size, element_size; { register char *mem = malloc( (unsigned) (element_size * size) ); if ( mem == NULL ) flexfatal( "memory allocation failed in allocate_array()" ); return ( mem ); } /* clower - replace upper-case letter to lower-case * * synopsis: * char clower(), c; * c = clower( c ); */ char clower( c ) register char c; { return ( isupper(c) ? tolower(c) : c ); } /* copy_string - returns a dynamically allocated copy of a string * * synopsis * char *str, *copy, *copy_string(); * copy = copy_string( str ); */ char *copy_string( str ) register char *str; { register char *c; char *copy; /* find length */ for ( c = str; *c; ++c ) continue; copy = malloc( (unsigned) ((c - str + 1) * sizeof( char )) ); if ( copy == NULL ) flexfatal( "dynamic memory failure in copy_string()" ); /* SUPPRESS 560 */ for ( c = copy; (*c++ = *str++); ) continue; return ( copy ); } /* lerrif - report an error message formatted with one integer argument * * synopsis * char msg[]; * int arg; * lerrif( msg, arg ); */ lerrif( msg, arg ) char msg[]; int arg; { char errmsg[MAXLINE]; (void) sprintf( errmsg, msg, arg ); flexerror( errmsg ); } /* lerrsf - report an error message formatted with one string argument * * synopsis * char msg[], arg[]; * lerrsf( msg, arg ); */ lerrsf( msg, arg ) char msg[], arg[]; { char errmsg[MAXLINE]; (void) sprintf( errmsg, msg, arg ); flexerror( errmsg ); } /* flexerror - report an error message and terminate * * synopsis * char msg[]; * flexerror( msg ); */ flexerror( msg ) char msg[]; { fprintf( stderr, "flex: %s\n", msg ); exit( 1 ); } /* flexfatal - report a fatal error message and terminate * * synopsis * char msg[]; * flexfatal( msg ); */ flexfatal( msg ) char msg[]; { fprintf( stderr, "flex: fatal internal error %s\n", msg ); exit( 1 ); } /* myctoi - return the integer represented by a string of digits * * synopsis * char array[]; * int val, myctoi(); * val = myctoi( array ); * */ int myctoi( array ) char array[]; { int val = 0; (void) sscanf( array, "%d", &val ); return ( val ); } /* myesc - return character corresponding to escape sequence * * synopsis * char array[], c, myesc(); * c = myesc( array ); * */ char myesc( array ) char array[]; { switch ( array[1] ) { case 'n': return ( '\n' ); case 't': return ( '\t' ); case 'f': return ( '\f' ); case 'r': return ( '\r' ); case 'b': return ( '\b' ); case '0': if ( isdigit(array[2]) ) { /* \0 */ char c, esc_char; register int sptr = 2; while ( isdigit(array[sptr]) ) /* don't increment inside loop control because the * macro will expand it to two increments! (Not a * problem with the C version of the macro) */ ++sptr; c = array[sptr]; array[sptr] = '\0'; esc_char = otoi( array + 2 ); array[sptr] = c; if ( esc_char == '\0' ) { synerr( "escape sequence for null not allowed" ); return ( 1 ); } return ( esc_char ); } else { synerr( "escape sequence for null not allowed" ); return ( 1 ); } #ifdef NOTDEF case '^': { register char next_char = array[2]; if ( next_char == '?' ) return ( 0x7f ); else if ( next_char >= 'A' && next_char <= 'Z' ) return ( next_char - 'A' + 1 ); else if ( next_char >= 'a' && next_char <= 'z' ) return ( next_char - 'z' + 1 ); synerr( "illegal \\^ escape sequence" ); return ( 1 ); } #endif } return ( array[1] ); } /* otoi - convert an octal digit string to an integer value * * synopsis: * int val, otoi(); * char str[]; * val = otoi( str ); */ int otoi( str ) char str[]; { #ifdef FTLSOURCE fortran int gctoi() int dummy = 1; return ( gctoi( str, dummy, 8 ) ); #else int result; (void) sscanf( str, "%o", &result ); return ( result ); #endif } #if 0 /* reallocate_array - increase the size of a dynamic array */ char *reallocate_array( array, size, element_size ) char *array; int size, element_size; { register char *new_array = realloc( array, (unsigned) (size * element_size )); if ( new_array == NULL ) flexfatal( "attempt to increase array size failed" ); return ( new_array ); } #endif SHAR_EOF cat << \SHAR_EOF > ncnb.def % correct anything to anything else for 3 %correct . . 3 % a number of common numeric misreads. 4 is often read as (+ % but we cannot correct string to string. %correct "Z" "2" 1 %correct "z" "2" 1 %correct "S" "5" 1 %correct "s" "5" 1 %correct "l" "1" 1 %correct "i" "1" 1 %correct "I" "1" 1 %correct "o" "0" 1 %correct "O" "0" 1 %correct "r" "5" 1 %correct "?" "9" 1 %correct "+" "4" 1 % default insertion cost is 2 %insert . 2 % charge extra for numbers to avoid hallucinations. %insert [0-9] 3 % delete anything for 2. correct cost <= insert + delete. %delete . 2 %delete [0-9] 3 % the maximum tolerable error cost %tolerance 20 %% ([ ]{20,30}[+ ]([ ]{5,7}[0-9]{5}|"NO ITEM # ")[ ]{3,4}[0-9]{9}[ ]{5,7}[0-1][0-9]-[0-3][0-9][ ]{6,13}(""|[1-9][0-9]{0,2}|[1-9][0-9]{0,2}","[0-9]{3})\.[0-9]{2}([ ]{5,10}[+ ]([ ]{5,7}[0-9]{5}|"NO ITEM # ")[ ]{3,4}[0-9]{9}[ ]{5,7}[0-1][0-9]-[0-3][0-9][ ]{6,13}(""|[1-9][0-9]{0,2}|[1-9][0-9]{0,2}","[0-9]{3})\.[0-9]{2})?)|(" ENCLOSED NUMBER NUMBER".*) SHAR_EOF cat << \SHAR_EOF > nfa.c /* nfa - NFA construction routines */ /* Construct a state-labelled epsilon nfa for a regular expression. * Then read input and output a word from the language defined * by the nfa that is as close as possible to the input. */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ /* Changes copyright (c) 1991, Alan Wendt. This work is hereby made * available to the general public, for use by anyone. The changes * may be redistributed only if this notice is removed and the * "Lumberjack Song" substituted in its place, and you must also cut * down the largest tree in the forest with a herring. */ #include "flexdef.h" #define FIRSTST(x) states[x]->firstst #define LASTST(x) states[x]->lastst #define FINALST(x) states[x]->finalst #define EDGEPTR(x) states[x]->edgeptr #define NOUTEDGES(x) states[x]->noutedges #define ACCPTNUM(x) ((x) == finalst) #define NINEDGES(x) states[x]->ninedges #define TRANSCHAR(x) states[x]->transchar #define TEMP(x) states[x]->temp #define EDGE_FROM(x) (x)->from #define EDGE_TO(x) (x)->to #define DELETE_COST(x) delete_cost[x] #define INSERT_COST(x) insert_cost[x] #define CORRECT_COST(x,y) correct_cost[x][y] #define MAXCOST 31 int lastnfa, finalst; int tolerance = 15; /* maximum tolerable error */ #define MAXMEM 32000L char *banks[40]; int nbanks = 0; long memx; /* memory allocation counter */ int globcost; /* cost of repair */ struct BackEdge *globword; /* the repaired word */ static char *getword(); /* reconstruct the repaired word */ static void advance(); char *ralloc(n) /* allocate memory */ int n; { long result; retry: memx += sizeof(int) - 1; /* align */ memx &= ~(sizeof(int) - 1); result = memx; memx += n; if (nbanks == 0 || memx > MAXMEM) { nbanks++; if (nbanks > 40) { fprintf(stderr, "no memory in ralloc\n"); exit(1); } memx = 0; if (banks[nbanks - 1] == NULL) { banks[nbanks - 1] = sbrk((int)MAXMEM); } goto retry; } return banks[nbanks - 1]+result; } /* add_accept - add an accepting state to a machine * * synopsis * * add_accept( mach ); * */ add_accept( mach ) int mach; { finalst = FINALST(mach); /* fprintf(stderr, "final state = %d\n", finalst); */ } /* copysingl - make a given number of copies of a singleton machine * * synopsis * * newsng = copysingl( singl, num ); * * newsng - a new singleton composed of num copies of singl * singl - a singleton machine * num - the number of copies of singl to be present in newsng */ int copysingl( singl, num ) int singl, num; { int copy, i; copy = mkstate( SYM_EPSILON ); for ( i = 1; i <= num; ++i ) copy = link_machines( copy, dupmachine( singl ) ); return ( copy ); } /* dumpnfa - debugging routine to write out an nfa * * synopsis * int state1; * dumpnfa( state1 ); */ dumpnfa( state1 ) int state1; { int ns, j; struct Edge *e; fprintf( stderr, "\n\n********** beginning dump of state-labelled epsilon nfa with start state %d\n", state1 ); /* we probably should loop starting at FIRSTST(state1) and going to * LASTST(state1), but they're not maintained properly when we "or" * all of the rules together. So we use our knowledge that the machine * starts at state 1 and ends at lastnfa. */ /* for ( ns = FIRSTST(state1); ns <= LASTST(state1); ++ns ) */ for ( ns = 1; ns <= lastnfa; ++ns ) { fprintf( stderr, "state # %4d:\t %4d", ns, TRANSCHAR(ns)); if (' ' <= TRANSCHAR(ns) && TRANSCHAR(ns) <= '~') fprintf(stderr, "'%c'", TRANSCHAR(ns)); else fprintf(stderr, " "); fprintf(stderr, ", %d nedges %3d\n", ns == finalst, NOUTEDGES(ns) ); for (e = EDGEPTR(ns), j = 0; j < NOUTEDGES(ns); e = e->next, j++) fprintf( stderr, "from %4d to %4d\n", EDGE_FROM(e), EDGE_TO(e)); } fprintf( stderr, "********** end of dump\n" ); } /* dupmachine - make a duplicate of a given machine * * synopsis * * copy = dupmachine( mach ); * * copy - holds duplicate of mach * mach - machine to be duplicated * * note that the copy of mach is NOT an exact duplicate; rather, all the * transition states values are adjusted so that the copy is self-contained, * as the original should have been. * * also note that the original MUST be contiguous, with its low and high * states accessible by the arrays firstst and lastst */ int dupmachine( mach ) int mach; { int i, j, state, init, last = LASTST(mach), state_offset; struct Edge *e; for ( i = FIRSTST(mach); i <= last; ++i ) { state = mkstate( TRANSCHAR(i) ); } state_offset = state - i + 1; init = mach + state_offset; for ( i = FIRSTST(mach); i <= last; ++i ) for (j = 0, e = EDGEPTR(i); j < NOUTEDGES(i); j++, e = e->next) mkxtion(e->from + state_offset, e->to + state_offset, __LINE__); FIRSTST(init) = FIRSTST(mach) + state_offset; FINALST(init) = FINALST(mach) + state_offset; LASTST(init) = LASTST(mach) + state_offset; return ( init ); } /* link_machines - connect two machines together * * synopsis * * new = link_machines( first, last ); * * new - a machine constructed by connecting first to last * first - the machine whose successor is to be last * last - the machine whose predecessor is to be first * * note: this routine concatenates the machine first with the machine * last to produce a machine new which will pattern-match first first * and then last, and will fail if either of the sub-patterns fails. * FIRST is set to new by the operation. last is unmolested. */ int link_machines( first, last ) int first, last; { #if 0 fprintf(stderr, "link_machines(%d,%d)\n", first, last); #endif if ( first == NIL ) { return ( last ); } else if ( last == NIL ) { return ( first ); } else { mkxtion( FINALST(first), last, __LINE__ ); FINALST(first) = FINALST(last); LASTST(first) = max( LASTST(first), LASTST(last) ); FIRSTST(first) = min( FIRSTST(first), FIRSTST(last) ); return ( first ); } } /* mkclos - convert a machine into a closure * * synopsis * new = mkclos( state ); * * new - a new state which matches the closure of "state" */ int mkclos( state ) int state; { return ( mkopt( mkposcl( state ) ) ); } /* mkopt - make a machine optional * * synopsis * * new = mkopt( mach ); * * new - a machine which optionally matches whatever mach matched * mach - the machine to make optional * * notes: * 1. mach must be the last machine created * 2. mach is destroyed by the call */ int mkopt( mach ) int mach; { return mkor( mach, mkstate( SYM_EPSILON ) ); } /* mkor - make a machine that matches either one of two machines * * synopsis * * new = mkor( first, second ); * * new - a machine which matches either first's pattern or second's * first, second - machines whose patterns are to be or'ed (the | operator) * * note that first and second are both destroyed by the operation * the code is rather convoluted because an attempt is made to minimize * the number of epsilon states needed */ int mkor( first, second ) int first, second; { int eps, result, newedges = 0; #if 0 fprintf(stderr, "mkor(%d,%d)", first, second); #endif if ( first == NIL ) return ( second ); else if ( second == NIL ) return ( first ); eps = mkstate( SYM_EPSILON ); mkxtion( FINALST(first), eps, __LINE__ ); mkxtion( FINALST(second), eps, __LINE__ ); FINALST(second) = FINALST(first) = eps; LASTST(first) = LASTST(second) = eps; newedges += 2; eps = mkstate( SYM_EPSILON ); mkxtion( eps, first, __LINE__ ); mkxtion( eps, second, __LINE__ ); LASTST(first) = LASTST(second) = eps; newedges += 2; result = eps; FINALST(result) = FINALST(first); if ( FIRSTST(first) < FIRSTST(result) ) FIRSTST(result) = FIRSTST(first); if ( FIRSTST(second) < FIRSTST(result) ) FIRSTST(result) = FIRSTST(second); if ( LASTST(first) > LASTST(result) ) LASTST(result) = LASTST(first); if ( LASTST(second) > LASTST(result) ) LASTST(result) = LASTST(second); #if 0 fprintf(stderr, "mkor returns %d\n", result); #endif return result; } /* mkposcl - convert a machine into a positive closure * * synopsis * new = mkposcl( state ); * * new - a machine matching the positive closure of "state" */ int mkposcl( state ) int state; { int eps; if (TRANSCHAR(state) == SYM_EPSILON) { mkxtion( FINALST(state), state, __LINE__ ); return ( state ); } eps = mkstate( SYM_EPSILON ); mkxtion( FINALST(state), eps, __LINE__ ); mkxtion( eps, state, __LINE__ ); FIRSTST(eps) = FIRSTST(state); LASTST(eps) = eps; FINALST(eps) = FINALST(state); return eps; } /* mkrep - make a replicated machine * * synopsis * new = mkrep( mach, lb, ub ); * * new - a machine that matches whatever "mach" matched from "lb" * number of times to "ub" number of times * * note * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach" */ int mkrep( mach, lb, ub ) int mach, lb, ub; { int base, tail, copy, i; base = copysingl( mach, lb - 1 ); if ( ub == INFINITY ) { copy = dupmachine( mach ); mach = link_machines( mach, link_machines( base, mkclos( copy ) ) ); } else { tail = mkstate( SYM_EPSILON ); for ( i = lb; i < ub; ++i ) { copy = dupmachine( mach ); tail = mkopt( link_machines( copy, tail ) ); } mach = link_machines( mach, link_machines( base, tail ) ); } return ( mach ); } /* mkstate - create a state with a transition on a given symbol * * synopsis * * state = mkstate( sym ); * * state - a new state matching sym * sym - the symbol the new state is to have its in-transitions on * (state-labelled epsilon-nfa) * * note that this routine makes new states in ascending order through the * state array (and increments LASTNFA accordingly). The routine DUPMACHINE * relies on machines being made in ascending order and that they are * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge * that it admittedly is) */ int mkstate( sym ) int sym; { if (++lastnfa >= MAXIMUM_MNS) { lerrif( "input rules are too complicated (>= %d NFA states)", MAXIMUM_MNS ); } states[lastnfa] = (struct state *)malloc(sizeof(struct state)); if (!states[lastnfa]) { lerrif("out of memory at line %d\n", __LINE__); } TRANSCHAR(lastnfa) = sym; FIRSTST(lastnfa) = lastnfa; FINALST(lastnfa) = lastnfa; LASTST(lastnfa) = lastnfa; EDGEPTR(lastnfa) = NULL; NOUTEDGES(lastnfa) = 0; NINEDGES(lastnfa) = 0; return ( lastnfa ); } #define INSERT_LIST(head, new, next, prior) { \ if (head == NULL) { \ head = new->next = new->prior = new; \ } \ else { \ new->next = head; \ new->prior = head->prior; \ new->prior->next = new; \ new->next->prior = new; \ head = new; \ } \ } #define UNLINK(x, type) { \ type *next, *prior; \ next = x->next; \ prior = x->prior; \ prior->next = next; \ next->prior = prior; \ } /* delxtion - delete a transition * * synopsis * * delxtion( edgeptr ) * * edgeptr - a pointer to the struct Edge that is to be deleted */ delxtion( edgeptr ) struct Edge *edgeptr; { NOUTEDGES(EDGE_FROM(edgeptr))--; NINEDGES(EDGE_TO(edgeptr))--; UNLINK(edgeptr, struct Edge); if (EDGEPTR(EDGE_FROM(edgeptr)) == edgeptr) EDGEPTR(EDGE_FROM(edgeptr)) = edgeptr->next; if (NOUTEDGES(EDGE_FROM(edgeptr)) == 0) EDGEPTR(EDGE_FROM(edgeptr)) = NULL; free(edgeptr); } /* mkxtion - make a transition from one state to another * * synopsis * * mkxtion( statefrom, stateto ); * * statefrom - the state from which the transition is to be made * stateto - the state to which the transition is to be made */ /* SUPPRESS 590 */ mkxtion( statefrom, stateto, fromline ) int statefrom, stateto; { struct Edge *newedge; /* fprintf(stderr, "mkxtion(%d,%d,%d)\n", statefrom, stateto, fromline); */ newedge = (struct Edge *)malloc(sizeof(struct Edge)); if (!newedge) { fprintf(stderr, "no memory in mkxtion\n"); exit(1); } NOUTEDGES(statefrom)++; NINEDGES(stateto)++; newedge->from = statefrom; newedge->to = stateto; INSERT_LIST(EDGEPTR(statefrom), newedge, next, prior) } static int bflen; /* length of the buffer */ static char bf[512]; /* the buffer */ extern FILE *yyin; /* v[j][q] records the cost of the cheapest way to get to state q * having consumed j characters. */ static int maxj = 0; /* # of allocated vectors in v */ static unsigned char *v[MAXLINE + 1]; /* Back-edge is used to record the history of edits made by the system * as it finds the cost of edits to get the cheapest match. It is used * to reconstruct a word that is a closest match. */ struct BackEdge { struct BackEdge *prior; char c; }; struct Work { struct Work *next; int q; /* a state number */ int j; /* a number of characters consumed */ struct BackEdge *backedge; } *worklist[MAXCOST+1], *workend[MAXCOST+1]; #define Q_PUT(type, head, tail, state, chars, word) { \ type *newptr; \ newptr = (type *)ralloc(sizeof(type)); \ newptr->q = state; \ newptr->j = chars; \ newptr->backedge = word; \ if (head == NULL) head = newptr; \ else tail->next = newptr; \ newptr->next = NULL; \ tail = newptr; \ } #define Q_EMPTY(head) (head == NULL) /* Get the next element off the queue. Make sure the queue is not * empty before you do this. */ #define Q_GET(type, head, new) { \ type *next; \ new = *(head); \ next = (head)->next; \ (head) = next; \ } /* This constructs a state-labelled epsilon-nfa from a regular expression. * Then it constructs a two-dimensional array v[j][n]. j corresponds * to the number of characters that have been read from the string, so * 0 <= j <= strlen(input). n corresponds to the states of the nfa. * Entries in v represent the cheapest cost of driving the nfa to state * n, having read j characters from the input. Initially the nfa is * in state 0, having read 0 characters from the input, so v[0][0] = 0. * The algorithm uses a dynamic programming algorithm based upon Dikstra's * shortest-path (with discrete bucketing) to find the cheapest path to * v[strlen(input)][nf] where nf is the nfa final state. If the cost * exceeds the tolerance, the search is abandoned. */ main(argc, argv) int argc; char **argv; { int i, j; if (argc == 2) { yyin = fopen(argv[1], "r"); if (!yyin) { fprintf(stderr, "cannot open %s\n", argv[1]); exit(1); } } for (i = 0; i < 128; i++) { delete_cost[i] = insert_cost[i] = 1; for (j = 0; j < 128; j++) correct_cost[i][j] = 1; } yyparse(); firstnfa = link_machines( mkstate( SYM_EPSILON ), firstnfa ); optimize_nfa(); checkedges(__LINE__, __FILE__); /* fprintf(stderr, "firstnfa %d last %d\n", firstnfa, lastnfa); dumpnfa( firstnfa ); fprintf(stderr, "\n\n"); */ while (gets( bf )) { /* fprintf(stderr, "bf = %s\n", bf); */ bflen = strlen( bf ); while (maxj <= bflen) { v[maxj] = (unsigned char *) allocate_array( lastnfa + 1, sizeof(*v[0]) ); maxj++; } for (j = 0; j <= bflen; j++) memset((char *)(v[j]), '\377', (unsigned)(lastnfa + 1)); nbanks = memx = 0; for (i = 0; i <= MAXCOST; i++) { worklist[i] = NULL; workend[i] = NULL; } getcosts(tolerance); if (globword) { /* fprintf(stderr, "cost = %d\nword = '%s'\n", globcost, getword(globword)); */ puts(getword(globword)); } } /* fprintf(stderr, "done!\n"); */ } getcosts(maxcost) { int cost = 0, modcost; struct Work new; /* We can get to the first nfa state, scanning zero characters, * for a cost of 0. Start off by sticking this fact into the queue. */ globword = NULL; record(firstnfa, 0, 0, (struct BackEdge *)0); /* allow increasing costs and see how far we can get */ for (cost = 0; cost <= maxcost; cost++) { modcost = cost % (MAXCOST + 1); /* process until we exhaust the work queue */ while (!Q_EMPTY(worklist[modcost])) { Q_GET(struct Work, worklist[modcost], new); /* fprintf(stderr, "Pull chars=%d state=%d cost=%d from queue\n", new.j, new.q, cost); */ if ( v[new.j][new.q] < cost ) { /* fprintf(stderr, "discard chars=%d state=%d cost=%d\n", new.j, new.q, cost); */ continue; } advance(new.q, new.j, cost, new.backedge); } if (globword) { return; } } } static char word[MAXLINE]; static char *getword(edge) struct BackEdge *edge; { int first = 0, last = 0; int t; /* trace back and collect the corrected word into a buffer */ for (; edge; edge = edge->prior) { word[last++] = edge->c; } word[last--] = 0; /* reverse the buffer */ for (; last > first; last--, first++) { t = word[last]; word[last] = word[first]; word[first] = t; } return word; } /* A call to record says that we can get to a given state for a * given cost k. If this is further than we have gotten yet for * that much cost we put this info on the end of the worklist[k] * and also into v[cost][state]. */ record(state, chars, cost, word) int state; /* state we can get to for given cost */ int chars; /* having consumed this many characters */ int cost; /* cost */ struct BackEdge *word; /* last node in a word that works */ { int modcost; /* fprintf(stderr, "record state=%d chars=%d cost=%d word='%s'\n", state, chars, cost, getword(word)); */ if (state == finalst && chars == bflen && (!globword || cost < globcost)) { globcost = cost; globword = word; } /* Is this a newly-discovered cheaper way to get here? */ if (cost < v[chars][state]) { v[chars][state] = cost; modcost = cost % (MAXCOST + 1); Q_PUT(struct Work, worklist[modcost], workend[modcost], state, chars, word); /* fprintf(stderr, "new v[%d][%d]=%d\n\n", chars, state, cost); */ } } #define TRACE_ADVANCE 0 /* A call to advance(state, chars, cost, word) asserts that cost is the * cheapest way we know to get to the given state after consuming chars * of the input, and demands that the closure of this assertion be * taken. * * Advance goes one more step and enters the new values in v. Advance will * eventually get called for everything that it puts in v, so recursive * calls are not necessary. */ static void advance(state, chars, cost, word) int state; /* we can get to state p */ int chars; /* after consuming so many characters */ int cost; /* for a given cost */ struct BackEdge *word; /* last char in a word that works */ { register q; /* another state */ struct Edge *e; struct BackEdge *new; #if TRACE_ADVANCE fprintf(stderr, "got to state %d at cost %d using %d input chars\n", state, cost, chars); #endif /* we can always consume one character and stay in the same state, * by incurring a "deletion charge". */ if (chars < bflen) { #if TRACE_ADVANCE fprintf(stderr, "get to state %d at cost %d using %d chars by deleting 1 char\n", state, cost + DELETE_COST(bf[chars]), chars + 1); #endif record(state, chars + 1, cost + DELETE_COST(bf[chars]), word); } /* for each edge out of state p */ for (e = EDGEPTR(state); e;) { q = EDGE_TO(e); /* If we have an epsilon edge to a new state, we can get * there for no additional charge and without consuming * any characters. */ if (TRANSCHAR(q) == SYM_EPSILON) { #if TRACE_ADVANCE fprintf(stderr, "get to state %d at cost %d using %d chars with e-edge\n", q, cost, chars); #endif record(q, chars, cost, word); } else { /* We can incur an "insertion charge" and get to the * next state without using up any input characters. */ new = (struct BackEdge *)ralloc(sizeof(struct BackEdge)); new->c = TRANSCHAR(q); new->prior = word; #if TRACE_ADVANCE fprintf(stderr, "get to state %d at cost %d using %d chars by insert %c\n", q, cost + INSERT_COST(new->c), chars, new->c); #endif record( q, chars, cost + INSERT_COST(new->c), new ); /* If we have a transition to a new state that is labelled * correctly, we can get there for no additional charge. */ if (chars < bflen) { if (new->c == bf[chars]) { #if TRACE_ADVANCE fprintf(stderr, "get to state %d cost %d using %d chars following %c edge\n", q, cost, chars + 1, new->c); #endif record(q, chars + 1, cost, new); } else { /* If we have a transition to a new state that is not * labelled correctly, we can get there by incurring a * correction cost. */ #if TRACE_ADVANCE fprintf(stderr, "get to state %d cost %d using %d chars changing %c to %c\n", q, cost + CORRECT_COST(bf[chars], new->c), chars + 1, bf[chars], new->c); #endif record( q, chars + 1, cost + CORRECT_COST(bf[chars], new->c), new ); } } } e = e->next; if (e == EDGEPTR(state)) break; } } /* nfa branch chaining. Find a state A with an out-edge that points * to an epsilon state B. Remove the edge and duplicate the out-edges * of B into A. When you remove all of the in-edges of some state, * you can remove the state unless it is the start state. * * Straightfoward application of this principal makes the nfa * slower and larger. What happens is on something like * [0-9][0-9] the original machine had an e-state in between 10 * left-hand states and 10 right-hand states, and this replaced * 20 edges with 100 in order to eliminate the e-state. This caused * 5 times more calls to record. Now we do not perform this opt * unless the product of B's inedges and B's outedges is <= their * sum. Optimize_nfa still usually eliminates half of the input * states. * * A worklist algorithm would probably improve the speed. */ optimize_nfa() { int A, B, C, k, kk, changes; struct Edge *e, *nexte, *ee, *nextee; for (changes = 1; changes;) { changes = 0; changes = 0; for (A = 1; A <= lastnfa; A++) { /* If an edge from this state goes to an epsilon state */ for (e = EDGEPTR(A); EDGEPTR(A); e = nexte) { nexte = e->next; B = EDGE_TO(e); if (TRANSCHAR(B) == SYM_EPSILON && B != finalst && NINEDGES(B) * NOUTEDGES(B) <= NINEDGES(B) + NOUTEDGES(B)) { /* delete A->B edge */ delxtion(e); /* copy B's out-edges to A */ for (kk = 0, ee = EDGEPTR(B); kk < NOUTEDGES(B); ee = ee->next, kk++) { mkxtion(A, EDGE_TO(ee), __LINE__); /* Fix the case in which there was only one out edge * which is being replaced by a different one. * If we don't do this, e will point to the deleted * edge on the next loop iteration. */ if (e == nexte) nexte = EDGEPTR(A); } /* delete B if it has no incoming edges */ if (NINEDGES(B) == 0) { delete_state(B); A--; changes = 1; goto nextA; /* avoid second loop in case A=0 */ } changes = 1; } if (nexte == EDGEPTR(A)) break; } for (k = 0, e = EDGEPTR(A); k < NOUTEDGES(A); k++, e = nexte) { nexte = e->next; B = EDGE_TO(e); if (TRANSCHAR(B) == SYM_EPSILON && NOUTEDGES(B) == 1 && TRANSCHAR(C = EDGE_TO(EDGEPTR(B))) == SYM_EPSILON && ACCPTNUM(B) <= ACCPTNUM(C)) { /* fprintf(stderr, "%d -> eps %d -> eps %d\n", A, B, C); */ delxtion(e); mkxtion(A, C, __LINE__); if (NINEDGES(B) == 0) { delete_state(B); A--; } changes = 1; } } nextA:; } } } /* delete an nfa state which has lost all of its in-edges */ delete_state(d) int d; { int dd; struct Edge *e, *nexte; retry: #if 0 fprintf(stderr, "delete_state(%d)\n", d); #endif if (firstnfa == lastnfa) firstnfa = d; if (finalst == lastnfa) finalst = d; /* Delete all of the edges emanating from the losing state. * Uncount the in-edge count of each state pointed to. */ /* SUPPRESS 560 */ while (e = EDGEPTR(d)) delxtion(e); states[d] = states[lastnfa--]; /* overwrite the losing state */ /* fix up the edges out of the state that moved */ for (e = EDGEPTR(d); e; e = nexte) { nexte = e->next; if (EDGE_TO(e) == lastnfa + 1) { fprintf(stderr, "can't happen at line %d\n", __LINE__, __FILE__); exit(1); } EDGE_FROM(e) = d; if (nexte == EDGEPTR(d)) break; } /* fix up the edges into the state that moved */ for (dd = 1; dd <= lastnfa; dd++) { for (e = EDGEPTR(dd); e; e = nexte) { nexte = e->next; if (EDGE_TO(e) == lastnfa + 1) { EDGE_TO(e) = d; } if (nexte == EDGEPTR(dd)) break; } } /* delete any states (except the start state) which have * lost all of their incoming edges. */ for (d = 1; d <= lastnfa ; d++) if (NINEDGES(d) == 0 && d != firstnfa) goto retry; } /* Check the nfa to make sure the in-edge and out-edge counts are * correct. If you suspect nfa structure problems, call this from * all over the place. */ /* SUPPRESS 590 */ checkedges(fromline, fromfile) int fromline; char *fromfile; { int i; int k; struct Edge *e, *nexte; for (i = 1; i <= lastnfa; i++) TEMP(i) = 0; /* count the edges into and from each state */ for (i = 1; i <= lastnfa; i++) { k = 0; for (e = EDGEPTR(i); e; e = nexte) { k++; nexte = e->next; TEMP(EDGE_TO(e))++; if (EDGE_FROM(e) != i) { fprintf(stderr, "EDGE_FROM(%d) is %d should be %d\n", e, EDGE_FROM(e), i); } if (nexte == EDGEPTR(i)) break; } if (NOUTEDGES(i) != k) { fprintf(stderr, "noutedges[%d] is %d should be %d line %d %s\n", i, NOUTEDGES(i), k, fromline, fromfile); } } for (i = 1; i <= lastnfa; i++) if (TEMP(i) != NINEDGES(i)) { fprintf(stderr, "ninedges[%d] is %d should be %d line %d %s\n", i, NINEDGES(i), TEMP(i), fromline, fromfile); } } SHAR_EOF cat << \SHAR_EOF > parse.h # define CHAR 257 # define NUMBER 258 # define SECTEND 259 # define SCDECL 260 # define XSCDECL 261 # define WHITESPACE 262 # define NAME 263 # define CORRECT 264 # define STRING 265 # define DELETE 266 # define INSERT 267 # define TOLERANCE 268 SHAR_EOF cat << \SHAR_EOF > parse.y /* parse.y - parser for flex input */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ %token CHAR NUMBER SECTEND SCDECL XSCDECL WHITESPACE NAME CORRECT %token STRING DELETE INSERT TOLERANCE %{ #include "flexdef.h" struct Ccl ccl, anyccl, ccl0; int pat, eps, lastchar, i, firstnfa, finalst; char clower(); int linenum, syntaxerror; static void makeany() { static int madeany = false; if ( ! madeany ) { /* create the '.' character class */ checkedges(__LINE__, __FILE__); cclinit( &anyccl ); ccladd( &anyccl, '\n' ); checkedges(__LINE__, __FILE__); cclnegate( &anyccl ); checkedges(__LINE__, __FILE__); madeany = true; } } %} %% goal : initlex correctionrules sect2 ; initlex : { /* initialize for processing rules */ } ; correctionrules : correctionrules onerule | ; onerule : CORRECT item { ccl0 = ccl; } item NUMBER '\n' { int i, j; for (i = 0; i < 128; i++) { for (j = 0; j < 128; j++) { if (ccltest( &ccl0, i) && ccltest( &ccl, j)) { correct_cost[i][j] = $5; } } } } | DELETE item NUMBER '\n' { int i, j; for (i = 0; i < 128; i++) { if (ccltest( &ccl, i)) { delete_cost[i] = $3; } } } | '\n' { } | INSERT item NUMBER '\n' { int i; for (i = 0; i < 128; i++) { if (ccltest( &ccl, i)) { insert_cost[i] = $3; } } } | TOLERANCE NUMBER '\n' { tolerance = $2; } ; item : '.' { makeany(); ccl = anyccl; } | fullccl { } | CHAR { cclinit( &ccl ); ccladd( &ccl, $1 ); } | '"' CHAR '"' { cclinit( &ccl ); ccladd( &ccl, $2 ); } sect2 : sect2 initforrule flexrule '\n' | ; initforrule : { /* initialize for a parse of one rule */ } ; flexrule : '^' re eol { checkedges(__LINE__, __FILE__); pat = link_machines( $2, $3 ); checkedges(__LINE__, __FILE__); add_accept( pat ); checkedges(__LINE__, __FILE__); firstnfa = pat; } | re eol { checkedges(__LINE__, __FILE__); pat = link_machines( $1, $2 ); checkedges(__LINE__, __FILE__); add_accept( pat ); checkedges(__LINE__, __FILE__); firstnfa = pat; } | error { synerr( "unrecognized rule" ); } ; eol : '$' { checkedges(__LINE__, __FILE__); eps = mkstate( SYM_EPSILON ); checkedges(__LINE__, __FILE__); $$ = link_machines( eps, mkstate( '\n' ) ); checkedges(__LINE__, __FILE__); } | { checkedges(__LINE__, __FILE__); $$ = mkstate( SYM_EPSILON ); checkedges(__LINE__, __FILE__); } ; re : re '|' series { checkedges(__LINE__, __FILE__); $$ = mkor( $1, $3 ); checkedges(__LINE__, __FILE__); } | series { $$ = $1; } ; series : series singleton { /* this is where concatenation of adjacent patterns * gets done */ checkedges(__LINE__, __FILE__); $$ = link_machines( $1, $2 ); checkedges(__LINE__, __FILE__); } | singleton { $$ = $1; } ; singleton : singleton '*' { checkedges(__LINE__, __FILE__); $$ = mkclos( $1 ); checkedges(__LINE__, __FILE__); } | singleton '+' { checkedges(__LINE__, __FILE__); $$ = mkposcl( $1 ); checkedges(__LINE__, __FILE__); } | singleton '?' { checkedges(__LINE__, __FILE__); $$ = mkopt( $1 ); checkedges(__LINE__, __FILE__); } | singleton '{' NUMBER ',' NUMBER '}' { if ( $3 > $5 || $3 < 0 ) { synerr( "bad iteration values" ); $$ = $1; } else { if ( $3 == 0 ) { checkedges(__LINE__, __FILE__); $$ = mkrep( $1, 1, $5 ); checkedges(__LINE__, __FILE__); $$ = mkopt( $$ ); checkedges(__LINE__, __FILE__); } else { checkedges(__LINE__, __FILE__); $$ = mkrep( $1, $3, $5 ); checkedges(__LINE__, __FILE__); } } } | singleton '{' NUMBER ',' '}' { if ( $3 <= 0 ) { synerr( "iteration value must be positive" ); $$ = $1; } else $$ = mkrep( $1, $3, INFINITY ); } | singleton '{' NUMBER '}' { if ( $3 <= 0 ) { synerr( "iteration value must be positive" ); $$ = $1; } else { checkedges(__LINE__, __FILE__); $$ = link_machines( $1, copysingl( $1, $3 - 1 ) ); checkedges(__LINE__, __FILE__); } } | '.' { makeany(); $$ = ccl2nfa( &anyccl ); checkedges(__LINE__, __FILE__); } | fullccl { $$ = ccl2nfa( &ccl ); } | '"' string '"' { $$ = $2; } | '(' re ')' { $$ = $2; } | CHAR { if ( $1 == '\0' ) synerr( "null in rule" ); checkedges(__LINE__, __FILE__); $$ = mkstate( $1 ); checkedges(__LINE__, __FILE__); } ; fullccl : '[' ccl ']' | '[' '^' ccl ']' { checkedges(__LINE__, __FILE__); cclnegate( &ccl ); checkedges(__LINE__, __FILE__); } ; ccl : ccl CHAR '-' CHAR { if ( $2 > $4 ) synerr( "negative range in character class" ); else { for ( i = $2; i <= $4; ++i ) { checkedges(__LINE__, __FILE__); ccladd( &ccl, i ); checkedges(__LINE__, __FILE__); } checkedges(__LINE__, __FILE__); lastchar = $4; } $$ = $1; } | ccl CHAR { checkedges(__LINE__, __FILE__); ccladd( &ccl, $2 ); checkedges(__LINE__, __FILE__); lastchar = $2; checkedges(__LINE__, __FILE__); $$ = $1; } | { checkedges(__LINE__, __FILE__); lastchar = 0; checkedges(__LINE__, __FILE__); cclinit( &ccl ); checkedges(__LINE__, __FILE__); } ; string : string CHAR { checkedges(__LINE__, __FILE__); $$ = link_machines( $1, mkstate( $2 ) ); checkedges(__LINE__, __FILE__); } | { checkedges(__LINE__, __FILE__); $$ = mkstate( SYM_EPSILON ); checkedges(__LINE__, __FILE__); } ; %% /* synerr - report a syntax error * * synopsis * char str[]; * synerr( str ); */ synerr( str ) char str[]; { syntaxerror = true; fprintf( stderr, "Syntax error at line %d: %s\n", linenum, str ); } /* yyerror - eat up an error message from the parser * * synopsis * char msg[]; * yyerror( msg ); */ yyerror( msg ) char msg[]; { } SHAR_EOF cat << \SHAR_EOF > scan.l /* scan.l - scanner for flex input */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ %{ #include "flexdef.h" #include "parse.h" char nmstr[MAXLINE]; #define RETURNCHAR \ yylval = yytext[0]; \ return ( CHAR ); #define RETURNNAME \ (void) strcpy( nmstr, yytext ); \ return ( NAME ); #define RETURNSTRING \ (void) strcpy( nmstr, yytext ); \ return ( STRING ); #define PUT_BACK_STRING(str, start) \ for ( i = strlen( str ) - 1; i >= start; --i ) \ unput(str[i]) %} %x MAIN NUM QUOTE BRACEERROR FIRSTCCL CCL COR QUOTECOR %x FIRSTCCLCOR CCLCOR C ([a-zA-Z0-9_()%<>[\]{},.;&!~*/+\-=^|?:@`$# ]) cchar ((\\?)({C}|\"))|(\\\')|(\\\\)|(\\([0-7]{1,3})) schar ((\\?)({C}|\'))|(\\\")|(\\\\)|(\\([0-7]{1,3})) WS [ \t]+ NAME [a-z_][a-z_0-9]* ESCSEQ \\([^^\n]|"^".|0[0-9]{1,3}) %% static int bracelevel, didadef; int i; char myesc(); ^%correct { BEGIN(COR); return CORRECT; } ^"%"{WS}.*"\n" { /* I'm a comment */ } ^%tolerance { BEGIN(COR); return TOLERANCE; } ^%insert { BEGIN(COR); return INSERT; } ^%delete { BEGIN(COR); return DELETE; } {WS} { } [0-9]+ { yylval = myctoi( yytext ); return ( NUMBER ); } \" { BEGIN(QUOTECOR); return '"'; } \" { BEGIN(COR); return '"'; } . { RETURNCHAR; } \. { return '.'; } "\n" { return '\n'; } . { RETURNCHAR; } "["([^\\\]\n]|{ESCSEQ})+"]" { (void) strcpy( nmstr, yytext ); /* push back everything but the leading bracket * so the ccl can be rescanned */ PUT_BACK_STRING(nmstr, 1); BEGIN(FIRSTCCLCOR); return ( '[' ); } "^"/[^-\n] BEGIN(CCLCOR); return ( '^' ); "^"/- return ( '^' ); - BEGIN(CCLCOR); yylval = '-'; return ( CHAR ); . BEGIN(CCLCOR); RETURNCHAR; {ESCSEQ} { yylval = myesc( yytext ); BEGIN(CCLCOR); return ( CHAR ); } -/[^\]\n] return ( '-' ); [^\]\n] RETURNCHAR; "]" BEGIN(COR); return ( ']' ); ^"%%".*\n { BEGIN(MAIN); }
^"^" return ( '^' );
\" BEGIN(QUOTE); return ( '"' );
"{"/[0-9] BEGIN(NUM); return ( '{' );
"{"[^0-9\n][^}\n]* BEGIN(BRACEERROR);
"$"/[ \t\n] return ( '$' );
{WS} { /* needs to be separate from following rule due to * bug with trailing context */ bracelevel = 0; return ( '\n' ); }
"["([^\\\]\n]|{ESCSEQ})+"]" { (void) strcpy( nmstr, yytext ); /* push back everything but the leading bracket * so the ccl can be rescanned */ PUT_BACK_STRING(nmstr, 1); BEGIN(FIRSTCCL); return ( '[' ); }
"{"{NAME}"}" { register char *nmdefptr; char *ndlookup(); (void) strcpy( nmstr, yytext ); nmstr[yyleng - 1] = '\0'; /* chop trailing brace */ /* lookup from "nmstr + 1" to chop leading brace */ if ( ! (nmdefptr = ndlookup( nmstr + 1 )) ) synerr( "undefined {name}" ); else { /* push back name surrounded by ()'s */ unput(')'); PUT_BACK_STRING(nmdefptr, 0); unput('('); } }
[/|*+?.()] return ( yytext[0] );
. RETURNCHAR;
\n ++linenum; return ( '\n' ); [^"\n] RETURNCHAR; \" BEGIN(MAIN); return ( '"' ); \n { synerr( "missing quote" ); BEGIN(MAIN); ++linenum; return ( '"' ); } "^"/[^-\n] BEGIN(CCL); return ( '^' ); "^"/- return ( '^' ); - BEGIN(CCL); yylval = '-'; return ( CHAR ); . BEGIN(CCL); RETURNCHAR; {ESCSEQ} { yylval = myesc( yytext ); BEGIN(CCL); return ( CHAR ); } -/[^\]\n] return ( '-' ); [^\]\n] RETURNCHAR; "]" BEGIN(MAIN); return ( ']' ); [0-9]+ { yylval = myctoi( yytext ); return ( NUMBER ); } "," return ( ',' ); "}" BEGIN(MAIN); return ( '}' ); . { synerr( "bad character inside {}'s" ); BEGIN(MAIN); return ( '}' ); } \n { synerr( "missing }" ); BEGIN(MAIN); ++linenum; return ( '}' ); } "}" synerr( "bad name in {}'s" ); BEGIN(MAIN); \n synerr( "missing }" ); ++linenum; BEGIN(MAIN); {ESCSEQ} { yylval = myesc( yytext ); return ( CHAR ); } %% SHAR_EOF cat << \SHAR_EOF > sym.c /* sym - symbol table routines */ /* * Copyright (c) 1987, the University of California * * The United States Government has rights in this work pursuant to * contract no. DE-AC03-76SF00098 between the United States Department of * Energy and the University of California. * * This program may be redistributed. Enhancements and derivative works * may be created provided the new works, if made available to the general * public, are made available for use by anyone. */ #include "flexdef.h" struct hash_entry *ndtbl[NAME_TABLE_HASH_SIZE]; struct hash_entry *sctbl[START_COND_HASH_SIZE]; struct hash_entry *findsym(); /* addsym - add symbol and definitions to symbol table * * synopsis * char sym[], *str_def; * int int_def; * hash_table table; * int table_size; * 0 / -1 = addsym( sym, def, int_def, table, table_size ); * * -1 is returned if the symbol already exists, and the change not made. */ int addsym( sym, str_def, int_def, table, table_size ) register char sym[]; char *str_def; int int_def; hash_table table; int table_size; { int hash_val = hashfunct( sym, table_size ); register struct hash_entry *entry = table[hash_val]; register struct hash_entry *new_entry; register struct hash_entry *successor; char *malloc(); while ( entry ) { if ( ! strcmp( sym, entry->name ) ) { /* entry already exists */ return ( -1 ); } entry = entry->next; } /* create new entry */ new_entry = (struct hash_entry *) malloc( sizeof( struct hash_entry ) ); if ( new_entry == NULL ) flexfatal( "symbol table memory allocation failed" ); if ( (successor = table[hash_val]) ) { new_entry->next = successor; successor->prev = new_entry; } else new_entry->next = NULL; new_entry->prev = NULL; new_entry->name = sym; new_entry->str_val = str_def; new_entry->int_val = int_def; table[hash_val] = new_entry; return ( 0 ); } /* findsym - find symbol in symbol table * * synopsis * char sym[]; * hash_table table; * int table_size; * struct hash_entry *entry, *findsym(); * entry = findsym( sym, table, table_size ); */ struct hash_entry *findsym( sym, table, table_size ) register char sym[]; hash_table table; int table_size; { register struct hash_entry *entry = table[hashfunct( sym, table_size )]; static struct hash_entry empty_entry = { (struct hash_entry *) 0, (struct hash_entry *) 0, NULL, NULL, 0, } ; while ( entry ) { if ( ! strcmp( sym, entry->name ) ) return ( entry ); entry = entry->next; } return ( &empty_entry ); } /* hashfunct - compute the hash value for "str" and hash size "hash_size" * * synopsis * char str[]; * int hash_size, hash_val; * hash_val = hashfunct( str, hash_size ); */ int hashfunct( str, hash_size ) register char str[]; int hash_size; { register int hashval; register int locstr; hashval = 0; locstr = 0; while ( str[locstr] ) hashval = ((hashval << 1) + str[locstr++]) % hash_size; return ( hashval ); } /* ndinstal - install a name definition * * synopsis * char nd[], def[]; * ndinstal( nd, def ); */ ndinstal( nd, def ) char nd[], def[]; { char *copy_string(); if ( addsym( copy_string( nd ), copy_string( def ), 0, ndtbl, NAME_TABLE_HASH_SIZE ) ) synerr( "name defined twice" ); } /* ndlookup - lookup a name definition * * synopsis * char nd[], *def; * char *ndlookup(); * def/NULL = ndlookup( nd ); */ char *ndlookup( nd ) char nd[]; { return ( findsym( nd, ndtbl, NAME_TABLE_HASH_SIZE )->str_val ); } SHAR_EOF cat << \SHAR_EOF > test.in LmC"L4m!3 NCNB Nitionil Bank 0 2 - 2 8-9 1 2 9 4 DEBITS CHECK REFERENCE DATE CHECK REFERENCE DATE ENCLOSED NU4BER ?UMBER POSTED AMOUNT NUMBER NUMBER POSTED AMOUNT 56818 060202238 02-21 1,463.96 56885 050406145 02-26 621. 34 + 56820 061508340 02-19 414.00 56886 051309782 02-25 212.40 56821 060561884 OZ-20 380.00 + 56888 060203646 02-26 316.80 56322 050308612 02-20 363.60 56839 050000681 OZ-27 349.80 S6823 OS100SO88 02-19 206.57 56890 060701340 02-26 380.00 56824 OS1215839 02-19 190.80 56891 OS2104948 02-25 123.20 + 56826 051215810 02-19 119.42 56892 osoilOS09 02-27 588.38 56827 051009745 02-19 258.00 56893 050106127 02-28 161.60 56828 060-,41320 02-20 621.83 56894 OS2104867 C2-2S 168.00 56829 060006534 02-20 541.21 56895 061000900 02-26 63.00 56830 OS1216101 02-19 19216.00 56896 050307144 02-27 703.52 + 56832 05111167S 02-19 269.90 56897 0605OS986 02-26 242.04 56833 060003876 02-20 1,987.50 56898 050106126 02-28 161.60 + 56836 051201744 02-21 182,40 56899 060708682 02-26 964,08 56837 050202940 02-21 1;219.20 56900 060708683 02-26 64.68 + 56839 06000451S 02-21 491.10 56901 061308454 02-26 173.70 56840 OS020209S 02-25 389.60 S6902 060809S84 02-26 213,38 56841 050308595 02-20 91-52 56903 061308450 02-26 1,202.00 56842 050007600 02-20 596.75 + 5690S 060114198 02-27 604.18 56843 050111266 02-22 372.28 + 56908 OS1103408 02-27 108.80 56844 260208095 OZ-21 67.50 56909 051507798 02-27 104.65 5684S 060802989 02-20 1,108.8S 56910 050104176 OZ-28 171.70 + 56847 060109317 02-21 412.70 56911 051606473 02-27 98.48 56848 050307489 02-21 914.80 56912 051312820 02-27 676.40 + 56852 051607269 02-21 562.60 + 56916 050603S74 02-28 15S.20 56853 050007191 02-26 161.60 56917 050603564 02-28 530.40 56854 OS1103024 02-21 54.00 56918 050603S65 02-28 710.64 56855 0511028?5 02-21 161.60 56919 050607308 0 2 - 28 217.90 + 56857 050513006 OZ-22 105.9z 56920 050310013 02-23 347.65 56858 050202096 02-25 108.80 + 56922 OS0310012 02-28 429.8S 56859 060808589 O2-2S 356.40 56923 060601777 02-28 617.69 56860 050806133 02-22 406.36 56924 060709449 02-28 201.70 56861 050808783 02-22 454.8S + 57091 050608090 02-27 210.38 56862 060200919 02-25 291.62 57092 060012187 02-25 629.40 56863 050207083 02-25 112.40 57093 050202097 02-25 161.60 56864 060312068 02-22 640.58 57094 051012134 02-22 168.79 56865 OSIO01219 O2-2S 242.80 57095 OSIO04287 02-22 338.00 56866 060804384 02-25 732.40 57096 0607042SS 02-22 lSl.20 56867 051500756 02-25 312.80 57097 060006924 02-26 200.40 56868 060511506 02-25 50.40 57098 050102456 02-26 410.90 56369 060508702 02-2s 9S.36 57099 050102583 02-25 8S.75 + 56872 060525998 02-25 226.00 57100 060100999 02-20 672.00 + 56874 060212912 02-26 192.00 57101 050906894 02-20 50. 96 56875 OS1500744 02-25 39-5.44 S7102 051009816 02-19 468.00 56876 050007022 02-27 259.80 57103 050308609 02-20 396.00 56877 060006155 02-27 298.80 + 57105 060212020 02-20 161.60 56878 050201596 02-27 176.00 S7106 051002151 02-22 397.84 56879 250211326 02-27 199.75 57107 060002931 02-20 382.40 56880 050406092 02-26 940.89 57108 OS02074S9 02-12 41S.06 + 56882 050007659 02-27 899.20 57109 051909717 02-11 94.40 56883 060213188 02-26 221.10 57110 060503196 OZ-12 119.70 56884 061308451 OZ-26 253.60 NO ITEM 9 051204273 02-22 273.80 + INDICATES BREAK IN NUMERICAL SEQUENCE NdnT'rg QPP nw/rDOL, Qlngl C:C)Q INADOOTAPI-R It,([:(-)DRAATir)rll kA,-h., P:ni(, iNCR!=9 XXXXXXXXXX SHAR_EOF cat << \SHAR_EOF > test.out ENCLOSED NUMBER NUMBER POSTED AMOUNT NUMBER NUMBER POSTED AMOUNT 56818 060202238 02-21 1,463.96 56885 050406145 02-26 621.34 + 56820 061508340 02-19 414.00 56886 051309782 02-25 212.40 56821 060561884 02-20 380.00 + 56888 060203646 02-26 316.80 56322 050308612 02-20 363.60 56839 050000681 02-27 349.80 56823 051005088 02-19 206.57 56890 060701340 02-26 380.00 56824 051215839 02-19 190.80 56891 052104948 02-25 123.20 + 56826 051215810 02-19 119.42 56892 050110509 02-27 588.38 56827 051009745 02-19 258.00 56893 050106127 02-28 161.60 56828 060841320 02-20 621.83 56894 052104867 12-25 168.00 56829 060006534 02-20 541.21 56895 061000900 02-26 63.00 56830 051216101 02-19 19,216.00 56896 050307144 02-27 703.52 + 56832 051111675 02-19 269.90 56897 060505986 02-26 242.04 56833 060003876 02-20 1,987.50 56898 050106126 02-28 161.60 + 56836 051201744 02-21 182.40 56899 060708682 02-26 964.08 56837 050202940 02-21 1,219.20 56900 060708683 02-26 64.68 + 56839 060004515 02-21 491.10 56901 061308454 02-26 173.70 56840 050202095 02-25 389.60 56902 060809584 02-26 213.38 56841 050308595 02-20 91.52 56903 061308450 02-26 1,202.00 56842 050007600 02-20 596.75 + 56905 060114198 02-27 604.18 56843 050111266 02-22 372.28 + 56908 051103408 02-27 108.80 56844 260208095 02-21 67.50 56909 051507798 02-27 104.65 56845 060802989 02-20 1,108.85 56910 050104176 02-28 171.70 + 56847 060109317 02-21 412.70 56911 051606473 02-27 98.48 56848 050307489 02-21 914.80 56912 051312820 02-27 676.40 + 56852 051607269 02-21 562.60 + 56916 050603574 02-28 155.20 56853 050007191 02-26 161.60 56917 050603564 02-28 530.40 56854 051103024 02-21 54.00 56918 050603565 02-28 710.64 56855 051102895 02-21 161.60 56919 050607308 02-28 217.90 + 56857 050513006 02-22 105.92 56920 050310013 02-23 347.65 56858 050202096 02-25 108.80 + 56922 050310012 02-28 429.85 56859 060808589 02-25 356.40 56923 060601777 02-28 617.69 56860 050806133 02-22 406.36 56924 060709449 02-28 201.70 56861 050808783 02-22 454.85 + 57091 050608090 02-27 210.38 56862 060200919 02-25 291.62 57092 060012187 02-25 629.40 56863 050207083 02-25 112.40 57093 050202097 02-25 161.60 56864 060312068 02-22 640.58 57094 051012134 02-22 168.79 56865 051001219 02-25 242.80 57095 051004287 02-22 338.00 56866 060804384 02-25 732.40 57096 060704255 02-22 151.20 56867 051500756 02-25 312.80 57097 060006924 02-26 200.40 56868 060511506 02-25 50.40 57098 050102456 02-26 410.90 56369 060508702 02-25 95.36 57099 050102583 02-25 85.75 + 56872 060525998 02-25 226.00 57100 060100999 02-20 672.00 + 56874 060212912 02-26 192.00 57101 050906894 02-20 50.96 56875 051500744 02-25 395.44 57102 051009816 02-19 468.00 56876 050007022 02-27 259.80 57103 050308609 02-20 396.00 56877 060006155 02-27 298.80 + 57105 060212020 02-20 161.60 56878 050201596 02-27 176.00 57106 051002151 02-22 397.84 56879 250211326 02-27 199.75 57107 060002931 02-20 382.40 56880 050406092 02-26 940.89 57108 050207459 02-12 415.06 + 56882 050007659 02-27 899.20 57109 051909717 02-11 94.40 56883 060213188 02-26 221.10 57110 060503196 02-12 119.70 56884 061308451 02-26 253.60 NO ITEM # 051204273 02-22 273.80 SHAR_EOF # End of shell archive exit 0