]>
Commit | Line | Data |
---|---|---|
878ddf1f | 1 | /*\r |
2 | * This file contains code for\r | |
3 | *\r | |
4 | * int rexpr(char *expr, char *s);\r | |
5 | *\r | |
6 | * which answers\r | |
7 | *\r | |
8 | * 1 if 's' is in the language described by the regular expression 'expr'\r | |
9 | * 0 if it is not\r | |
10 | * -1 if the regular expression is invalid\r | |
11 | *\r | |
12 | * Language membership is determined by constructing a non-deterministic\r | |
13 | * finite automata (NFA) from the regular expression. A depth-\r | |
14 | * first-search is performed on the NFA (graph) to check for a match of 's'.\r | |
15 | * Each non-epsilon arc consumes one character from 's'. Backtracking is\r | |
16 | * performed to check all possible paths through the NFA.\r | |
17 | *\r | |
18 | * Regular expressions follow the meta-language:\r | |
19 | *\r | |
20 | * <regExpr> ::= <andExpr> ( '|' <andExpr> )*\r | |
21 | *\r | |
22 | * <andExpr> ::= <expr> ( <expr> )*\r | |
23 | *\r | |
24 | * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>\r | |
25 | * | '(' <regExpr> ')' <repeatSymbol>\r | |
26 | * | '{' <regExpr> '}' <repeatSymbol>\r | |
27 | * | <atom> <repeatSymbol>\r | |
28 | *\r | |
29 | * <repeatSymbol> ::= { '*' | '+' }\r | |
30 | *\r | |
31 | * <atomList> ::= <atom> ( <atom> )*\r | |
32 | * | { <atomList> } <atom> '-' <atom> { <atomList> }\r | |
33 | *\r | |
34 | * <atom> ::= Token[Atom]\r | |
35 | *\r | |
36 | * Notes:\r | |
37 | * ~ means complement the set in [..]. i.e. all characters not listed\r | |
38 | * * means match 0 or more times (can be on expression or atom)\r | |
39 | * + means match 1 or more times (can be on expression or atom)\r | |
40 | * {} optional\r | |
41 | * () grouping\r | |
42 | * [] set of atoms\r | |
43 | * x-y all characters from x to y (found only in [..])\r | |
44 | * \xx the character with value xx\r | |
45 | *\r | |
46 | * Examples:\r | |
47 | * [a-z]+\r | |
48 | * match 1 or more lower-case letters (e.g. variable)\r | |
49 | *\r | |
50 | * 0x[0-9A-Fa-f]+\r | |
51 | * match a hex number with 0x on front (e.g. 0xA1FF)\r | |
52 | *\r | |
53 | * [0-9]+.[0-9]+{e[0-9]+}\r | |
54 | * match a floating point number (e.g. 3.14e21)\r | |
55 | *\r | |
56 | * Code example:\r | |
57 | * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword\r | |
58 | *\r | |
59 | * Terence Parr\r | |
60 | * Purdue University\r | |
61 | * April 1991\r | |
62 | */\r | |
63 | \r | |
64 | #include <stdio.h>\r | |
65 | #include <ctype.h>\r | |
66 | #ifdef __STDC__\r | |
67 | #include <stdlib.h>\r | |
68 | #else\r | |
69 | #include <malloc.h>\r | |
70 | #endif\r | |
71 | #include "rexpr.h"\r | |
72 | \r | |
73 | #ifdef __USE_PROTOS\r | |
74 | static int regExpr( GraphPtr g );\r | |
75 | static int andExpr( GraphPtr g );\r | |
76 | static int expr( GraphPtr g );\r | |
77 | static int repeatSymbol( GraphPtr g );\r | |
78 | static int atomList( char *p, int complement );\r | |
79 | static void next( void );\r | |
80 | static ArcPtr newGraphArc( void );\r | |
81 | static NodePtr newNode( void );\r | |
82 | static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );\r | |
83 | static Graph BuildNFA_atom( int label );\r | |
84 | static Graph BuildNFA_AB( Graph A, Graph B );\r | |
85 | static Graph BuildNFA_AorB( Graph A, Graph B );\r | |
86 | static Graph BuildNFA_set( char *s );\r | |
87 | static Graph BuildNFA_Astar( Graph A );\r | |
88 | static Graph BuildNFA_Aplus( Graph A );\r | |
89 | static Graph BuildNFA_Aoptional( Graph A );\r | |
90 | #else\r | |
91 | static int regExpr();\r | |
92 | static int andExpr();\r | |
93 | static int expr();\r | |
94 | static int repeatSymbol();\r | |
95 | static int atomList();\r | |
96 | static void next();\r | |
97 | static ArcPtr newGraphArc();\r | |
98 | static NodePtr newNode();\r | |
99 | static int ArcBetweenGraphNode();\r | |
100 | static Graph BuildNFA_atom();\r | |
101 | static Graph BuildNFA_AB();\r | |
102 | static Graph BuildNFA_AorB();\r | |
103 | static Graph BuildNFA_set();\r | |
104 | static Graph BuildNFA_Astar();\r | |
105 | static Graph BuildNFA_Aplus();\r | |
106 | static Graph BuildNFA_Aoptional();\r | |
107 | #endif\r | |
108 | \r | |
109 | static char *_c;\r | |
110 | static int token, tokchar;\r | |
111 | static NodePtr accept;\r | |
112 | static NodePtr freelist = NULL;\r | |
113 | \r | |
114 | /*\r | |
115 | * return 1 if s in language described by expr\r | |
116 | * 0 if s is not\r | |
117 | * -1 if expr is an invalid regular expression\r | |
118 | */\r | |
119 | #ifdef __USE_PROTOS\r | |
120 | static int rexpr(char *expr,char *s)\r | |
121 | #else\r | |
122 | static int rexpr(expr, s)\r | |
123 | char *expr, *s;\r | |
124 | #endif\r | |
125 | {\r | |
126 | NodePtr p,q;\r | |
127 | Graph nfa;\r | |
128 | int result;\r | |
129 | \r | |
130 | fprintf(stderr, "rexpr(%s,%s);\n", expr,s);\r | |
131 | freelist = NULL;\r | |
132 | _c = expr;\r | |
133 | next();\r | |
134 | if ( regExpr(&nfa) == -1 ) return -1;\r | |
135 | accept = nfa.right;\r | |
136 | result = match(nfa.left, s);\r | |
137 | /* free all your memory */\r | |
138 | p = q = freelist;\r | |
139 | while ( p!=NULL ) { q = p->track; free(p); p = q; }\r | |
140 | return result;\r | |
141 | }\r | |
142 | \r | |
143 | /*\r | |
144 | * do a depth-first-search on the NFA looking for a path from start to\r | |
145 | * accept state labelled with the characters of 's'.\r | |
146 | */\r | |
147 | \r | |
148 | #ifdef __USE_PROTOS\r | |
149 | static int match(NodePtr automaton,char *s)\r | |
150 | #else\r | |
151 | static int match(automaton, s)\r | |
152 | NodePtr automaton;\r | |
153 | char *s;\r | |
154 | #endif\r | |
155 | {\r | |
156 | ArcPtr p;\r | |
157 | \r | |
158 | if ( automaton == accept && *s == '\0' ) return 1; /* match */\r | |
159 | \r | |
160 | for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */\r | |
161 | {\r | |
162 | if ( p->label == Epsilon )\r | |
163 | {\r | |
164 | if ( match(p->target, s) ) return 1;\r | |
165 | }\r | |
166 | else if ( p->label == *s )\r | |
167 | if ( match(p->target, s+1) ) return 1;\r | |
168 | }\r | |
169 | return 0;\r | |
170 | }\r | |
171 | \r | |
172 | /*\r | |
173 | * <regExpr> ::= <andExpr> ( '|' {<andExpr>} )*\r | |
174 | *\r | |
175 | * Return -1 if syntax error\r | |
176 | * Return 0 if none found\r | |
177 | * Return 1 if a regExrp was found\r | |
178 | */\r | |
179 | \r | |
180 | #ifdef __USE_PROTOS\r | |
181 | static int regExpr(GraphPtr g)\r | |
182 | #else\r | |
183 | static int regExpr(g)\r | |
184 | GraphPtr g;\r | |
185 | #endif\r | |
186 | {\r | |
187 | Graph g1, g2;\r | |
188 | \r | |
189 | if ( andExpr(&g1) == -1 )\r | |
190 | {\r | |
191 | return -1;\r | |
192 | }\r | |
193 | \r | |
194 | while ( token == '|' )\r | |
195 | {\r | |
196 | int a;\r | |
197 | next();\r | |
198 | a = andExpr(&g2);\r | |
199 | if ( a == -1 ) return -1; /* syntax error below */\r | |
200 | else if ( !a ) return 1; /* empty alternative */\r | |
201 | g1 = BuildNFA_AorB(g1, g2);\r | |
202 | }\r | |
203 | \r | |
204 | if ( token!='\0' ) return -1;\r | |
205 | \r | |
206 | *g = g1;\r | |
207 | return 1;\r | |
208 | }\r | |
209 | \r | |
210 | /*\r | |
211 | * <andExpr> ::= <expr> ( <expr> )*\r | |
212 | */\r | |
213 | \r | |
214 | #ifdef __USE_PROTOS\r | |
215 | static int andExpr(GraphPtr g)\r | |
216 | #else\r | |
217 | static int andExpr(g)\r | |
218 | GraphPtr g;\r | |
219 | #endif\r | |
220 | {\r | |
221 | Graph g1, g2;\r | |
222 | \r | |
223 | if ( expr(&g1) == -1 )\r | |
224 | {\r | |
225 | return -1;\r | |
226 | }\r | |
227 | \r | |
228 | while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )\r | |
229 | {\r | |
230 | if (expr(&g2) == -1) return -1;\r | |
231 | g1 = BuildNFA_AB(g1, g2);\r | |
232 | }\r | |
233 | \r | |
234 | *g = g1;\r | |
235 | return 1;\r | |
236 | }\r | |
237 | \r | |
238 | /*\r | |
239 | * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>\r | |
240 | * | '(' <regExpr> ')' <repeatSymbol>\r | |
241 | * | '{' <regExpr> '}' <repeatSymbol>\r | |
242 | * | <atom> <repeatSymbol>\r | |
243 | */\r | |
244 | \r | |
245 | #ifdef __USE_PROTOS\r | |
246 | static int expr(GraphPtr g)\r | |
247 | #else\r | |
248 | static int expr(g)\r | |
249 | GraphPtr g;\r | |
250 | #endif\r | |
251 | {\r | |
252 | int complement = 0;\r | |
253 | char s[257]; /* alloc space for string of char in [] */\r | |
254 | \r | |
255 | if ( token == '~' || token == '[' )\r | |
256 | {\r | |
257 | if ( token == '~' ) {complement = 1; next();}\r | |
258 | if ( token != '[' ) return -1;\r | |
259 | next();\r | |
260 | if ( atomList( s, complement ) == -1 ) return -1;\r | |
261 | *g = BuildNFA_set( s );\r | |
262 | if ( token != ']' ) return -1;\r | |
263 | next();\r | |
264 | repeatSymbol( g );\r | |
265 | return 1;\r | |
266 | }\r | |
267 | if ( token == '(' )\r | |
268 | {\r | |
269 | next();\r | |
270 | if ( regExpr( g ) == -1 ) return -1;\r | |
271 | if ( token != ')' ) return -1;\r | |
272 | next();\r | |
273 | repeatSymbol( g );\r | |
274 | return 1;\r | |
275 | }\r | |
276 | if ( token == '{' )\r | |
277 | {\r | |
278 | next();\r | |
279 | if ( regExpr( g ) == -1 ) return -1;\r | |
280 | if ( token != '}' ) return -1;\r | |
281 | next();\r | |
282 | /* S p e c i a l C a s e O p t i o n a l { } */\r | |
283 | if ( token != '*' && token != '+' )\r | |
284 | {\r | |
285 | *g = BuildNFA_Aoptional( *g );\r | |
286 | }\r | |
287 | repeatSymbol( g );\r | |
288 | return 1;\r | |
289 | }\r | |
290 | if ( token == Atom )\r | |
291 | {\r | |
292 | *g = BuildNFA_atom( tokchar );\r | |
293 | next();\r | |
294 | repeatSymbol( g );\r | |
295 | return 1;\r | |
296 | }\r | |
297 | \r | |
298 | return -1;\r | |
299 | }\r | |
300 | \r | |
301 | /*\r | |
302 | * <repeatSymbol> ::= { '*' | '+' }\r | |
303 | */\r | |
304 | #ifdef __USE_PROTOS\r | |
305 | static int repeatSymbol(GraphPtr g)\r | |
306 | #else\r | |
307 | static int repeatSymbol(g)\r | |
308 | GraphPtr g;\r | |
309 | #endif\r | |
310 | {\r | |
311 | switch ( token )\r | |
312 | {\r | |
313 | case '*' : *g = BuildNFA_Astar( *g ); next(); break;\r | |
314 | case '+' : *g = BuildNFA_Aplus( *g ); next(); break;\r | |
315 | }\r | |
316 | return 1;\r | |
317 | }\r | |
318 | \r | |
319 | /*\r | |
320 | * <atomList> ::= <atom> { <atom> }*\r | |
321 | * { <atomList> } <atom> '-' <atom> { <atomList> }\r | |
322 | *\r | |
323 | * a-b is same as ab\r | |
324 | * q-a is same as q\r | |
325 | */\r | |
326 | \r | |
327 | #ifdef __USE_PROTOS\r | |
328 | static int atomList(char *p, int complement)\r | |
329 | #else\r | |
330 | static int atomList(p, complement)\r | |
331 | char *p;\r | |
332 | int complement;\r | |
333 | #endif\r | |
334 | {\r | |
335 | static unsigned char set[256]; /* no duplicates */\r | |
336 | int first, last, i;\r | |
337 | char *s = p;\r | |
338 | \r | |
339 | if ( token != Atom ) return -1;\r | |
340 | \r | |
341 | for (i=0; i<256; i++) set[i] = 0;\r | |
342 | while ( token == Atom )\r | |
343 | {\r | |
344 | if ( !set[tokchar] ) *s++ = tokchar;\r | |
345 | set[tokchar] = 1; /* Add atom to set */\r | |
346 | next();\r | |
347 | if ( token == '-' ) /* have we found '-' */\r | |
348 | {\r | |
349 | first = *(s-1); /* Get last char */\r | |
350 | next();\r | |
351 | if ( token != Atom ) return -1;\r | |
352 | else\r | |
353 | {\r | |
354 | last = tokchar;\r | |
355 | }\r | |
356 | for (i = first+1; i <= last; i++)\r | |
357 | {\r | |
358 | if ( !set[tokchar] ) *s++ = i;\r | |
359 | set[i] = 1; /* Add atom to set */\r | |
360 | }\r | |
361 | next();\r | |
362 | }\r | |
363 | }\r | |
364 | *s = '\0';\r | |
365 | if ( complement )\r | |
366 | {\r | |
367 | for (i=0; i<256; i++) set[i] = !set[i];\r | |
368 | for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;\r | |
369 | *s = '\0';\r | |
370 | }\r | |
371 | return 1;\r | |
372 | }\r | |
373 | \r | |
374 | /* a somewhat stupid lexical analyzer */\r | |
375 | \r | |
376 | #ifdef __USE_PROTOS\r | |
377 | static void next(void)\r | |
378 | #else\r | |
379 | static void next()\r | |
380 | #endif\r | |
381 | {\r | |
382 | while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;\r | |
383 | if ( *_c=='\\' )\r | |
384 | {\r | |
385 | _c++;\r | |
386 | if ( isdigit(*_c) )\r | |
387 | {\r | |
388 | int n=0;\r | |
389 | while ( isdigit(*_c) )\r | |
390 | {\r | |
391 | n = n*10 + (*_c++ - '0');\r | |
392 | }\r | |
393 | if ( n>255 ) n=255;\r | |
394 | tokchar = n;\r | |
395 | }\r | |
396 | else\r | |
397 | {\r | |
398 | switch (*_c)\r | |
399 | {\r | |
400 | case 'n' : tokchar = '\n'; break;\r | |
401 | case 't' : tokchar = '\t'; break;\r | |
402 | case 'r' : tokchar = '\r'; break;\r | |
403 | default : tokchar = *_c;\r | |
404 | }\r | |
405 | _c++;\r | |
406 | }\r | |
407 | token = Atom;\r | |
408 | }\r | |
409 | else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&\r | |
410 | *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' &&\r | |
411 | *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' )\r | |
412 | {\r | |
413 | token = Atom;\r | |
414 | tokchar = *_c++;\r | |
415 | }\r | |
416 | else\r | |
417 | {\r | |
418 | token = tokchar = *_c++;\r | |
419 | }\r | |
420 | }\r | |
421 | \r | |
422 | /* N F A B u i l d i n g R o u t i n e s */\r | |
423 | \r | |
424 | #ifdef __USE_PROTOS\r | |
425 | static ArcPtr newGraphArc(void)\r | |
426 | #else\r | |
427 | static ArcPtr newGraphArc()\r | |
428 | #endif\r | |
429 | {\r | |
430 | ArcPtr p;\r | |
431 | p = (ArcPtr) calloc(1, sizeof(Arc));\r | |
432 | if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}\r | |
433 | if ( freelist != NULL ) p->track = (ArcPtr) freelist;\r | |
434 | freelist = (NodePtr) p;\r | |
435 | return p;\r | |
436 | }\r | |
437 | \r | |
438 | #ifdef __USE_PROTOS\r | |
439 | static NodePtr newNode(void)\r | |
440 | #else\r | |
441 | static NodePtr newNode()\r | |
442 | #endif\r | |
443 | {\r | |
444 | NodePtr p;\r | |
445 | p = (NodePtr) calloc(1, sizeof(Node));\r | |
446 | if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}\r | |
447 | if ( freelist != NULL ) p->track = freelist;\r | |
448 | freelist = p;\r | |
449 | return p;\r | |
450 | }\r | |
451 | \r | |
452 | #ifdef __USE_PROTOS\r | |
453 | static void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)\r | |
454 | #else\r | |
455 | static void ArcBetweenGraphNodes(i, j, label)\r | |
456 | NodePtr i, j;\r | |
457 | int label;\r | |
458 | #endif\r | |
459 | {\r | |
460 | ArcPtr a;\r | |
461 | \r | |
462 | a = newGraphArc();\r | |
463 | if ( i->arcs == NULL ) i->arctail = i->arcs = a;\r | |
464 | else {(i->arctail)->next = a; i->arctail = a;}\r | |
465 | a->label = label;\r | |
466 | a->target = j;\r | |
467 | }\r | |
468 | \r | |
469 | #ifdef __USE_PROTOS\r | |
470 | static Graph BuildNFA_atom(int label)\r | |
471 | #else\r | |
472 | static Graph BuildNFA_atom(label)\r | |
473 | int label;\r | |
474 | #endif\r | |
475 | {\r | |
476 | Graph g;\r | |
477 | \r | |
478 | g.left = newNode();\r | |
479 | g.right = newNode();\r | |
480 | ArcBetweenGraphNodes(g.left, g.right, label);\r | |
481 | return( g );\r | |
482 | }\r | |
483 | \r | |
484 | #ifdef __USE_PROTOS\r | |
485 | static Graph BuildNFA_AB(Graph A,Graph B)\r | |
486 | #else\r | |
487 | static Graph BuildNFA_AB(A, B)\r | |
488 | Graph A, B;\r | |
489 | #endif\r | |
490 | {\r | |
491 | Graph g;\r | |
492 | \r | |
493 | ArcBetweenGraphNodes(A.right, B.left, Epsilon);\r | |
494 | g.left = A.left;\r | |
495 | g.right = B.right;\r | |
496 | return( g );\r | |
497 | }\r | |
498 | \r | |
499 | #ifdef __USE_PROTOS\r | |
500 | static Graph BuildNFA_AorB(Graph A,Graph B)\r | |
501 | #else\r | |
502 | static Graph BuildNFA_AorB(A, B)\r | |
503 | Graph A, B;\r | |
504 | #endif\r | |
505 | {\r | |
506 | Graph g;\r | |
507 | \r | |
508 | g.left = newNode();\r | |
509 | ArcBetweenGraphNodes(g.left, A.left, Epsilon);\r | |
510 | ArcBetweenGraphNodes(g.left, B.left, Epsilon);\r | |
511 | g.right = newNode();\r | |
512 | ArcBetweenGraphNodes(A.right, g.right, Epsilon);\r | |
513 | ArcBetweenGraphNodes(B.right, g.right, Epsilon);\r | |
514 | return( g );\r | |
515 | }\r | |
516 | \r | |
517 | #ifdef __USE_PROTOS\r | |
518 | static Graph BuildNFA_set(char *s)\r | |
519 | #else\r | |
520 | static Graph BuildNFA_set( s )\r | |
521 | char *s;\r | |
522 | #endif\r | |
523 | {\r | |
524 | Graph g;\r | |
525 | \r | |
526 | if ( s == NULL ) return g;\r | |
527 | \r | |
528 | g.left = newNode();\r | |
529 | g.right = newNode();\r | |
530 | while ( *s != '\0' )\r | |
531 | {\r | |
532 | ArcBetweenGraphNodes(g.left, g.right, *s++);\r | |
533 | }\r | |
534 | return g;\r | |
535 | }\r | |
536 | \r | |
537 | #ifdef __USE_PROTOS\r | |
538 | static Graph BuildNFA_Astar(Graph A)\r | |
539 | #else\r | |
540 | static Graph BuildNFA_Astar( A )\r | |
541 | Graph A;\r | |
542 | #endif\r | |
543 | {\r | |
544 | Graph g;\r | |
545 | \r | |
546 | g.left = newNode();\r | |
547 | g.right = newNode();\r | |
548 | \r | |
549 | ArcBetweenGraphNodes(g.left, A.left, Epsilon);\r | |
550 | ArcBetweenGraphNodes(g.left, g.right, Epsilon);\r | |
551 | ArcBetweenGraphNodes(A.right, g.right, Epsilon);\r | |
552 | ArcBetweenGraphNodes(A.right, A.left, Epsilon);\r | |
553 | \r | |
554 | return( g );\r | |
555 | }\r | |
556 | \r | |
557 | #ifdef __USE_PROTOS\r | |
558 | static Graph BuildNFA_Aplus(Graph A)\r | |
559 | #else\r | |
560 | static Graph BuildNFA_Aplus( A )\r | |
561 | Graph A;\r | |
562 | #endif\r | |
563 | {\r | |
564 | ArcBetweenGraphNodes(A.right, A.left, Epsilon);\r | |
565 | \r | |
566 | return( A );\r | |
567 | }\r | |
568 | \r | |
569 | #ifdef __USE_PROTOS\r | |
570 | static Graph BuildNFA_Aoptional(Graph A)\r | |
571 | #else\r | |
572 | static Graph BuildNFA_Aoptional( A )\r | |
573 | Graph A;\r | |
574 | #endif\r | |
575 | {\r | |
576 | Graph g;\r | |
577 | \r | |
578 | g.left = newNode();\r | |
579 | g.right = newNode();\r | |
580 | \r | |
581 | ArcBetweenGraphNodes(g.left, A.left, Epsilon);\r | |
582 | ArcBetweenGraphNodes(g.left, g.right, Epsilon);\r | |
583 | ArcBetweenGraphNodes(A.right, g.right, Epsilon);\r | |
584 | \r | |
585 | return( g );\r | |
586 | }\r |