ceph/src/boost/tools/build/src/engine/regexp.cpp

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *      this software, no matter how awful, even if they arise
  13  *      from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *      by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *      be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *
  40  * Beware that some of this code is subtly aware of the way operator precedence
  41  * is structured in regular expressions. Serious changes in regular-expression
  42  * syntax might require a total rethink.
  43  */
  44
  45
  46 #include "jam.h"
  47 #include "regexp.h"
  48 #include "output.h"
  49
  50 #include <stdio.h>
  51 #include <ctype.h>
  52 #ifndef ultrix
  53 # include <stdlib.h>
  54 #endif
  55 #include <string.h>
  56
  57
  58 /*
  59  * The "internal use only" fields in regexp.h are present to pass info from
  60  * compile to execute that permits the execute phase to run lots faster on
  61  * simple cases. They are:
  62  :
  63  *  regstart char that must begin a match; '\0' if none obvious.
  64  *  reganch  is the match anchored (at beginning-of-line only)?
  65  *  regmust  string (pointer into program) that match must include, or NULL.
  66  *  regmlen  length of regmust string.
  67  *
  68  * Regstart and reganch permit very fast decisions on suitable starting points
  69  * for a match, cutting down the work a lot.  Regmust permits fast rejection of
  70  * lines that cannot possibly match.  The regmust tests are costly enough that
  71  * regcomp() supplies a regmust only if the r.e. contains something potentially
  72  * expensive (at present, the only such thing detected is * or + at the start of
  73  * the r.e., which can involve a lot of backup). Regmlen is supplied because the
  74  * test in regexec() needs it and regcomp() is computing it anyway.
  75  */
  76
  77 /*
  78  * Structure for regexp "program". This is essentially a linear encoding of a
  79  * nondeterministic finite-state machine (aka syntax charts or "railroad normal
  80  * form" in parsing technology). Each node is an opcode plus a "next" pointer,
  81  * possibly plus an operand. "Next" pointers of all nodes except BRANCH
  82  * implement concatenation; a "next" pointer with a BRANCH on both ends of it is
  83  * connecting two alternatives. [Here we have one of the subtle syntax
  84  * dependencies: an individual BRANCH, as opposed to a collection of them, is
  85  * never concatenated with anything because of operator precedence.] The operand
  86  * of some types of node is a literal string; for others, it is a node leading
  87  * into a sub-FSM. In particular, the operand of a BRANCH node is the first node
  88  * of the branch. [NB this is *not* a tree structure: the tail of the branch
  89  * connects to the thing following the set of BRANCHes.] The opcodes are:
  90  */
  91
  92 /* definition   number  opnd?   meaning */
  93 #define END 0   /* no   End of program. */
  94 #define BOL 1   /* no   Match "" at beginning of line. */
  95 #define EOL 2   /* no   Match "" at end of line. */
  96 #define ANY 3   /* no   Match any one character. */
  97 #define ANYOF   4   /* str  Match any character in this string. */
  98 #define ANYBUT  5   /* str  Match any character not in this string. */
  99 #define BRANCH  6   /* node Match this alternative, or the next... */
 100 #define BACK    7   /* no   Match "", "next" ptr points backward. */
 101 #define EXACTLY 8   /* str  Match this string. */
 102 #define NOTHING 9   /* no   Match empty string. */
 103 #define STAR    10  /* node Match this (simple) thing 0 or more times. */
 104 #define PLUS    11  /* node Match this (simple) thing 1 or more times. */
 105 #define WORDA   12  /* no   Match "" at wordchar, where prev is nonword */
 106 #define WORDZ   13  /* no   Match "" at nonwordchar, where prev is word */
 107 #define OPEN    20  /* no   Mark this point in input as start of #n. */
 108             /*  OPEN+1 is number 1, etc. */
 109 #define CLOSE   30  /* no   Analogous to OPEN. */
 110
 111
 112 /*
 113  * Opcode notes:
 114  *
 115  * BRANCH   The set of branches constituting a single choice are hooked
 116  *      together with their "next" pointers, since precedence prevents
 117  *      anything being concatenated to any individual branch.  The
 118  *      "next" pointer of the last BRANCH in a choice points to the
 119  *      thing following the whole choice.  This is also where the
 120  *      final "next" pointer of each individual branch points; each
 121  *      branch starts with the operand node of a BRANCH node.
 122  *
 123  * BACK     Normal "next" pointers all implicitly point forward; BACK
 124  *      exists to make loop structures possible.
 125  *
 126  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 127  *      BRANCH structures using BACK.  Simple cases (one character
 128  *      per match) are implemented with STAR and PLUS for speed
 129  *      and to minimize recursive plunges.
 130  *
 131  * OPEN,CLOSE   ...are numbered at compile time.
 132  */
 133
 134 /*
 135  * A node is one char of opcode followed by two chars of "next" pointer.
 136  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 137  * value is a positive offset from the opcode of the node containing it.
 138  * An operand, if any, simply follows the node.  (Note that much of the
 139  * code generation knows about this implicit relationship.)
 140  *
 141  * Using two bytes for the "next" pointer is vast overkill for most things,
 142  * but allows patterns to get big without disasters.
 143  */
 144 #define OP(p)   (*(p))
 145 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
 146 #define OPERAND(p)  ((p) + 3)
 147
 148 /*
 149  * See regmagic.h for one further detail of program structure.
 150  */
 151
 152
 153 /*
 154  * Utility definitions.
 155  */
 156 #ifndef CHARBITS
 157 #define UCHARAT(p)  ((int)*(const unsigned char *)(p))
 158 #else
 159 #define UCHARAT(p)  ((int)*(p)&CHARBITS)
 160 #endif
 161
 162 #define FAIL(m) { regerror(m); return(NULL); }
 163 #define ISMULT(c)   ((c) == '*' || (c) == '+' || (c) == '?')
 164
 165 /*
 166  * Flags to be passed up and down.
 167  */
 168 #define HASWIDTH    01  /* Known never to match null string. */
 169 #define SIMPLE      02  /* Simple enough to be STAR/PLUS operand. */
 170 #define SPSTART     04  /* Starts with * or +. */
 171 #define WORST       0   /* Worst case. */
 172
 173 /*
 174  * Global work variables for regcomp().
 175  */
 176 static char *regparse;      /* Input-scan pointer. */
 177 static int regnpar;     /* () count. */
 178 static char regdummy;
 179 static char *regcode;       /* Code-emit pointer; &regdummy = don't. */
 180 static long regsize;        /* Code size. */
 181
 182 /*
 183  * Forward declarations for regcomp()'s friends.
 184  */
 185 #ifndef STATIC
 186 #define STATIC  static
 187 #endif
 188 STATIC char *reg( int paren, int *flagp );
 189 STATIC char *regbranch( int *flagp );
 190 STATIC char *regpiece( int *flagp );
 191 STATIC char *regatom( int *flagp );
 192 STATIC char *regnode( int op );
 193 STATIC char *regnext( char *p );
 194 STATIC void regc( int b );
 195 STATIC void reginsert( char op, char *opnd );
 196 STATIC void regtail( char *p, char *val );
 197 STATIC void regoptail( char *p, char *val );
 198 #ifdef STRCSPN
 199 STATIC int strcspn();
 200 #endif
 201
 202 /*
 203  - regcomp - compile a regular expression into internal code
 204  *
 205  * We can't allocate space until we know how big the compiled form will be,
 206  * but we can't compile it (and thus know how big it is) until we've got a
 207  * place to put the code.  So we cheat:  we compile it twice, once with code
 208  * generation turned off and size counting turned on, and once "for real".
 209  * This also means that we don't allocate space until we are sure that the
 210  * thing really will compile successfully, and we never have to move the
 211  * code and thus invalidate pointers into it.  (Note that it has to be in
 212  * one piece because free() must be able to free it all.)
 213  *
 214  * Beware that the optimization-preparation code in here knows about some
 215  * of the structure of the compiled regexp.
 216  */
 217 regexp *
 218 regcomp( const char *exp )
 219 {
 220     regexp *r;
 221     char *scan;
 222     char *longest;
 223     unsigned len;
 224     int flags;
 225
 226     if (exp == NULL)
 227         FAIL("NULL argument");
 228
 229     /* First pass: determine size, legality. */
 230 #ifdef notdef
 231     if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
 232 #endif
 233     regparse = (char *)exp;
 234     regnpar = 1;
 235     regsize = 0L;
 236     regcode = &regdummy;
 237     regc(MAGIC);
 238     if (reg(0, &flags) == NULL)
 239         return(NULL);
 240
 241     /* Small enough for pointer-storage convention? */
 242     if (regsize >= 32767L)      /* Probably could be 65535L. */
 243         FAIL("regexp too big");
 244
 245     /* Allocate space. */
 246     r = (regexp *)BJAM_MALLOC(sizeof(regexp) + (unsigned)regsize);
 247     if (r == NULL)
 248         FAIL("out of space");
 249
 250     /* Second pass: emit code. */
 251     regparse = (char *)exp;
 252     regnpar = 1;
 253     regcode = r->program;
 254     regc(MAGIC);
 255     if (reg(0, &flags) == NULL)
 256         return(NULL);
 257
 258     /* Dig out information for optimizations. */
 259     r->regstart = '\0'; /* Worst-case defaults. */
 260     r->reganch = 0;
 261     r->regmust = NULL;
 262     r->regmlen = 0;
 263     scan = r->program+1;            /* First BRANCH. */
 264     if (OP(regnext(scan)) == END) {     /* Only one top-level choice. */
 265         scan = OPERAND(scan);
 266
 267         /* Starting-point info. */
 268         if (OP(scan) == EXACTLY)
 269             r->regstart = *OPERAND(scan);
 270         else if (OP(scan) == BOL)
 271             r->reganch++;
 272
 273         /*
 274          * If there's something expensive in the r.e., find the
 275          * longest literal string that must appear and make it the
 276          * regmust.  Resolve ties in favor of later strings, since
 277          * the regstart check works with the beginning of the r.e.
 278          * and avoiding duplication strengthens checking.  Not a
 279          * strong reason, but sufficient in the absence of others.
 280          */
 281         if (flags&SPSTART) {
 282             longest = NULL;
 283             len = 0;
 284             for (; scan != NULL; scan = regnext(scan))
 285                 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
 286                     longest = OPERAND(scan);
 287                     len = strlen(OPERAND(scan));
 288                 }
 289             r->regmust = longest;
 290             r->regmlen = len;
 291         }
 292     }
 293
 294     return(r);
 295 }
 296
 297 /*
 298  - reg - regular expression, i.e. main body or parenthesized thing
 299  *
 300  * Caller must absorb opening parenthesis.
 301  *
 302  * Combining parenthesis handling with the base level of regular expression
 303  * is a trifle forced, but the need to tie the tails of the branches to what
 304  * follows makes it hard to avoid.
 305  */
 306 static char *
 307 reg(
 308     int paren,          /* Parenthesized? */
 309     int *flagp )
 310 {
 311     char *ret;
 312     char *br;
 313     char *ender;
 314     int parno = 0;
 315     int flags;
 316
 317     *flagp = HASWIDTH;  /* Tentatively. */
 318
 319     /* Make an OPEN node, if parenthesized. */
 320     if (paren) {
 321         if (regnpar >= NSUBEXP)
 322             FAIL("too many ()");
 323         parno = regnpar;
 324         regnpar++;
 325         ret = regnode(OPEN+parno);
 326     } else
 327         ret = NULL;
 328
 329     /* Pick up the branches, linking them together. */
 330     br = regbranch(&flags);
 331     if (br == NULL)
 332         return(NULL);
 333     if (ret != NULL)
 334         regtail(ret, br);   /* OPEN -> first. */
 335     else
 336         ret = br;
 337     if (!(flags&HASWIDTH))
 338         *flagp &= ~HASWIDTH;
 339     *flagp |= flags&SPSTART;
 340     while (*regparse == '|' || *regparse == '\n') {
 341         regparse++;
 342         br = regbranch(&flags);
 343         if (br == NULL)
 344             return(NULL);
 345         regtail(ret, br);   /* BRANCH -> BRANCH. */
 346         if (!(flags&HASWIDTH))
 347             *flagp &= ~HASWIDTH;
 348         *flagp |= flags&SPSTART;
 349     }
 350
 351     /* Make a closing node, and hook it on the end. */
 352     ender = regnode((paren) ? CLOSE+parno : END);
 353     regtail(ret, ender);
 354
 355     /* Hook the tails of the branches to the closing node. */
 356     for (br = ret; br != NULL; br = regnext(br))
 357         regoptail(br, ender);
 358
 359     /* Check for proper termination. */
 360     if (paren && *regparse++ != ')') {
 361         FAIL("unmatched ()");
 362     } else if (!paren && *regparse != '\0') {
 363         if (*regparse == ')') {
 364             FAIL("unmatched ()");
 365         } else
 366             FAIL("junk on end");    /* "Can't happen". */
 367         /* NOTREACHED */
 368     }
 369
 370     return(ret);
 371 }
 372
 373 /*
 374  - regbranch - one alternative of an | operator
 375  *
 376  * Implements the concatenation operator.
 377  */
 378 static char *
 379 regbranch( int *flagp )
 380 {
 381     char *ret;
 382     char *chain;
 383     char *latest;
 384     int flags;
 385
 386     *flagp = WORST;     /* Tentatively. */
 387
 388     ret = regnode(BRANCH);
 389     chain = NULL;
 390     while (*regparse != '\0' && *regparse != ')' &&
 391            *regparse != '\n' && *regparse != '|') {
 392         latest = regpiece(&flags);
 393         if (latest == NULL)
 394             return(NULL);
 395         *flagp |= flags&HASWIDTH;
 396         if (chain == NULL)  /* First piece. */
 397             *flagp |= flags&SPSTART;
 398         else
 399             regtail(chain, latest);
 400         chain = latest;
 401     }
 402     if (chain == NULL)  /* Loop ran zero times. */
 403         (void) regnode(NOTHING);
 404
 405     return(ret);
 406 }
 407
 408 /*
 409  - regpiece - something followed by possible [*+?]
 410  *
 411  * Note that the branching code sequences used for ? and the general cases
 412  * of * and + are somewhat optimized:  they use the same NOTHING node as
 413  * both the endmarker for their branch list and the body of the last branch.
 414  * It might seem that this node could be dispensed with entirely, but the
 415  * endmarker role is not redundant.
 416  */
 417 static char *
 418 regpiece( int *flagp )
 419 {
 420     char *ret;
 421     char op;
 422     char *next;
 423     int flags;
 424
 425     ret = regatom(&flags);
 426     if (ret == NULL)
 427         return(NULL);
 428
 429     op = *regparse;
 430     if (!ISMULT(op)) {
 431         *flagp = flags;
 432         return(ret);
 433     }
 434
 435     if (!(flags&HASWIDTH) && op != '?')
 436         FAIL("*+ operand could be empty");
 437     *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
 438
 439     if (op == '*' && (flags&SIMPLE))
 440         reginsert(STAR, ret);
 441     else if (op == '*') {
 442         /* Emit x* as (x&|), where & means "self". */
 443         reginsert(BRANCH, ret);         /* Either x */
 444         regoptail(ret, regnode(BACK));      /* and loop */
 445         regoptail(ret, ret);            /* back */
 446         regtail(ret, regnode(BRANCH));      /* or */
 447         regtail(ret, regnode(NOTHING));     /* null. */
 448     } else if (op == '+' && (flags&SIMPLE))
 449         reginsert(PLUS, ret);
 450     else if (op == '+') {
 451         /* Emit x+ as x(&|), where & means "self". */
 452         next = regnode(BRANCH);         /* Either */
 453         regtail(ret, next);
 454         regtail(regnode(BACK), ret);        /* loop back */
 455         regtail(next, regnode(BRANCH));     /* or */
 456         regtail(ret, regnode(NOTHING));     /* null. */
 457     } else if (op == '?') {
 458         /* Emit x? as (x|) */
 459         reginsert(BRANCH, ret);         /* Either x */
 460         regtail(ret, regnode(BRANCH));      /* or */
 461         next = regnode(NOTHING);        /* null. */
 462         regtail(ret, next);
 463         regoptail(ret, next);
 464     }
 465     regparse++;
 466     if (ISMULT(*regparse))
 467         FAIL("nested *?+");
 468
 469     return(ret);
 470 }
 471
 472 /*
 473  - regatom - the lowest level
 474  *
 475  * Optimization:  gobbles an entire sequence of ordinary characters so that
 476  * it can turn them into a single node, which is smaller to store and
 477  * faster to run.  Backslashed characters are exceptions, each becoming a
 478  * separate node; the code is simpler that way and it's not worth fixing.
 479  */
 480 static char *
 481 regatom( int *flagp )
 482 {
 483     char *ret;
 484     int flags;
 485
 486     *flagp = WORST;     /* Tentatively. */
 487
 488     switch (*regparse++) {
 489     /* FIXME: these chars only have meaning at beg/end of pat? */
 490     case '^':
 491         ret = regnode(BOL);
 492         break;
 493     case '$':
 494         ret = regnode(EOL);
 495         break;
 496     case '.':
 497         ret = regnode(ANY);
 498         *flagp |= HASWIDTH|SIMPLE;
 499         break;
 500     case '[': {
 501             int classr;
 502             int classend;
 503
 504             if (*regparse == '^') { /* Complement of range. */
 505                 ret = regnode(ANYBUT);
 506                 regparse++;
 507             } else
 508                 ret = regnode(ANYOF);
 509             if (*regparse == ']' || *regparse == '-')
 510                 regc(*regparse++);
 511             while (*regparse != '\0' && *regparse != ']') {
 512                 if (*regparse == '-') {
 513                     regparse++;
 514                     if (*regparse == ']' || *regparse == '\0')
 515                         regc('-');
 516                     else {
 517                         classr = UCHARAT(regparse-2)+1;
 518                         classend = UCHARAT(regparse);
 519                         if (classr > classend+1)
 520                             FAIL("invalid [] range");
 521                         for (; classr <= classend; classr++)
 522                             regc(classr);
 523                         regparse++;
 524                     }
 525                 } else
 526                     regc(*regparse++);
 527             }
 528             regc('\0');
 529             if (*regparse != ']')
 530                 FAIL("unmatched []");
 531             regparse++;
 532             *flagp |= HASWIDTH|SIMPLE;
 533         }
 534         break;
 535     case '(':
 536         ret = reg(1, &flags);
 537         if (ret == NULL)
 538             return(NULL);
 539         *flagp |= flags&(HASWIDTH|SPSTART);
 540         break;
 541     case '\0':
 542     case '|':
 543     case '\n':
 544     case ')':
 545         FAIL("internal urp");   /* Supposed to be caught earlier. */
 546         break;
 547     case '?':
 548     case '+':
 549     case '*':
 550         FAIL("?+* follows nothing");
 551         break;
 552     case '\\':
 553         switch (*regparse++) {
 554         case '\0':
 555             FAIL("trailing \\");
 556             break;
 557         case '<':
 558             ret = regnode(WORDA);
 559             break;
 560         case '>':
 561             ret = regnode(WORDZ);
 562             break;
 563         /* FIXME: Someday handle \1, \2, ... */
 564         default:
 565             /* Handle general quoted chars in exact-match routine */
 566             goto de_fault;
 567         }
 568         break;
 569     de_fault:
 570     default:
 571         /*
 572          * Encode a string of characters to be matched exactly.
 573          *
 574          * This is a bit tricky due to quoted chars and due to
 575          * '*', '+', and '?' taking the SINGLE char previous
 576          * as their operand.
 577          *
 578          * On entry, the char at regparse[-1] is going to go
 579          * into the string, no matter what it is.  (It could be
 580          * following a \ if we are entered from the '\' case.)
 581          *
 582          * Basic idea is to pick up a good char in  ch  and
 583          * examine the next char.  If it's *+? then we twiddle.
 584          * If it's \ then we frozzle.  If it's other magic char
 585          * we push  ch  and terminate the string.  If none of the
 586          * above, we push  ch  on the string and go around again.
 587          *
 588          *  regprev  is used to remember where "the current char"
 589          * starts in the string, if due to a *+? we need to back
 590          * up and put the current char in a separate, 1-char, string.
 591          * When  regprev  is NULL,  ch  is the only char in the
 592          * string; this is used in *+? handling, and in setting
 593          * flags |= SIMPLE at the end.
 594          */
 595         {
 596             char *regprev;
 597             char ch;
 598
 599             regparse--;         /* Look at cur char */
 600             ret = regnode(EXACTLY);
 601             for ( regprev = 0 ; ; ) {
 602                 ch = *regparse++;   /* Get current char */
 603                 switch (*regparse) {    /* look at next one */
 604
 605                 default:
 606                     regc(ch);   /* Add cur to string */
 607                     break;
 608
 609                 case '.': case '[': case '(':
 610                 case ')': case '|': case '\n':
 611                 case '$': case '^':
 612                 case '\0':
 613                 /* FIXME, $ and ^ should not always be magic */
 614                 magic:
 615                     regc(ch);   /* dump cur char */
 616                     goto done;  /* and we are done */
 617
 618                 case '?': case '+': case '*':
 619                     if (!regprev)   /* If just ch in str, */
 620                         goto magic; /* use it */
 621                     /* End mult-char string one early */
 622                     regparse = regprev; /* Back up parse */
 623                     goto done;
 624
 625                 case '\\':
 626                     regc(ch);   /* Cur char OK */
 627                     switch (regparse[1]){ /* Look after \ */
 628                     case '\0':
 629                     case '<':
 630                     case '>':
 631                     /* FIXME: Someday handle \1, \2, ... */
 632                         goto done; /* Not quoted */
 633                     default:
 634                         /* Backup point is \, scan                           * point is after it. */
 635                         regprev = regparse;
 636                         regparse++;
 637                         continue;   /* NOT break; */
 638                     }
 639                 }
 640                 regprev = regparse; /* Set backup point */
 641             }
 642         done:
 643             regc('\0');
 644             *flagp |= HASWIDTH;
 645             if (!regprev)       /* One char? */
 646                 *flagp |= SIMPLE;
 647         }
 648         break;
 649     }
 650
 651     return(ret);
 652 }
 653
 654 /*
 655  - regnode - emit a node
 656  */
 657 static char *           /* Location. */
 658 regnode( int op )
 659 {
 660     char *ret;
 661     char *ptr;
 662
 663     ret = regcode;
 664     if (ret == &regdummy) {
 665         regsize += 3;
 666         return(ret);
 667     }
 668
 669     ptr = ret;
 670     *ptr++ = op;
 671     *ptr++ = '\0';      /* Null "next" pointer. */
 672     *ptr++ = '\0';
 673     regcode = ptr;
 674
 675     return(ret);
 676 }
 677
 678 /*
 679  - regc - emit (if appropriate) a byte of code
 680  */
 681 static void
 682 regc( int b )
 683 {
 684     if (regcode != &regdummy)
 685         *regcode++ = b;
 686     else
 687         regsize++;
 688 }
 689
 690 /*
 691  - reginsert - insert an operator in front of already-emitted operand
 692  *
 693  * Means relocating the operand.
 694  */
 695 static void
 696 reginsert(
 697     char op,
 698     char *opnd )
 699 {
 700     char *src;
 701     char *dst;
 702     char *place;
 703
 704     if (regcode == &regdummy) {
 705         regsize += 3;
 706         return;
 707     }
 708
 709     src = regcode;
 710     regcode += 3;
 711     dst = regcode;
 712     while (src > opnd)
 713         *--dst = *--src;
 714
 715     place = opnd;       /* Op node, where operand used to be. */
 716     *place++ = op;
 717     *place++ = '\0';
 718     *place++ = '\0';
 719 }
 720
 721 /*
 722  - regtail - set the next-pointer at the end of a node chain
 723  */
 724 static void
 725 regtail(
 726     char *p,
 727     char *val )
 728 {
 729     char *scan;
 730     char *temp;
 731     int offset;
 732
 733     if (p == &regdummy)
 734         return;
 735
 736     /* Find last node. */
 737     scan = p;
 738     for (;;) {
 739         temp = regnext(scan);
 740         if (temp == NULL)
 741             break;
 742         scan = temp;
 743     }
 744
 745     if (OP(scan) == BACK)
 746         offset = scan - val;
 747     else
 748         offset = val - scan;
 749     *(scan+1) = (offset>>8)&0377;
 750     *(scan+2) = offset&0377;
 751 }
 752
 753 /*
 754  - regoptail - regtail on operand of first argument; nop if operandless
 755  */
 756
 757 static void
 758 regoptail(
 759     char *p,
 760     char *val )
 761 {
 762     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
 763     if (p == NULL || p == &regdummy || OP(p) != BRANCH)
 764         return;
 765     regtail(OPERAND(p), val);
 766 }
 767
 768 /*
 769  * regexec and friends
 770  */
 771
 772 /*
 773  * Global work variables for regexec().
 774  */
 775 static const char *reginput;      /* String-input pointer. */
 776 static const char *regbol;        /* Beginning of input, for ^ check. */
 777 static const char **regstartp;    /* Pointer to startp array. */
 778 static const char **regendp;      /* Ditto for endp. */
 779
 780 /*
 781  * Forwards.
 782  */
 783 STATIC int regtry( regexp *prog, const char *string );
 784 STATIC int regmatch( char *prog );
 785 STATIC int regrepeat( char *p );
 786
 787 #ifdef DEBUG
 788 int regnarrate = 0;
 789 void regdump();
 790 STATIC char *regprop();
 791 #endif
 792
 793 /*
 794  - regexec - match a regexp against a string
 795  */
 796 int
 797 regexec(
 798     regexp *prog,
 799     const char *string )
 800 {
 801     char *s;
 802
 803     /* Be paranoid... */
 804     if (prog == NULL || string == NULL) {
 805         regerror("NULL parameter");
 806         return(0);
 807     }
 808
 809     /* Check validity of program. */
 810     if (UCHARAT(prog->program) != MAGIC) {
 811         regerror("corrupted program");
 812         return(0);
 813     }
 814
 815     /* If there is a "must appear" string, look for it. */
 816     if ( prog->regmust != NULL )
 817     {
 818         s = (char *)string;
 819         while ( ( s = strchr( s, prog->regmust[ 0 ] ) ) != NULL )
 820         {
 821             if ( !strncmp( s, prog->regmust, prog->regmlen ) )
 822                 break;  /* Found it. */
 823             ++s;
 824         }
 825         if ( s == NULL )  /* Not present. */
 826             return 0;
 827     }
 828
 829     /* Mark beginning of line for ^ . */
 830     regbol = (char *)string;
 831
 832     /* Simplest case:  anchored match need be tried only once. */
 833     if ( prog->reganch )
 834         return regtry( prog, string );
 835
 836     /* Messy cases:  unanchored match. */
 837     s = (char *)string;
 838     if (prog->regstart != '\0')
 839         /* We know what char it must start with. */
 840         while ((s = strchr(s, prog->regstart)) != NULL) {
 841             if (regtry(prog, s))
 842                 return(1);
 843             s++;
 844         }
 845     else
 846         /* We do not -- general case. */
 847         do {
 848             if ( regtry( prog, s ) )
 849                 return( 1 );
 850         } while ( *s++ != '\0' );
 851
 852     /* Failure. */
 853     return 0;
 854 }
 855
 856
 857 /*
 858  * regtry() - try match at specific point.
 859  */
 860
 861 static int          /* 0 failure, 1 success */
 862 regtry(
 863     regexp *prog,
 864     const char *string )
 865 {
 866     int i;
 867     const char * * sp;
 868     const char * * ep;
 869
 870     reginput = string;
 871     regstartp = prog->startp;
 872     regendp = prog->endp;
 873
 874     sp = prog->startp;
 875     ep = prog->endp;
 876     for ( i = NSUBEXP; i > 0; --i )
 877     {
 878         *sp++ = NULL;
 879         *ep++ = NULL;
 880     }
 881     if ( regmatch( prog->program + 1 ) )
 882     {
 883         prog->startp[ 0 ] = string;
 884         prog->endp[ 0 ] = reginput;
 885         return 1;
 886     }
 887     else
 888         return 0;
 889 }
 890
 891
 892 /*
 893  * regmatch() - main matching routine.
 894  *
 895  * Conceptually the strategy is simple: check to see whether the current node
 896  * matches, call self recursively to see whether the rest matches, and then act
 897  * accordingly. In practice we make some effort to avoid recursion, in
 898  * particular by going through "ordinary" nodes (that do not need to know
 899  * whether the rest of the match failed) by a loop instead of by recursion.
 900  */
 901
 902 static int          /* 0 failure, 1 success */
 903 regmatch( char * prog )
 904 {
 905     char * scan;  /* Current node. */
 906     char * next;  /* Next node. */
 907
 908     scan = prog;
 909 #ifdef DEBUG
 910     if (scan != NULL && regnarrate)
 911         err_printf("%s(\n", regprop(scan));
 912 #endif
 913     while (scan != NULL) {
 914 #ifdef DEBUG
 915         if (regnarrate)
 916             err_printf("%s...\n", regprop(scan));
 917 #endif
 918         next = regnext(scan);
 919
 920         switch (OP(scan)) {
 921         case BOL:
 922             if (reginput != regbol)
 923                 return(0);
 924             break;
 925         case EOL:
 926             if (*reginput != '\0')
 927                 return(0);
 928             break;
 929         case WORDA:
 930             /* Must be looking at a letter, digit, or _ */
 931             if ((!isalnum(*reginput)) && *reginput != '_')
 932                 return(0);
 933             /* Prev must be BOL or nonword */
 934             if (reginput > regbol &&
 935                 (isalnum(reginput[-1]) || reginput[-1] == '_'))
 936                 return(0);
 937             break;
 938         case WORDZ:
 939             /* Must be looking at non letter, digit, or _ */
 940             if (isalnum(*reginput) || *reginput == '_')
 941                 return(0);
 942             /* We don't care what the previous char was */
 943             break;
 944         case ANY:
 945             if (*reginput == '\0')
 946                 return(0);
 947             reginput++;
 948             break;
 949         case EXACTLY: {
 950                 int len;
 951                 char *opnd;
 952
 953                 opnd = OPERAND(scan);
 954                 /* Inline the first character, for speed. */
 955                 if (*opnd != *reginput)
 956                     return(0);
 957                 len = strlen(opnd);
 958                 if (len > 1 && strncmp(opnd, reginput, len) != 0)
 959                     return(0);
 960                 reginput += len;
 961             }
 962             break;
 963         case ANYOF:
 964             if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
 965                 return(0);
 966             reginput++;
 967             break;
 968         case ANYBUT:
 969             if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
 970                 return(0);
 971             reginput++;
 972             break;
 973         case NOTHING:
 974             break;
 975         case BACK:
 976             break;
 977         case OPEN+1:
 978         case OPEN+2:
 979         case OPEN+3:
 980         case OPEN+4:
 981         case OPEN+5:
 982         case OPEN+6:
 983         case OPEN+7:
 984         case OPEN+8:
 985         case OPEN+9: {
 986                 int no;
 987                 const char *save;
 988
 989                 no = OP(scan) - OPEN;
 990                 save = reginput;
 991
 992                 if (regmatch(next)) {
 993                     /*
 994                      * Don't set startp if some later
 995                      * invocation of the same parentheses
 996                      * already has.
 997                      */
 998                     if (regstartp[no] == NULL)
 999                         regstartp[no] = save;
1000                     return(1);
1001                 } else
1002                     return(0);
1003             }
1004             break;
1005         case CLOSE+1:
1006         case CLOSE+2:
1007         case CLOSE+3:
1008         case CLOSE+4:
1009         case CLOSE+5:
1010         case CLOSE+6:
1011         case CLOSE+7:
1012         case CLOSE+8:
1013         case CLOSE+9: {
1014                 int no;
1015                 const char *save;
1016
1017                 no = OP(scan) - CLOSE;
1018                 save = reginput;
1019
1020                 if (regmatch(next)) {
1021                     /*
1022                      * Don't set endp if some later
1023                      * invocation of the same parentheses
1024                      * already has.
1025                      */
1026                     if (regendp[no] == NULL)
1027                         regendp[no] = save;
1028                     return(1);
1029                 } else
1030                     return(0);
1031             }
1032             break;
1033         case BRANCH: {
1034                 const char *save;
1035
1036                 if (OP(next) != BRANCH)     /* No choice. */
1037                     next = OPERAND(scan);   /* Avoid recursion. */
1038                 else {
1039                     do {
1040                         save = reginput;
1041                         if (regmatch(OPERAND(scan)))
1042                             return(1);
1043                         reginput = save;
1044                         scan = regnext(scan);
1045                     } while (scan != NULL && OP(scan) == BRANCH);
1046                     return(0);
1047                     /* NOTREACHED */
1048                 }
1049             }
1050             break;
1051         case STAR:
1052         case PLUS: {
1053                 char nextch;
1054                 int no;
1055                 const char *save;
1056                 int min;
1057
1058                 /*
1059                  * Lookahead to avoid useless match attempts
1060                  * when we know what character comes next.
1061                  */
1062                 nextch = '\0';
1063                 if (OP(next) == EXACTLY)
1064                     nextch = *OPERAND(next);
1065                 min = (OP(scan) == STAR) ? 0 : 1;
1066                 save = reginput;
1067                 no = regrepeat(OPERAND(scan));
1068                 while (no >= min) {
1069                     /* If it could work, try it. */
1070                     if (nextch == '\0' || *reginput == nextch)
1071                         if (regmatch(next))
1072                             return(1);
1073                     /* Couldn't or didn't -- back up. */
1074                     no--;
1075                     reginput = save + no;
1076                 }
1077                 return(0);
1078             }
1079             break;
1080         case END:
1081             return(1);  /* Success! */
1082             break;
1083         default:
1084             regerror("memory corruption");
1085             return(0);
1086             break;
1087         }
1088
1089         scan = next;
1090     }
1091
1092     /*
1093      * We get here only if there's trouble -- normally "case END" is
1094      * the terminating point.
1095      */
1096     regerror("corrupted pointers");
1097     return(0);
1098 }
1099
1100 /*
1101  - regrepeat - repeatedly match something simple, report how many
1102  */
1103 static int
1104 regrepeat( char *p )
1105 {
1106     int count = 0;
1107     const char *scan;
1108     char *opnd;
1109
1110     scan = reginput;
1111     opnd = OPERAND(p);
1112     switch (OP(p)) {
1113     case ANY:
1114         count = strlen(scan);
1115         scan += count;
1116         break;
1117     case EXACTLY:
1118         while (*opnd == *scan) {
1119             count++;
1120             scan++;
1121         }
1122         break;
1123     case ANYOF:
1124         while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1125             count++;
1126             scan++;
1127         }
1128         break;
1129     case ANYBUT:
1130         while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1131             count++;
1132             scan++;
1133         }
1134         break;
1135     default:        /* Oh dear.  Called inappropriately. */
1136         regerror("internal foulup");
1137         count = 0;  /* Best compromise. */
1138         break;
1139     }
1140     reginput = scan;
1141
1142     return(count);
1143 }
1144
1145 /*
1146  - regnext - dig the "next" pointer out of a node
1147  */
1148 static char *
1149 regnext( char *p )
1150 {
1151     int offset;
1152
1153     if (p == &regdummy)
1154         return(NULL);
1155
1156     offset = NEXT(p);
1157     if (offset == 0)
1158         return(NULL);
1159
1160     if (OP(p) == BACK)
1161         return(p-offset);
1162     else
1163         return(p+offset);
1164 }
1165
1166 #ifdef DEBUG
1167
1168 STATIC char *regprop();
1169
1170 /*
1171  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1172  */
1173 void
1174 regdump( regexp *r )
1175 {
1176     char *s;
1177     char op = EXACTLY; /* Arbitrary non-END op. */
1178     char *next;
1179
1180
1181     s = r->program + 1;
1182     while (op != END) { /* While that wasn't END last time... */
1183         op = OP(s);
1184         out_printf("%2d%s", s-r->program, regprop(s));  /* Where, what. */
1185         next = regnext(s);
1186         if (next == NULL)       /* Next ptr. */
1187             out_printf("(0)");
1188         else
1189             out_printf("(%d)", (s-r->program)+(next-s));
1190         s += 3;
1191         if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1192             /* Literal string, where present. */
1193             while (*s != '\0') {
1194                 out_putc(*s);
1195                 s++;
1196             }
1197             s++;
1198         }
1199         out_putc('\n');
1200     }
1201
1202     /* Header fields of interest. */
1203     if (r->regstart != '\0')
1204         out_printf("start `%c' ", r->regstart);
1205     if (r->reganch)
1206         out_printf("anchored ");
1207     if (r->regmust != NULL)
1208         out_printf("must have \"%s\"", r->regmust);
1209     out_printf("\n");
1210 }
1211
1212 /*
1213  - regprop - printable representation of opcode
1214  */
1215 static char *
1216 regprop( char *op )
1217 {
1218     char *p;
1219     static char buf[50];
1220
1221     (void) strcpy(buf, ":");
1222
1223     switch (OP(op)) {
1224     case BOL:
1225         p = "BOL";
1226         break;
1227     case EOL:
1228         p = "EOL";
1229         break;
1230     case ANY:
1231         p = "ANY";
1232         break;
1233     case ANYOF:
1234         p = "ANYOF";
1235         break;
1236     case ANYBUT:
1237         p = "ANYBUT";
1238         break;
1239     case BRANCH:
1240         p = "BRANCH";
1241         break;
1242     case EXACTLY:
1243         p = "EXACTLY";
1244         break;
1245     case NOTHING:
1246         p = "NOTHING";
1247         break;
1248     case BACK:
1249         p = "BACK";
1250         break;
1251     case END:
1252         p = "END";
1253         break;
1254     case OPEN+1:
1255     case OPEN+2:
1256     case OPEN+3:
1257     case OPEN+4:
1258     case OPEN+5:
1259     case OPEN+6:
1260     case OPEN+7:
1261     case OPEN+8:
1262     case OPEN+9:
1263         sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1264         p = NULL;
1265         break;
1266     case CLOSE+1:
1267     case CLOSE+2:
1268     case CLOSE+3:
1269     case CLOSE+4:
1270     case CLOSE+5:
1271     case CLOSE+6:
1272     case CLOSE+7:
1273     case CLOSE+8:
1274     case CLOSE+9:
1275         sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1276         p = NULL;
1277         break;
1278     case STAR:
1279         p = "STAR";
1280         break;
1281     case PLUS:
1282         p = "PLUS";
1283         break;
1284     case WORDA:
1285         p = "WORDA";
1286         break;
1287     case WORDZ:
1288         p = "WORDZ";
1289         break;
1290     default:
1291         regerror("corrupted opcode");
1292         break;
1293     }
1294     if (p != NULL)
1295         (void) strcat(buf, p);
1296     return(buf);
1297 }
1298 #endif
1299
1300 /*
1301  * The following is provided for those people who do not have strcspn() in
1302  * their C libraries.  They should get off their butts and do something
1303  * about it; at least one public-domain implementation of those (highly
1304  * useful) string routines has been published on Usenet.
1305  */
1306 #ifdef STRCSPN
1307 /*
1308  * strcspn - find length of initial segment of s1 consisting entirely
1309  * of characters not from s2
1310  */
1311
1312 static int
1313 strcspn(
1314     char *s1,
1315     char *s2 )
1316 {
1317     char *scan1;
1318     char *scan2;
1319     int count;
1320
1321     count = 0;
1322     for (scan1 = s1; *scan1 != '\0'; scan1++) {
1323         for (scan2 = s2; *scan2 != '\0';)   /* ++ moved down. */
1324             if (*scan1 == *scan2++)
1325                 return(count);
1326         count++;
1327     }
1328     return(count);
1329 }
1330 #endif