regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  19    MA 02110-1301 USA.  */
  20
/* Forward declarations of the file-local helpers.  */

/* Compilation driver, fastmap construction and DFA set-up/tear-down.  */
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
                                          int length, reg_syntax_t syntax);
static void re_compile_fastmap_iter (regex_t *bufp,
                                     const re_dfastate_t *init_state,
                                     char *fastmap);
static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
static reg_errcode_t init_word_char (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void free_charset (re_charset_t *cset);
#endif /* RE_ENABLE_I18N */
static void free_workarea_compile (regex_t *preg);
static reg_errcode_t create_initial_state (re_dfa_t *dfa);

/* Analysis of the parsed tree.  */
static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
                                             int top_clone_node, int root_node,
                                             unsigned int constraint);
static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
                                     unsigned int constraint);
static int search_duplicated_node (re_dfa_t *dfa, int org_node,
                                   unsigned int constraint);
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
                                         int node, int root);
static void calc_inveclosure (re_dfa_t *dfa);

/* Tokenizer.  */
static int fetch_number (re_string_t *input, re_token_t *token,
                         reg_syntax_t syntax);
static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
static int peek_token (re_token_t *token, re_string_t *input,
                        reg_syntax_t syntax);
static int peek_token_bracket (re_token_t *token, re_string_t *input,
                               reg_syntax_t syntax);

/* Parser.  */
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
                          reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
                                  re_token_t *token, reg_syntax_t syntax,
                                  int nest, reg_errcode_t *err);
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
                                 re_token_t *token, reg_syntax_t syntax,
                                 int nest, reg_errcode_t *err);
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
                                     re_token_t *token, reg_syntax_t syntax,
                                     int nest, reg_errcode_t *err);
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
                                  re_token_t *token, reg_syntax_t syntax,
                                  int nest, reg_errcode_t *err);
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
                                 re_dfa_t *dfa, re_token_t *token,
                                 reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
                                      re_token_t *token, reg_syntax_t syntax,
                                      reg_errcode_t *err);
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
                                            re_string_t *regexp,
                                            re_token_t *token, int token_len,
                                            re_dfa_t *dfa,
                                            reg_syntax_t syntax);
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
                                          re_string_t *regexp,
                                          re_token_t *token);

/* Bracket-expression builders.  Their signatures depend on whether
   multibyte support (RE_ENABLE_I18N) is compiled in.  */
#ifndef _LIBC
# ifdef RE_ENABLE_I18N
static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
                                      re_charset_t *mbcset, int *range_alloc,
                                      bracket_elem_t *start_elem,
                                      bracket_elem_t *end_elem);
static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
                                             re_charset_t *mbcset,
                                             int *coll_sym_alloc,
                                             const unsigned char *name);
# else /* not RE_ENABLE_I18N */
static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
                                      bracket_elem_t *start_elem,
                                      bracket_elem_t *end_elem);
static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
                                             const unsigned char *name);
# endif /* not RE_ENABLE_I18N */
#endif /* not _LIBC */
#ifdef RE_ENABLE_I18N
static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
                                        re_charset_t *mbcset,
                                        int *equiv_class_alloc,
                                        const unsigned char *name);
static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
                                      re_charset_t *mbcset,
                                      int *char_class_alloc,
                                      const unsigned char *class_name,
                                      reg_syntax_t syntax);
#else  /* not RE_ENABLE_I18N */
static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
                                        const unsigned char *name);
static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
                                      const unsigned char *class_name,
                                      reg_syntax_t syntax);
#endif /* not RE_ENABLE_I18N */

/* Parse-tree construction and destruction.  */
static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
static void free_bin_tree (bin_tree_t *tree);
static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
                                re_token_type_t type, int index);
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 const char __re_error_msgid[] attribute_hidden =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 const size_t __re_error_msgid_idx[] attribute_hidden =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub.  */
 225   bufp->no_sub = 0;
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
/* The regexp syntax to recognize; it is the syntax re_compile_pattern
   and re_comp hand to the compiler.  Set by `re_set_syntax', but it can
   also be assigned to arbitrarily: each pattern buffer stores its own
   syntax, so it can be changed between regex compilations.

   This has no initializer because initialized variables in Emacs
   become read-only after dumping.  */
reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (syntax)
 257     reg_syntax_t syntax;
 258 {
 259   reg_syntax_t ret = re_syntax_options;
 260
 261   re_syntax_options = syntax;
 262   return ret;
 263 }
 264 #ifdef _LIBC
 265 weak_alias (__re_set_syntax, re_set_syntax)
 266 #endif
 267
 268 int
 269 re_compile_fastmap (bufp)
 270     struct re_pattern_buffer *bufp;
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (bufp, init_state, fastmap)
 303      regex_t *bufp;
 304      const re_dfastate_t *init_state;
 305      char *fastmap;
 306 {
 307   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 308   int node_cnt;
 309   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
 310   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 311     {
 312       int node = init_state->nodes.elems[node_cnt];
 313       re_token_type_t type = dfa->nodes[node].type;
 314
 315       if (type == CHARACTER)
 316         re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317       else if (type == SIMPLE_BRACKET)
 318         {
 319           int i, j, ch;
 320           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 321             for (j = 0; j < UINT_BITS; ++j, ++ch)
 322               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 323                 re_set_fastmap (fastmap, icase, ch);
 324         }
 325 #ifdef RE_ENABLE_I18N
 326       else if (type == COMPLEX_BRACKET)
 327         {
 328           int i;
 329           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 330           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 331               || cset->nranges || cset->nchar_classes)
 332             {
 333 # ifdef _LIBC
 334               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 335                 {
 336                   /* In this case we want to catch the bytes which are
 337                      the first byte of any collation elements.
 338                      e.g. In da_DK, we want to catch 'a' since "aa"
 339                           is a valid collation element, and don't catch
 340                           'b' since 'b' is the only collation element
 341                           which starts from 'b'.  */
 342                   int j, ch;
 343                   const int32_t *table = (const int32_t *)
 344                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 345                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 346                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 347                       if (table[ch] < 0)
 348                         re_set_fastmap (fastmap, icase, ch);
 349                 }
 350 # else
 351               if (MB_CUR_MAX > 1)
 352                 for (i = 0; i < SBC_MAX; ++i)
 353                   if (__btowc (i) == WEOF)
 354                     re_set_fastmap (fastmap, icase, i);
 355 # endif /* not _LIBC */
 356             }
 357           for (i = 0; i < cset->nmbchars; ++i)
 358             {
 359               char buf[256];
 360               mbstate_t state;
 361               memset (&state, '\0', sizeof (state));
 362               __wcrtomb (buf, cset->mbchars[i], &state);
 363               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 364             }
 365         }
 366 #endif /* RE_ENABLE_I18N */
 367       else if (type == END_OF_RE || type == OP_PERIOD)
 368         {
 369           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 370           if (type == END_OF_RE)
 371             bufp->can_be_null = 1;
 372           return;
 373         }
 374     }
 375 }
 376 \f
 377 /* Entry point for POSIX code.  */
 378 /* regcomp takes a regular expression as a string and compiles it.
 379
 380    PREG is a regex_t *.  We do not expect any fields to be initialized,
 381    since POSIX says we shouldn't.  Thus, we set
 382
 383      `buffer' to the compiled pattern;
 384      `used' to the length of the compiled pattern;
 385      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 386        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 387        RE_SYNTAX_POSIX_BASIC;
 388      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 389      `fastmap' to an allocated space for the fastmap;
 390      `fastmap_accurate' to zero;
 391      `re_nsub' to the number of subexpressions in PATTERN.
 392
 393    PATTERN is the address of the pattern string.
 394
 395    CFLAGS is a series of bits which affect compilation.
 396
 397      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 398      use POSIX basic syntax.
 399
 400      If REG_NEWLINE is set, then . and [^...] don't match newline.
 401      Also, regexec will try a match beginning after every newline.
 402
 403      If REG_ICASE is set, then we considers upper- and lowercase
 404      versions of letters to be equivalent when matching.
 405
 406      If REG_NOSUB is set, then when PREG is passed to regexec, that
 407      routine will report only success or failure, and nothing about the
 408      registers.
 409
 410    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 411    the return codes and their meanings.)  */
 412
 413 int
 414 regcomp (preg, pattern, cflags)
 415     regex_t *__restrict preg;
 416     const char *__restrict pattern;
 417     int cflags;
 418 {
 419   reg_errcode_t ret;
 420   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 421                          : RE_SYNTAX_POSIX_BASIC);
 422
 423   preg->buffer = NULL;
 424   preg->allocated = 0;
 425   preg->used = 0;
 426
 427   /* Try to allocate space for the fastmap.  */
 428   preg->fastmap = re_malloc (char, SBC_MAX);
 429   if (BE (preg->fastmap == NULL, 0))
 430     return REG_ESPACE;
 431
 432   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 433
 434   /* If REG_NEWLINE is set, newlines are treated differently.  */
 435   if (cflags & REG_NEWLINE)
 436     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 437       syntax &= ~RE_DOT_NEWLINE;
 438       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 439       /* It also changes the matching behavior.  */
 440       preg->newline_anchor = 1;
 441     }
 442   else
 443     preg->newline_anchor = 0;
 444   preg->no_sub = !!(cflags & REG_NOSUB);
 445   preg->translate = NULL;
 446
 447   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 448
 449   /* POSIX doesn't distinguish between an unmatched open-group and an
 450      unmatched close-group: both are REG_EPAREN.  */
 451   if (ret == REG_ERPAREN)
 452     ret = REG_EPAREN;
 453
 454   /* We have already checked preg->fastmap != NULL.  */
 455   if (BE (ret == REG_NOERROR, 1))
 456     /* Compute the fastmap now, since regexec cannot modify the pattern
 457        buffer.  This function nevers fails in this implementation.  */
 458     (void) re_compile_fastmap (preg);
 459   else
 460     {
 461       /* Some error occurred while compiling the expression.  */
 462       re_free (preg->fastmap);
 463       preg->fastmap = NULL;
 464     }
 465
 466   return (int) ret;
 467 }
 468 #ifdef _LIBC
 469 weak_alias (__regcomp, regcomp)
 470 #endif
 471
 472 /* Returns a message corresponding to an error code, ERRCODE, returned
 473    from either regcomp or regexec.   We don't use PREG here.  */
 474
 475 size_t
 476 regerror (errcode, preg, errbuf, errbuf_size)
 477     int errcode;
 478     const regex_t *preg;
 479     char *errbuf;
 480     size_t errbuf_size;
 481 {
 482   const char *msg;
 483   size_t msg_size;
 484
 485   if (BE (errcode < 0
 486           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 487                                / sizeof (__re_error_msgid_idx[0])), 0))
 488     /* Only error codes returned by the rest of the code should be passed
 489        to this routine.  If we are given anything else, or if other regex
 490        code generates an invalid error code, then the program has a bug.
 491        Dump core so we can fix it.  */
 492     abort ();
 493
 494   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 495
 496   msg_size = strlen (msg) + 1; /* Includes the null.  */
 497
 498   if (BE (errbuf_size != 0, 1))
 499     {
 500       if (BE (msg_size > errbuf_size, 0))
 501         {
 502 #if defined HAVE_MEMPCPY || defined _LIBC
 503           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 504 #else
 505           memcpy (errbuf, msg, errbuf_size - 1);
 506           errbuf[errbuf_size - 1] = 0;
 507 #endif
 508         }
 509       else
 510         memcpy (errbuf, msg, msg_size);
 511     }
 512
 513   return msg_size;
 514 }
 515 #ifdef _LIBC
 516 weak_alias (__regerror, regerror)
 517 #endif
 518
 519
 520 static void
 521 free_dfa_content (re_dfa_t *dfa)
 522 {
 523   int i, j;
 524
 525   re_free (dfa->subexps);
 526
 527   for (i = 0; i < dfa->nodes_len; ++i)
 528     {
 529       re_token_t *node = dfa->nodes + i;
 530 #ifdef RE_ENABLE_I18N
 531       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 532         free_charset (node->opr.mbcset);
 533       else
 534 #endif /* RE_ENABLE_I18N */
 535         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 536           re_free (node->opr.sbcset);
 537     }
 538   re_free (dfa->nexts);
 539   for (i = 0; i < dfa->nodes_len; ++i)
 540     {
 541       if (dfa->eclosures != NULL)
 542         re_node_set_free (dfa->eclosures + i);
 543       if (dfa->inveclosures != NULL)
 544         re_node_set_free (dfa->inveclosures + i);
 545       if (dfa->edests != NULL)
 546         re_node_set_free (dfa->edests + i);
 547     }
 548   re_free (dfa->edests);
 549   re_free (dfa->eclosures);
 550   re_free (dfa->inveclosures);
 551   re_free (dfa->nodes);
 552
 553   for (i = 0; i <= dfa->state_hash_mask; ++i)
 554     {
 555       struct re_state_table_entry *entry = dfa->state_table + i;
 556       for (j = 0; j < entry->num; ++j)
 557         {
 558           re_dfastate_t *state = entry->array[j];
 559           free_state (state);
 560         }
 561       re_free (entry->array);
 562     }
 563   re_free (dfa->state_table);
 564
 565   if (dfa->word_char != NULL)
 566     re_free (dfa->word_char);
 567 #ifdef DEBUG
 568   re_free (dfa->re_str);
 569 #endif
 570
 571   re_free (dfa);
 572 }
 573
 574
 575 /* Free dynamically allocated space used by PREG.  */
 576
 577 void
 578 regfree (preg)
 579     regex_t *preg;
 580 {
 581   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 582   if (BE (dfa != NULL, 1))
 583     free_dfa_content (dfa);
 584
 585   re_free (preg->fastmap);
 586 }
 587 #ifdef _LIBC
 588 weak_alias (__regfree, regfree)
 589 #endif
 590 \f
 591 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 592    them unless specifically requested.  */
 593
 594 #if defined _REGEX_RE_COMP || defined _LIBC
 595
 596 /* BSD has one and only one pattern buffer.  */
 597 static struct re_pattern_buffer re_comp_buf;
 598
 599 char *
 600 # ifdef _LIBC
 601 /* Make these definitions weak in libc, so POSIX programs can redefine
 602    these names if they don't use our functions, and still use
 603    regcomp/regexec above without link errors.  */
 604 weak_function
 605 # endif
 606 re_comp (s)
 607      const char *s;
 608 {
 609   reg_errcode_t ret;
 610   char *fastmap;
 611
 612   if (!s)
 613     {
 614       if (!re_comp_buf.buffer)
 615         return gettext ("No previous regular expression");
 616       return 0;
 617     }
 618
 619   if (re_comp_buf.buffer)
 620     {
 621       fastmap = re_comp_buf.fastmap;
 622       re_comp_buf.fastmap = NULL;
 623       __regfree (&re_comp_buf);
 624       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 625       re_comp_buf.fastmap = fastmap;
 626     }
 627
 628   if (re_comp_buf.fastmap == NULL)
 629     {
 630       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 631       if (re_comp_buf.fastmap == NULL)
 632         return (char *) gettext (__re_error_msgid
 633                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 634     }
 635
 636   /* Since `re_exec' always passes NULL for the `regs' argument, we
 637      don't need to initialize the pattern buffer fields which affect it.  */
 638
 639   /* Match anchors at newlines.  */
 640   re_comp_buf.newline_anchor = 1;
 641
 642   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 643
 644   if (!ret)
 645     return NULL;
 646
 647   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 648   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 649 }
 650
#ifdef _LIBC
/* Cleanup hook defined through libc_freeres_fn: hands the shared BSD
   pattern buffer to __regfree so that whatever it still holds is
   released.  NOTE(review): presumably run when libc frees its internal
   resources -- confirm against the libc_freeres_fn definition.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif
 657
 658 #endif /* _REGEX_RE_COMP */
 659 \f
 660 /* Internal entry point.
 661    Compile the regular expression PATTERN, whose length is LENGTH.
 662    SYNTAX indicate regular expression's syntax.  */
 663
 664 static reg_errcode_t
 665 re_compile_internal (preg, pattern, length, syntax)
 666      regex_t *preg;
 667      const char * pattern;
 668      int length;
 669      reg_syntax_t syntax;
 670 {
 671   reg_errcode_t err = REG_NOERROR;
 672   re_dfa_t *dfa;
 673   re_string_t regexp;
 674
 675   /* Initialize the pattern buffer.  */
 676   preg->fastmap_accurate = 0;
 677   preg->syntax = syntax;
 678   preg->not_bol = preg->not_eol = 0;
 679   preg->used = 0;
 680   preg->re_nsub = 0;
 681   preg->can_be_null = 0;
 682   preg->regs_allocated = REGS_UNALLOCATED;
 683
 684   /* Initialize the dfa.  */
 685   dfa = (re_dfa_t *) preg->buffer;
 686   if (preg->allocated < sizeof (re_dfa_t))
 687     {
 688       /* If zero allocated, but buffer is non-null, try to realloc
 689          enough space.  This loses if buffer's address is bogus, but
 690          that is the user's responsibility.  If ->buffer is NULL this
 691          is a simple allocation.  */
 692       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 693       if (dfa == NULL)
 694         return REG_ESPACE;
 695       preg->allocated = sizeof (re_dfa_t);
 696     }
 697   preg->buffer = (unsigned char *) dfa;
 698   preg->used = sizeof (re_dfa_t);
 699
 700   err = init_dfa (dfa, length);
 701   if (BE (err != REG_NOERROR, 0))
 702     {
 703       re_free (dfa);
 704       preg->buffer = NULL;
 705       preg->allocated = 0;
 706       return err;
 707     }
 708 #ifdef DEBUG
 709   dfa->re_str = re_malloc (char, length + 1);
 710   strncpy (dfa->re_str, pattern, length + 1);
 711 #endif
 712
 713   err = re_string_construct (&regexp, pattern, length, preg->translate,
 714                              syntax & RE_ICASE);
 715   if (BE (err != REG_NOERROR, 0))
 716     {
 717       re_free (dfa);
 718       preg->buffer = NULL;
 719       preg->allocated = 0;
 720       return err;
 721     }
 722
 723   /* Parse the regular expression, and build a structure tree.  */
 724   preg->re_nsub = 0;
 725   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 726   if (BE (dfa->str_tree == NULL, 0))
 727     goto re_compile_internal_free_return;
 728
 729   /* Analyze the tree and collect information which is necessary to
 730      create the dfa.  */
 731   err = analyze (dfa);
 732   if (BE (err != REG_NOERROR, 0))
 733     goto re_compile_internal_free_return;
 734
 735   /* Then create the initial state of the dfa.  */
 736   err = create_initial_state (dfa);
 737
 738   /* Release work areas.  */
 739   free_workarea_compile (preg);
 740   re_string_destruct (&regexp);
 741
 742   if (BE (err != REG_NOERROR, 0))
 743     {
 744     re_compile_internal_free_return:
 745       free_dfa_content (dfa);
 746       preg->buffer = NULL;
 747       preg->allocated = 0;
 748     }
 749
 750   return err;
 751 }
 752
 753 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 754    as the initial length of some arrays.  */
 755
 756 static reg_errcode_t
 757 init_dfa (dfa, pat_len)
 758      re_dfa_t *dfa;
 759      int pat_len;
 760 {
 761   int table_size;
 762
 763   memset (dfa, '\0', sizeof (re_dfa_t));
 764
 765   dfa->nodes_alloc = pat_len + 1;
 766   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 767
 768   dfa->states_alloc = pat_len + 1;
 769
 770   /*  table_size = 2 ^ ceil(log pat_len) */
 771   for (table_size = 1; table_size > 0; table_size <<= 1)
 772     if (table_size > pat_len)
 773       break;
 774
 775   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 776   dfa->state_hash_mask = table_size - 1;
 777
 778   dfa->subexps_alloc = 1;
 779   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 780   dfa->word_char = NULL;
 781
 782   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 783           || dfa->subexps == NULL, 0))
 784     {
 785       /* We don't bother to free anything which was allocated.  Very
 786          soon the process will go down anyway.  */
 787       dfa->subexps = NULL;
 788       dfa->state_table = NULL;
 789       dfa->nodes = NULL;
 790       return REG_ESPACE;
 791     }
 792   return REG_NOERROR;
 793 }
 794
 795 /* Initialize WORD_CHAR table, which indicate which character is
 796    "word".  In this case "word" means that it is the word construction
 797    character used by some operators like "\<", "\>", etc.  */
 798
 799 static reg_errcode_t
 800 init_word_char (dfa)
 801      re_dfa_t *dfa;
 802 {
 803   int i, j, ch;
 804   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 805   if (BE (dfa->word_char == NULL, 0))
 806     return REG_ESPACE;
 807   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 808     for (j = 0; j < UINT_BITS; ++j, ++ch)
 809       if (isalnum (ch) || ch == '_')
 810         dfa->word_char[i] |= 1 << j;
 811   return REG_NOERROR;
 812 }
 813
 814 /* Free the work area which are only used while compiling.  */
 815
 816 static void
 817 free_workarea_compile (preg)
 818      regex_t *preg;
 819 {
 820   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 821   free_bin_tree (dfa->str_tree);
 822   dfa->str_tree = NULL;
 823   re_free (dfa->org_indices);
 824   dfa->org_indices = NULL;
 825 }
 826
 827 /* Create initial states for all contexts.  */
 828
 829 static reg_errcode_t
 830 create_initial_state (dfa)
 831      re_dfa_t *dfa;
 832 {
 833   int first, i;
 834   reg_errcode_t err;
 835   re_node_set init_nodes;
 836
 837   /* Initial states have the epsilon closure of the node which is
 838      the first node of the regular expression.  */
 839   first = dfa->str_tree->first;
 840   dfa->init_node = first;
 841   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 842   if (BE (err != REG_NOERROR, 0))
 843     return err;
 844
 845   /* The back-references which are in initial states can epsilon transit,
 846      since in this case all of the subexpressions can be null.
 847      Then we add epsilon closures of the nodes which are the next nodes of
 848      the back-references.  */
 849   if (dfa->nbackref > 0)
 850     for (i = 0; i < init_nodes.nelem; ++i)
 851       {
 852         int node_idx = init_nodes.elems[i];
 853         re_token_type_t type = dfa->nodes[node_idx].type;
 854
 855         int clexp_idx;
 856         if (type != OP_BACK_REF)
 857           continue;
 858         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 859           {
 860             re_token_t *clexp_node;
 861             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 862             if (clexp_node->type == OP_CLOSE_SUBEXP
 863                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 864               break;
 865           }
 866         if (clexp_idx == init_nodes.nelem)
 867           continue;
 868
 869         if (type == OP_BACK_REF)
 870           {
 871             int dest_idx = dfa->edests[node_idx].elems[0];
 872             if (!re_node_set_contains (&init_nodes, dest_idx))
 873               {
 874                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 875                 i = 0;
 876               }
 877           }
 878       }
 879
 880   /* It must be the first time to invoke acquire_state.  */
 881   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 882   /* We don't check ERR here, since the initial state must not be NULL.  */
 883   if (BE (dfa->init_state == NULL, 0))
 884     return err;
 885   if (dfa->init_state->has_constraint)
 886     {
 887       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 888                                                        CONTEXT_WORD);
 889       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 890                                                      CONTEXT_NEWLINE);
 891       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 892                                                          &init_nodes,
 893                                                          CONTEXT_NEWLINE
 894                                                          | CONTEXT_BEGBUF);
 895       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 896               || dfa->init_state_begbuf == NULL, 0))
 897         return err;
 898     }
 899   else
 900     dfa->init_state_word = dfa->init_state_nl
 901       = dfa->init_state_begbuf = dfa->init_state;
 902
 903   re_node_set_free (&init_nodes);
 904   return REG_NOERROR;
 905 }
 906 \f
 907 /* Analyze the structure tree, and calculate "first", "next", "edest",
 908    "eclosure", and "inveclosure".  */
 909
 910 static reg_errcode_t
 911 analyze (dfa)
 912      re_dfa_t *dfa;
 913 {
 914   int i;
 915   reg_errcode_t ret;
 916
 917   /* Allocate arrays.  */
 918   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 919   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
 920   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 921   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 922   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 923   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
 924           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 925     return REG_ESPACE;
 926   /* Initialize them.  */
 927   for (i = 0; i < dfa->nodes_len; ++i)
 928     {
 929       dfa->nexts[i] = -1;
 930       re_node_set_init_empty (dfa->edests + i);
 931       re_node_set_init_empty (dfa->eclosures + i);
 932       re_node_set_init_empty (dfa->inveclosures + i);
 933     }
 934
 935   ret = analyze_tree (dfa, dfa->str_tree);
 936   if (BE (ret == REG_NOERROR, 1))
 937     {
 938       ret = calc_eclosure (dfa);
 939       if (ret == REG_NOERROR)
 940         calc_inveclosure (dfa);
 941     }
 942   return ret;
 943 }
 944
 945 /* Helper functions for analyze.
 946    This function calculate "first", "next", and "edest" for the subtree
 947    whose root is NODE.  */
 948
 949 static reg_errcode_t
 950 analyze_tree (dfa, node)
 951      re_dfa_t *dfa;
 952      bin_tree_t *node;
 953 {
 954   reg_errcode_t ret;
 955   if (node->first == -1)
 956     calc_first (dfa, node);
 957   if (node->next == -1)
 958     calc_next (dfa, node);
 959   if (node->eclosure.nelem == 0)
 960     calc_epsdest (dfa, node);
 961   /* Calculate "first" etc. for the left child.  */
 962   if (node->left != NULL)
 963     {
 964       ret = analyze_tree (dfa, node->left);
 965       if (BE (ret != REG_NOERROR, 0))
 966         return ret;
 967     }
 968   /* Calculate "first" etc. for the right child.  */
 969   if (node->right != NULL)
 970     {
 971       ret = analyze_tree (dfa, node->right);
 972       if (BE (ret != REG_NOERROR, 0))
 973         return ret;
 974     }
 975   return REG_NOERROR;
 976 }
 977
 978 /* Calculate "first" for the node NODE.  */
 979 static void
 980 calc_first (dfa, node)
 981      re_dfa_t *dfa;
 982      bin_tree_t *node;
 983 {
 984   int idx, type;
 985   idx = node->node_idx;
 986   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
 987
 988   switch (type)
 989     {
 990 #ifdef DEBUG
 991     case OP_OPEN_BRACKET:
 992     case OP_CLOSE_BRACKET:
 993     case OP_OPEN_DUP_NUM:
 994     case OP_CLOSE_DUP_NUM:
 995     case OP_NON_MATCH_LIST:
 996     case OP_OPEN_COLL_ELEM:
 997     case OP_CLOSE_COLL_ELEM:
 998     case OP_OPEN_EQUIV_CLASS:
 999     case OP_CLOSE_EQUIV_CLASS:
1000     case OP_OPEN_CHAR_CLASS:
1001     case OP_CLOSE_CHAR_CLASS:
1002       /* These must not be appeared here.  */
1003       assert (0);
1004 #endif
1005     case END_OF_RE:
1006     case CHARACTER:
1007     case OP_PERIOD:
1008     case OP_DUP_ASTERISK:
1009     case OP_DUP_QUESTION:
1010 #ifdef RE_ENABLE_I18N
1011     case COMPLEX_BRACKET:
1012 #endif /* RE_ENABLE_I18N */
1013     case SIMPLE_BRACKET:
1014     case OP_BACK_REF:
1015     case ANCHOR:
1016     case OP_OPEN_SUBEXP:
1017     case OP_CLOSE_SUBEXP:
1018       node->first = idx;
1019       break;
1020     case OP_DUP_PLUS:
1021 #ifdef DEBUG
1022       assert (node->left != NULL);
1023 #endif
1024       if (node->left->first == -1)
1025         calc_first (dfa, node->left);
1026       node->first = node->left->first;
1027       break;
1028     case OP_ALT:
1029       node->first = idx;
1030       break;
1031       /* else fall through */
1032     default:
1033 #ifdef DEBUG
1034       assert (node->left != NULL);
1035 #endif
1036       if (node->left->first == -1)
1037         calc_first (dfa, node->left);
1038       node->first = node->left->first;
1039       break;
1040     }
1041 }
1042
1043 /* Calculate "next" for the node NODE.  */
1044
1045 static void
1046 calc_next (dfa, node)
1047      re_dfa_t *dfa;
1048      bin_tree_t *node;
1049 {
1050   int idx, type;
1051   bin_tree_t *parent = node->parent;
1052   if (parent == NULL)
1053     {
1054       node->next = -1;
1055       idx = node->node_idx;
1056       if (node->type == 0)
1057         dfa->nexts[idx] = node->next;
1058       return;
1059     }
1060
1061   idx = parent->node_idx;
1062   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1063
1064   switch (type)
1065     {
1066     case OP_DUP_ASTERISK:
1067     case OP_DUP_PLUS:
1068       node->next = idx;
1069       break;
1070     case CONCAT:
1071       if (parent->left == node)
1072         {
1073           if (parent->right->first == -1)
1074             calc_first (dfa, parent->right);
1075           node->next = parent->right->first;
1076           break;
1077         }
1078       /* else fall through */
1079     default:
1080       if (parent->next == -1)
1081         calc_next (dfa, parent);
1082       node->next = parent->next;
1083       break;
1084     }
1085   idx = node->node_idx;
1086   if (node->type == 0)
1087     dfa->nexts[idx] = node->next;
1088 }
1089
1090 /* Calculate "edest" for the node NODE.  */
1091
1092 static void
1093 calc_epsdest (dfa, node)
1094      re_dfa_t *dfa;
1095      bin_tree_t *node;
1096 {
1097   int idx;
1098   idx = node->node_idx;
1099   if (node->type == 0)
1100     {
1101       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1102           || dfa->nodes[idx].type == OP_DUP_PLUS
1103           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1104         {
1105           if (node->left->first == -1)
1106             calc_first (dfa, node->left);
1107           if (node->next == -1)
1108             calc_next (dfa, node);
1109           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1110                               node->next);
1111         }
1112       else if (dfa->nodes[idx].type == OP_ALT)
1113         {
1114           int left, right;
1115           if (node->left != NULL)
1116             {
1117               if (node->left->first == -1)
1118                 calc_first (dfa, node->left);
1119               left = node->left->first;
1120             }
1121           else
1122             {
1123               if (node->next == -1)
1124                 calc_next (dfa, node);
1125               left = node->next;
1126             }
1127           if (node->right != NULL)
1128             {
1129               if (node->right->first == -1)
1130                 calc_first (dfa, node->right);
1131               right = node->right->first;
1132             }
1133           else
1134             {
1135               if (node->next == -1)
1136                 calc_next (dfa, node);
1137               right = node->next;
1138             }
1139           re_node_set_init_2 (dfa->edests + idx, left, right);
1140         }
1141       else if (dfa->nodes[idx].type == ANCHOR
1142                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1143                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1144                || dfa->nodes[idx].type == OP_BACK_REF)
1145         re_node_set_init_1 (dfa->edests + idx, node->next);
1146     }
1147 }
1148
1149 /* Duplicate the epsilon closure of the node ROOT_NODE.
1150    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1151    to their own constraint.  */
1152
1153 static reg_errcode_t
1154 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1155                         init_constraint)
1156      re_dfa_t *dfa;
1157      int top_org_node, top_clone_node, root_node;
1158      unsigned int init_constraint;
1159 {
1160   reg_errcode_t err;
1161   int org_node, clone_node, ret;
1162   unsigned int constraint = init_constraint;
1163   for (org_node = top_org_node, clone_node = top_clone_node;;)
1164     {
1165       int org_dest, clone_dest;
1166       if (dfa->nodes[org_node].type == OP_BACK_REF)
1167         {
1168           /* If the back reference epsilon-transit, its destination must
1169              also have the constraint.  Then duplicate the epsilon closure
1170              of the destination of the back reference, and store it in
1171              edests of the back reference.  */
1172           org_dest = dfa->nexts[org_node];
1173           re_node_set_empty (dfa->edests + clone_node);
1174           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1175           if (BE (err != REG_NOERROR, 0))
1176             return err;
1177           dfa->nexts[clone_node] = dfa->nexts[org_node];
1178           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1179           if (BE (ret < 0, 0))
1180             return REG_ESPACE;
1181         }
1182       else if (dfa->edests[org_node].nelem == 0)
1183         {
1184           /* In case of the node can't epsilon-transit, don't duplicate the
1185              destination and store the original destination as the
1186              destination of the node.  */
1187           dfa->nexts[clone_node] = dfa->nexts[org_node];
1188           break;
1189         }
1190       else if (dfa->edests[org_node].nelem == 1)
1191         {
1192           /* In case of the node can epsilon-transit, and it has only one
1193              destination.  */
1194           org_dest = dfa->edests[org_node].elems[0];
1195           re_node_set_empty (dfa->edests + clone_node);
1196           if (dfa->nodes[org_node].type == ANCHOR)
1197             {
1198               /* In case of the node has another constraint, append it.  */
1199               if (org_node == root_node && clone_node != org_node)
1200                 {
1201                   /* ...but if the node is root_node itself, it means the
1202                      epsilon closure have a loop, then tie it to the
1203                      destination of the root_node.  */
1204                   ret = re_node_set_insert (dfa->edests + clone_node,
1205                                             org_dest);
1206                   if (BE (ret < 0, 0))
1207                     return REG_ESPACE;
1208                   break;
1209                 }
1210               constraint |= dfa->nodes[org_node].opr.ctx_type;
1211             }
1212           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1213           if (BE (err != REG_NOERROR, 0))
1214             return err;
1215           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216           if (BE (ret < 0, 0))
1217             return REG_ESPACE;
1218         }
1219       else /* dfa->edests[org_node].nelem == 2 */
1220         {
1221           /* In case of the node can epsilon-transit, and it has two
1222              destinations. E.g. '|', '*', '+', '?'.   */
1223           org_dest = dfa->edests[org_node].elems[0];
1224           re_node_set_empty (dfa->edests + clone_node);
1225           /* Search for a duplicated node which satisfies the constraint.  */
1226           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1227           if (clone_dest == -1)
1228             {
1229               /* There are no such a duplicated node, create a new one.  */
1230               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1231               if (BE (err != REG_NOERROR, 0))
1232                 return err;
1233               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1234               if (BE (ret < 0, 0))
1235                 return REG_ESPACE;
1236               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1237                                             root_node, constraint);
1238               if (BE (err != REG_NOERROR, 0))
1239                 return err;
1240             }
1241           else
1242             {
1243               /* There are a duplicated node which satisfy the constraint,
1244                  use it to avoid infinite loop.  */
1245               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1246               if (BE (ret < 0, 0))
1247                 return REG_ESPACE;
1248             }
1249
1250           org_dest = dfa->edests[org_node].elems[1];
1251           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252           if (BE (err != REG_NOERROR, 0))
1253             return err;
1254           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255           if (BE (ret < 0, 0))
1256             return REG_ESPACE;
1257         }
1258       org_node = org_dest;
1259       clone_node = clone_dest;
1260     }
1261   return REG_NOERROR;
1262 }
1263
1264 /* Search for a node which is duplicated from the node ORG_NODE, and
1265    satisfies the constraint CONSTRAINT.  */
1266
1267 static int
1268 search_duplicated_node (dfa, org_node, constraint)
1269      re_dfa_t *dfa;
1270      int org_node;
1271      unsigned int constraint;
1272 {
1273   int idx;
1274   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1275     {
1276       if (org_node == dfa->org_indices[idx]
1277           && constraint == dfa->nodes[idx].constraint)
1278         return idx; /* Found.  */
1279     }
1280   return -1; /* Not found.  */
1281 }
1282
1283 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1284    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1285    otherwise return the error code.  */
1286
1287 static reg_errcode_t
1288 duplicate_node (new_idx, dfa, org_idx, constraint)
1289      re_dfa_t *dfa;
1290      int *new_idx, org_idx;
1291      unsigned int constraint;
1292 {
1293   re_token_t dup;
1294   int dup_idx;
1295
1296   dup = dfa->nodes[org_idx];
1297   dup_idx = re_dfa_add_node (dfa, dup, 1);
1298   if (BE (dup_idx == -1, 0))
1299     return REG_ESPACE;
1300   dfa->nodes[dup_idx].constraint = constraint;
1301   if (dfa->nodes[org_idx].type == ANCHOR)
1302     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1303   dfa->nodes[dup_idx].duplicated = 1;
1304   re_node_set_init_empty (dfa->edests + dup_idx);
1305   re_node_set_init_empty (dfa->eclosures + dup_idx);
1306   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1307
1308   /* Store the index of the original node.  */
1309   dfa->org_indices[dup_idx] = org_idx;
1310   *new_idx = dup_idx;
1311   return REG_NOERROR;
1312 }
1313
1314 static void
1315 calc_inveclosure (dfa)
1316      re_dfa_t *dfa;
1317 {
1318   int src, idx, dest;
1319   for (src = 0; src < dfa->nodes_len; ++src)
1320     {
1321       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1322         {
1323           dest = dfa->eclosures[src].elems[idx];
1324           re_node_set_insert (dfa->inveclosures + dest, src);
1325         }
1326     }
1327 }
1328
1329 /* Calculate "eclosure" for all the node in DFA.  */
1330
1331 static reg_errcode_t
1332 calc_eclosure (dfa)
1333      re_dfa_t *dfa;
1334 {
1335   int node_idx, incomplete;
1336 #ifdef DEBUG
1337   assert (dfa->nodes_len > 0);
1338 #endif
1339   incomplete = 0;
1340   /* For each nodes, calculate epsilon closure.  */
1341   for (node_idx = 0; ; ++node_idx)
1342     {
1343       reg_errcode_t err;
1344       re_node_set eclosure_elem;
1345       if (node_idx == dfa->nodes_len)
1346         {
1347           if (!incomplete)
1348             break;
1349           incomplete = 0;
1350           node_idx = 0;
1351         }
1352
1353 #ifdef DEBUG
1354       assert (dfa->eclosures[node_idx].nelem != -1);
1355 #endif
1356       /* If we have already calculated, skip it.  */
1357       if (dfa->eclosures[node_idx].nelem != 0)
1358         continue;
1359       /* Calculate epsilon closure of `node_idx'.  */
1360       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1361       if (BE (err != REG_NOERROR, 0))
1362         return err;
1363
1364       if (dfa->eclosures[node_idx].nelem == 0)
1365         {
1366           incomplete = 1;
1367           re_node_set_free (&eclosure_elem);
1368         }
1369     }
1370   return REG_NOERROR;
1371 }
1372
1373 /* Calculate epsilon closure of NODE.  */
1374
1375 static reg_errcode_t
1376 calc_eclosure_iter (new_set, dfa, node, root)
1377      re_node_set *new_set;
1378      re_dfa_t *dfa;
1379      int node, root;
1380 {
1381   reg_errcode_t err;
1382   unsigned int constraint;
1383   int i, incomplete;
1384   re_node_set eclosure;
1385   incomplete = 0;
1386   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1387   if (BE (err != REG_NOERROR, 0))
1388     return err;
1389
1390   /* This indicates that we are calculating this node now.
1391      We reference this value to avoid infinite loop.  */
1392   dfa->eclosures[node].nelem = -1;
1393
1394   constraint = ((dfa->nodes[node].type == ANCHOR)
1395                 ? dfa->nodes[node].opr.ctx_type : 0);
1396   /* If the current node has constraints, duplicate all nodes.
1397      Since they must inherit the constraints.  */
1398   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1399     {
1400       int org_node, cur_node;
1401       org_node = cur_node = node;
1402       err = duplicate_node_closure (dfa, node, node, node, constraint);
1403       if (BE (err != REG_NOERROR, 0))
1404         return err;
1405     }
1406
1407   /* Expand each epsilon destination nodes.  */
1408   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1409     for (i = 0; i < dfa->edests[node].nelem; ++i)
1410       {
1411         re_node_set eclosure_elem;
1412         int edest = dfa->edests[node].elems[i];
1413         /* If calculating the epsilon closure of `edest' is in progress,
1414            return intermediate result.  */
1415         if (dfa->eclosures[edest].nelem == -1)
1416           {
1417             incomplete = 1;
1418             continue;
1419           }
1420         /* If we haven't calculated the epsilon closure of `edest' yet,
1421            calculate now. Otherwise use calculated epsilon closure.  */
1422         if (dfa->eclosures[edest].nelem == 0)
1423           {
1424             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1425             if (BE (err != REG_NOERROR, 0))
1426               return err;
1427           }
1428         else
1429           eclosure_elem = dfa->eclosures[edest];
1430         /* Merge the epsilon closure of `edest'.  */
1431         re_node_set_merge (&eclosure, &eclosure_elem);
1432         /* If the epsilon closure of `edest' is incomplete,
1433            the epsilon closure of this node is also incomplete.  */
1434         if (dfa->eclosures[edest].nelem == 0)
1435           {
1436             incomplete = 1;
1437             re_node_set_free (&eclosure_elem);
1438           }
1439       }
1440
1441   /* Epsilon closures include itself.  */
1442   re_node_set_insert (&eclosure, node);
1443   if (incomplete && !root)
1444     dfa->eclosures[node].nelem = 0;
1445   else
1446     dfa->eclosures[node] = eclosure;
1447   *new_set = eclosure;
1448   return REG_NOERROR;
1449 }
1450 \f
1451 /* Functions for token which are used in the parser.  */
1452
1453 /* Fetch a token from INPUT.
1454    We must not use this function inside bracket expressions.  */
1455
1456 static re_token_t
1457 fetch_token (input, syntax)
1458      re_string_t *input;
1459      reg_syntax_t syntax;
1460 {
1461   re_token_t token;
1462   int consumed_byte;
1463   consumed_byte = peek_token (&token, input, syntax);
1464   re_string_skip_bytes (input, consumed_byte);
1465   return token;
1466 }
1467
1468 /* Peek a token from INPUT, and return the length of the token.
1469    We must not use this function inside bracket expressions.  */
1470
1471 static int
1472 peek_token (token, input, syntax)
1473      re_token_t *token;
1474      re_string_t *input;
1475      reg_syntax_t syntax;
1476 {
1477   unsigned char c;
1478
1479   if (re_string_eoi (input))
1480     {
1481       token->type = END_OF_RE;
1482       return 0;
1483     }
1484
1485   c = re_string_peek_byte (input, 0);
1486   token->opr.c = c;
1487
1488 #ifdef RE_ENABLE_I18N
1489   token->mb_partial = 0;
1490   if (MB_CUR_MAX > 1 &&
1491       !re_string_first_byte (input, re_string_cur_idx (input)))
1492     {
1493       token->type = CHARACTER;
1494       token->mb_partial = 1;
1495       return 1;
1496     }
1497 #endif
1498   if (c == '\\')
1499     {
1500       unsigned char c2;
1501       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1502         {
1503           token->type = BACK_SLASH;
1504           return 1;
1505         }
1506
1507       c2 = re_string_peek_byte_case (input, 1);
1508       token->opr.c = c2;
1509       token->type = CHARACTER;
1510       switch (c2)
1511         {
1512         case '|':
1513           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514             token->type = OP_ALT;
1515           break;
1516         case '1': case '2': case '3': case '4': case '5':
1517         case '6': case '7': case '8': case '9':
1518           if (!(syntax & RE_NO_BK_REFS))
1519             {
1520               token->type = OP_BACK_REF;
1521               token->opr.idx = c2 - '0';
1522             }
1523           break;
1524         case '<':
1525           if (!(syntax & RE_NO_GNU_OPS))
1526             {
1527               token->type = ANCHOR;
1528               token->opr.idx = WORD_FIRST;
1529             }
1530           break;
1531         case '>':
1532           if (!(syntax & RE_NO_GNU_OPS))
1533             {
1534               token->type = ANCHOR;
1535               token->opr.idx = WORD_LAST;
1536             }
1537           break;
1538         case 'b':
1539           if (!(syntax & RE_NO_GNU_OPS))
1540             {
1541               token->type = ANCHOR;
1542               token->opr.idx = WORD_DELIM;
1543             }
1544           break;
1545         case 'B':
1546           if (!(syntax & RE_NO_GNU_OPS))
1547             {
1548               token->type = ANCHOR;
1549               token->opr.idx = INSIDE_WORD;
1550             }
1551           break;
1552         case 'w':
1553           if (!(syntax & RE_NO_GNU_OPS))
1554             token->type = OP_WORD;
1555           break;
1556         case 'W':
1557           if (!(syntax & RE_NO_GNU_OPS))
1558             token->type = OP_NOTWORD;
1559           break;
1560         case '`':
1561           if (!(syntax & RE_NO_GNU_OPS))
1562             {
1563               token->type = ANCHOR;
1564               token->opr.idx = BUF_FIRST;
1565             }
1566           break;
1567         case '\'':
1568           if (!(syntax & RE_NO_GNU_OPS))
1569             {
1570               token->type = ANCHOR;
1571               token->opr.idx = BUF_LAST;
1572             }
1573           break;
1574         case '(':
1575           if (!(syntax & RE_NO_BK_PARENS))
1576             token->type = OP_OPEN_SUBEXP;
1577           break;
1578         case ')':
1579           if (!(syntax & RE_NO_BK_PARENS))
1580             token->type = OP_CLOSE_SUBEXP;
1581           break;
1582         case '+':
1583           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584             token->type = OP_DUP_PLUS;
1585           break;
1586         case '?':
1587           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588             token->type = OP_DUP_QUESTION;
1589           break;
1590         case '{':
1591           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592             token->type = OP_OPEN_DUP_NUM;
1593           break;
1594         case '}':
1595           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596             token->type = OP_CLOSE_DUP_NUM;
1597           break;
1598         default:
1599           break;
1600         }
1601       return 2;
1602     }
1603
1604   token->type = CHARACTER;
1605   switch (c)
1606     {
1607     case '\n':
1608       if (syntax & RE_NEWLINE_ALT)
1609         token->type = OP_ALT;
1610       break;
1611     case '|':
1612       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613         token->type = OP_ALT;
1614       break;
1615     case '*':
1616       token->type = OP_DUP_ASTERISK;
1617       break;
1618     case '+':
1619       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620         token->type = OP_DUP_PLUS;
1621       break;
1622     case '?':
1623       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624         token->type = OP_DUP_QUESTION;
1625       break;
1626     case '{':
1627       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628         token->type = OP_OPEN_DUP_NUM;
1629       break;
1630     case '}':
1631       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632         token->type = OP_CLOSE_DUP_NUM;
1633       break;
1634     case '(':
1635       if (syntax & RE_NO_BK_PARENS)
1636         token->type = OP_OPEN_SUBEXP;
1637       break;
1638     case ')':
1639       if (syntax & RE_NO_BK_PARENS)
1640         token->type = OP_CLOSE_SUBEXP;
1641       break;
1642     case '[':
1643       token->type = OP_OPEN_BRACKET;
1644       break;
1645     case '.':
1646       token->type = OP_PERIOD;
1647       break;
1648     case '^':
1649       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650           re_string_cur_idx (input) != 0)
1651         {
1652           char prev = re_string_peek_byte (input, -1);
1653           if (prev != '|' && prev != '(' &&
1654               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1655             break;
1656         }
1657       token->type = ANCHOR;
1658       token->opr.idx = LINE_FIRST;
1659       break;
1660     case '$':
1661       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662           re_string_cur_idx (input) + 1 != re_string_length (input))
1663         {
1664           re_token_t next;
1665           re_string_skip_bytes (input, 1);
1666           peek_token (&next, input, syntax);
1667           re_string_skip_bytes (input, -1);
1668           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1669             break;
1670         }
1671       token->type = ANCHOR;
1672       token->opr.idx = LINE_LAST;
1673       break;
1674     default:
1675       break;
1676     }
1677   return 1;
1678 }
1679
1680 /* Peek a token from INPUT, and return the length of the token.
1681    We must not use this function out of bracket expressions.  */
1682
1683 static int
1684 peek_token_bracket (token, input, syntax)
1685      re_token_t *token;
1686      re_string_t *input;
1687      reg_syntax_t syntax;
1688 {
1689   unsigned char c;
1690   if (re_string_eoi (input))
1691     {
1692       token->type = END_OF_RE;
1693       return 0;
1694     }
1695   c = re_string_peek_byte (input, 0);
1696   token->opr.c = c;
1697
1698 #ifdef RE_ENABLE_I18N
1699   if (MB_CUR_MAX > 1 &&
1700       !re_string_first_byte (input, re_string_cur_idx (input)))
1701     {
1702       token->type = CHARACTER;
1703       return 1;
1704     }
1705 #endif /* RE_ENABLE_I18N */
1706
1707   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1708     {
1709       /* In this case, '\' escape a character.  */
1710       unsigned char c2;
1711       re_string_skip_bytes (input, 1);
1712       c2 = re_string_peek_byte (input, 0);
1713       token->opr.c = c2;
1714       token->type = CHARACTER;
1715       return 1;
1716     }
1717   if (c == '[') /* '[' is a special char in a bracket exps.  */
1718     {
1719       unsigned char c2;
1720       int token_len;
1721       c2 = re_string_peek_byte (input, 1);
1722       token->opr.c = c2;
1723       token_len = 2;
1724       switch (c2)
1725         {
1726         case '.':
1727           token->type = OP_OPEN_COLL_ELEM;
1728           break;
1729         case '=':
1730           token->type = OP_OPEN_EQUIV_CLASS;
1731           break;
1732         case ':':
1733           if (syntax & RE_CHAR_CLASSES)
1734             {
1735               token->type = OP_OPEN_CHAR_CLASS;
1736               break;
1737             }
1738           /* else fall through.  */
1739         default:
1740           token->type = CHARACTER;
1741           token->opr.c = c;
1742           token_len = 1;
1743           break;
1744         }
1745       return token_len;
1746     }
1747   switch (c)
1748     {
1749     case '-':
1750       token->type = OP_CHARSET_RANGE;
1751       break;
1752     case ']':
1753       token->type = OP_CLOSE_BRACKET;
1754       break;
1755     case '^':
1756       token->type = OP_NON_MATCH_LIST;
1757       break;
1758     default:
1759       token->type = CHARACTER;
1760     }
1761   return 1;
1762 }
1763 \f
1764 /* Functions for parser.  */
1765
1766 /* Entry point of the parser.
1767    Parse the regular expression REGEXP and return the structure tree.
1768    If an error is occured, ERR is set by error code, and return NULL.
1769    This function build the following tree, from regular expression <reg_exp>:
1770            CAT
1771            / \
1772           /   \
1773    <reg_exp>  EOR
1774
1775    CAT means concatenation.
1776    EOR means end of regular expression.  */
1777
1778 static bin_tree_t *
1779 parse (regexp, preg, syntax, err)
1780      re_string_t *regexp;
1781      regex_t *preg;
1782      reg_syntax_t syntax;
1783      reg_errcode_t *err;
1784 {
1785   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1786   bin_tree_t *tree, *eor, *root;
1787   re_token_t current_token;
1788   int new_idx;
1789   current_token = fetch_token (regexp, syntax);
1790   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1791   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1792     return NULL;
1793   new_idx = re_dfa_add_node (dfa, current_token, 0);
1794   eor = create_tree (NULL, NULL, 0, new_idx);
1795   if (tree != NULL)
1796     root = create_tree (tree, eor, CONCAT, 0);
1797   else
1798     root = eor;
1799   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1800     {
1801       *err = REG_ESPACE;
1802       return NULL;
1803     }
1804   return root;
1805 }
1806
1807 /* This function build the following tree, from regular expression
1808    <branch1>|<branch2>:
1809            ALT
1810            / \
1811           /   \
1812    <branch1> <branch2>
1813
1814    ALT means alternative, which represents the operator `|'.  */
1815
1816 static bin_tree_t *
1817 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1818      re_string_t *regexp;
1819      regex_t *preg;
1820      re_token_t *token;
1821      reg_syntax_t syntax;
1822      int nest;
1823      reg_errcode_t *err;
1824 {
1825   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1826   bin_tree_t *tree, *branch = NULL;
1827   int new_idx;
1828   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1829   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1830     return NULL;
1831
1832   while (token->type == OP_ALT)
1833     {
1834       re_token_t alt_token = *token;
1835       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1836       *token = fetch_token (regexp, syntax);
1837       if (token->type != OP_ALT && token->type != END_OF_RE
1838           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1839         {
1840           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1841           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1842             {
1843               free_bin_tree (tree);
1844               return NULL;
1845             }
1846         }
1847       else
1848         branch = NULL;
1849       tree = create_tree (tree, branch, 0, new_idx);
1850       if (BE (new_idx == -1 || tree == NULL, 0))
1851         {
1852           *err = REG_ESPACE;
1853           return NULL;
1854         }
1855       dfa->has_plural_match = 1;
1856     }
1857   return tree;
1858 }
1859
1860 /* This function build the following tree, from regular expression
1861    <exp1><exp2>:
1862         CAT
1863         / \
1864        /   \
1865    <exp1> <exp2>
1866
1867    CAT means concatenation.  */
1868
1869 static bin_tree_t *
1870 parse_branch (regexp, preg, token, syntax, nest, err)
1871      re_string_t *regexp;
1872      regex_t *preg;
1873      re_token_t *token;
1874      reg_syntax_t syntax;
1875      int nest;
1876      reg_errcode_t *err;
1877 {
1878   bin_tree_t *tree, *exp;
1879   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1880   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1881     return NULL;
1882
1883   while (token->type != OP_ALT && token->type != END_OF_RE
1884          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1885     {
1886       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1887       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1888         {
1889           free_bin_tree (tree);
1890           return NULL;
1891         }
1892       if (tree != NULL && exp != NULL)
1893         {
1894           tree = create_tree (tree, exp, CONCAT, 0);
1895           if (tree == NULL)
1896             {
1897               *err = REG_ESPACE;
1898               return NULL;
1899             }
1900         }
1901       else if (tree == NULL)
1902         tree = exp;
1903       /* Otherwise exp == NULL, we don't need to create new tree.  */
1904     }
1905   return tree;
1906 }
1907
1908 /* This function build the following tree, from regular expression a*:
1909          *
1910          |
1911          a
1912 */
1913
1914 static bin_tree_t *
1915 parse_expression (regexp, preg, token, syntax, nest, err)
1916      re_string_t *regexp;
1917      regex_t *preg;
1918      re_token_t *token;
1919      reg_syntax_t syntax;
1920      int nest;
1921      reg_errcode_t *err;
1922 {
1923   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1924   bin_tree_t *tree;
1925   int new_idx;
1926   switch (token->type)
1927     {
1928     case CHARACTER:
1929       new_idx = re_dfa_add_node (dfa, *token, 0);
1930       tree = create_tree (NULL, NULL, 0, new_idx);
1931       if (BE (new_idx == -1 || tree == NULL, 0))
1932         {
1933           *err = REG_ESPACE;
1934           return NULL;
1935         }
1936 #ifdef RE_ENABLE_I18N
1937       if (MB_CUR_MAX > 1)
1938         {
1939           while (!re_string_eoi (regexp)
1940                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1941             {
1942               bin_tree_t *mbc_remain;
1943               *token = fetch_token (regexp, syntax);
1944               new_idx = re_dfa_add_node (dfa, *token, 0);
1945               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1946               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1947               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1948                 {
1949                   *err = REG_ESPACE;
1950                   return NULL;
1951                 }
1952             }
1953         }
1954 #endif
1955       break;
1956     case OP_OPEN_SUBEXP:
1957       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1959         return NULL;
1960       break;
1961     case OP_OPEN_BRACKET:
1962       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964         return NULL;
1965       break;
1966     case OP_BACK_REF:
1967       if (BE (preg->re_nsub < token->opr.idx
1968               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1969         {
1970           *err = REG_ESUBREG;
1971           return NULL;
1972         }
1973       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1974       new_idx = re_dfa_add_node (dfa, *token, 0);
1975       tree = create_tree (NULL, NULL, 0, new_idx);
1976       if (BE (new_idx == -1 || tree == NULL, 0))
1977         {
1978           *err = REG_ESPACE;
1979           return NULL;
1980         }
1981       ++dfa->nbackref;
1982       dfa->has_mb_node = 1;
1983       break;
1984     case OP_DUP_ASTERISK:
1985     case OP_DUP_PLUS:
1986     case OP_DUP_QUESTION:
1987     case OP_OPEN_DUP_NUM:
1988       if (syntax & RE_CONTEXT_INVALID_OPS)
1989         {
1990           *err = REG_BADRPT;
1991           return NULL;
1992         }
1993       else if (syntax & RE_CONTEXT_INDEP_OPS)
1994         {
1995           *token = fetch_token (regexp, syntax);
1996           return parse_expression (regexp, preg, token, syntax, nest, err);
1997         }
1998       /* else fall through  */
1999     case OP_CLOSE_SUBEXP:
2000       if ((token->type == OP_CLOSE_SUBEXP) &&
2001           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2002         {
2003           *err = REG_ERPAREN;
2004           return NULL;
2005         }
2006       /* else fall through  */
2007     case OP_CLOSE_DUP_NUM:
2008       /* We treat it as a normal character.  */
2009
2010       /* Then we can these characters as normal characters.  */
2011       token->type = CHARACTER;
2012       new_idx = re_dfa_add_node (dfa, *token, 0);
2013       tree = create_tree (NULL, NULL, 0, new_idx);
2014       if (BE (new_idx == -1 || tree == NULL, 0))
2015         {
2016           *err = REG_ESPACE;
2017           return NULL;
2018         }
2019       break;
2020     case ANCHOR:
2021       if (dfa->word_char == NULL)
2022         {
2023           *err = init_word_char (dfa);
2024           if (BE (*err != REG_NOERROR, 0))
2025             return NULL;
2026         }
2027       if (token->opr.ctx_type == WORD_DELIM)
2028         {
2029           bin_tree_t *tree_first, *tree_last;
2030           int idx_first, idx_last;
2031           token->opr.ctx_type = WORD_FIRST;
2032           idx_first = re_dfa_add_node (dfa, *token, 0);
2033           tree_first = create_tree (NULL, NULL, 0, idx_first);
2034           token->opr.ctx_type = WORD_LAST;
2035           idx_last = re_dfa_add_node (dfa, *token, 0);
2036           tree_last = create_tree (NULL, NULL, 0, idx_last);
2037           token->type = OP_ALT;
2038           new_idx = re_dfa_add_node (dfa, *token, 0);
2039           tree = create_tree (tree_first, tree_last, 0, new_idx);
2040           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2041                   || tree_first == NULL || tree_last == NULL
2042                   || tree == NULL, 0))
2043             {
2044               *err = REG_ESPACE;
2045               return NULL;
2046             }
2047         }
2048       else
2049         {
2050           new_idx = re_dfa_add_node (dfa, *token, 0);
2051           tree = create_tree (NULL, NULL, 0, new_idx);
2052           if (BE (new_idx == -1 || tree == NULL, 0))
2053             {
2054               *err = REG_ESPACE;
2055               return NULL;
2056             }
2057         }
2058       /* We must return here, since ANCHORs can't be followed
2059          by repetition operators.
2060          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2061              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2062       *token = fetch_token (regexp, syntax);
2063       return tree;
2064     case OP_PERIOD:
2065       new_idx = re_dfa_add_node (dfa, *token, 0);
2066       tree = create_tree (NULL, NULL, 0, new_idx);
2067       if (BE (new_idx == -1 || tree == NULL, 0))
2068         {
2069           *err = REG_ESPACE;
2070           return NULL;
2071         }
2072       if (MB_CUR_MAX > 1)
2073         dfa->has_mb_node = 1;
2074       break;
2075     case OP_WORD:
2076       tree = build_word_op (dfa, 0, err);
2077       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078         return NULL;
2079       break;
2080     case OP_NOTWORD:
2081       tree = build_word_op (dfa, 1, err);
2082       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083         return NULL;
2084       break;
2085     case OP_ALT:
2086     case END_OF_RE:
2087       return NULL;
2088     case BACK_SLASH:
2089       *err = REG_EESCAPE;
2090       return NULL;
2091     default:
2092       /* Must not happen?  */
2093 #ifdef DEBUG
2094       assert (0);
2095 #endif
2096       return NULL;
2097     }
2098   *token = fetch_token (regexp, syntax);
2099
2100   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2101          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2102     {
2103       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2104       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105         return NULL;
2106       dfa->has_plural_match = 1;
2107     }
2108
2109   return tree;
2110 }
2111
2112 /* This function build the following tree, from regular expression
2113    (<reg_exp>):
2114          SUBEXP
2115             |
2116         <reg_exp>
2117 */
2118
2119 static bin_tree_t *
2120 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2121      re_string_t *regexp;
2122      regex_t *preg;
2123      re_token_t *token;
2124      reg_syntax_t syntax;
2125      int nest;
2126      reg_errcode_t *err;
2127 {
2128   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129   bin_tree_t *tree, *left_par, *right_par;
2130   size_t cur_nsub;
2131   int new_idx;
2132   cur_nsub = preg->re_nsub++;
2133   if (dfa->subexps_alloc < preg->re_nsub)
2134     {
2135       re_subexp_t *new_array;
2136       dfa->subexps_alloc *= 2;
2137       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2138       if (BE (new_array == NULL, 0))
2139         {
2140           dfa->subexps_alloc /= 2;
2141           *err = REG_ESPACE;
2142           return NULL;
2143         }
2144       dfa->subexps = new_array;
2145     }
2146   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2147   dfa->subexps[cur_nsub].end = -1;
2148
2149   new_idx = re_dfa_add_node (dfa, *token, 0);
2150   left_par = create_tree (NULL, NULL, 0, new_idx);
2151   if (BE (new_idx == -1 || left_par == NULL, 0))
2152     {
2153       *err = REG_ESPACE;
2154       return NULL;
2155     }
2156   dfa->nodes[new_idx].opr.idx = cur_nsub;
2157   *token = fetch_token (regexp, syntax);
2158
2159   /* The subexpression may be a null string.  */
2160   if (token->type == OP_CLOSE_SUBEXP)
2161     tree = NULL;
2162   else
2163     {
2164       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2165       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2166         return NULL;
2167     }
2168   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2169     {
2170       free_bin_tree (tree);
2171       *err = REG_BADPAT;
2172       return NULL;
2173     }
2174   new_idx = re_dfa_add_node (dfa, *token, 0);
2175   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2176   right_par = create_tree (NULL, NULL, 0, new_idx);
2177   tree = ((tree == NULL) ? right_par
2178           : create_tree (tree, right_par, CONCAT, 0));
2179   tree = create_tree (left_par, tree, CONCAT, 0);
2180   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2181     {
2182       *err = REG_ESPACE;
2183       return NULL;
2184     }
2185   dfa->nodes[new_idx].opr.idx = cur_nsub;
2186
2187   return tree;
2188 }
2189
2190 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2191
2192 static bin_tree_t *
2193 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2194      bin_tree_t *dup_elem;
2195      re_string_t *regexp;
2196      re_dfa_t *dfa;
2197      re_token_t *token;
2198      reg_syntax_t syntax;
2199      reg_errcode_t *err;
2200 {
2201   re_token_t dup_token;
2202   bin_tree_t *tree = dup_elem, *work_tree;
2203   int new_idx, start_idx = re_string_cur_idx (regexp);
2204   re_token_t start_token = *token;
2205   if (token->type == OP_OPEN_DUP_NUM)
2206     {
2207       int i;
2208       int end = 0;
2209       int start = fetch_number (regexp, token, syntax);
2210       bin_tree_t *elem;
2211       if (start == -1)
2212         {
2213           if (token->type == CHARACTER && token->opr.c == ',')
2214             start = 0; /* We treat "{,m}" as "{0,m}".  */
2215           else
2216             {
2217               *err = REG_BADBR; /* <re>{} is invalid.  */
2218               return NULL;
2219             }
2220         }
2221       if (BE (start != -2, 1))
2222         {
2223           /* We treat "{n}" as "{n,n}".  */
2224           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2225                  : ((token->type == CHARACTER && token->opr.c == ',')
2226                     ? fetch_number (regexp, token, syntax) : -2));
2227         }
2228       if (BE (start == -2 || end == -2, 0))
2229         {
2230           /* Invalid sequence.  */
2231           if (token->type == OP_CLOSE_DUP_NUM)
2232             goto parse_dup_op_invalid_interval;
2233           else
2234             goto parse_dup_op_ebrace;
2235         }
2236       if (BE (start == 0 && end == 0, 0))
2237         {
2238           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2239           *token = fetch_token (regexp, syntax);
2240           free_bin_tree (dup_elem);
2241           return NULL;
2242         }
2243
2244       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2245       elem = tree;
2246       for (i = 0; i < start; ++i)
2247         if (i != 0)
2248           {
2249             work_tree = duplicate_tree (elem, dfa);
2250             tree = create_tree (tree, work_tree, CONCAT, 0);
2251             if (BE (work_tree == NULL || tree == NULL, 0))
2252               goto parse_dup_op_espace;
2253           }
2254
2255       if (end == -1)
2256         {
2257           /* We treat "<re>{0,}" as "<re>*".  */
2258           dup_token.type = OP_DUP_ASTERISK;
2259           if (start > 0)
2260             {
2261               elem = duplicate_tree (elem, dfa);
2262               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2263               work_tree = create_tree (elem, NULL, 0, new_idx);
2264               tree = create_tree (tree, work_tree, CONCAT, 0);
2265               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2266                       || tree == NULL, 0))
2267                 goto parse_dup_op_espace;
2268             }
2269           else
2270             {
2271               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2272               tree = create_tree (elem, NULL, 0, new_idx);
2273               if (BE (new_idx == -1 || tree == NULL, 0))
2274                 goto parse_dup_op_espace;
2275             }
2276         }
2277       else if (end - start > 0)
2278         {
2279           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2280           dup_token.type = OP_DUP_QUESTION;
2281           if (start > 0)
2282             {
2283               elem = duplicate_tree (elem, dfa);
2284               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2285               elem = create_tree (elem, NULL, 0, new_idx);
2286               tree = create_tree (tree, elem, CONCAT, 0);
2287               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2288                 goto parse_dup_op_espace;
2289             }
2290           else
2291             {
2292               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2293               tree = elem = create_tree (elem, NULL, 0, new_idx);
2294               if (BE (new_idx == -1 || tree == NULL, 0))
2295                 goto parse_dup_op_espace;
2296             }
2297           for (i = 1; i < end - start; ++i)
2298             {
2299               work_tree = duplicate_tree (elem, dfa);
2300               tree = create_tree (tree, work_tree, CONCAT, 0);
2301               if (BE (work_tree == NULL || tree == NULL, 0))
2302                 {
2303                   *err = REG_ESPACE;
2304                   return NULL;
2305                 }
2306             }
2307         }
2308     }
2309   else
2310     {
2311       new_idx = re_dfa_add_node (dfa, *token, 0);
2312       tree = create_tree (tree, NULL, 0, new_idx);
2313       if (BE (new_idx == -1 || tree == NULL, 0))
2314         {
2315           *err = REG_ESPACE;
2316           return NULL;
2317         }
2318     }
2319   *token = fetch_token (regexp, syntax);
2320   return tree;
2321
2322  parse_dup_op_espace:
2323   free_bin_tree (tree);
2324   *err = REG_ESPACE;
2325   return NULL;
2326
2327  parse_dup_op_ebrace:
2328   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2329     {
2330       *err = REG_EBRACE;
2331       return NULL;
2332     }
2333   goto parse_dup_op_rollback;
2334  parse_dup_op_invalid_interval:
2335   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2336     {
2337       *err = REG_BADBR;
2338       return NULL;
2339     }
2340  parse_dup_op_rollback:
2341   re_string_set_index (regexp, start_idx);
2342   *token = start_token;
2343   token->type = CHARACTER;
2344   return dup_elem;
2345 }
2346
2347 /* Size of the name buffer for a collating symbol, equivalence class or
2348    character class.  The value is a guess which is hopefully large enough.  */
2349 #define BRACKET_NAME_BUF_SIZE 32
2350
2351 #ifndef _LIBC
2352   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2353      Build the range expression which starts from START_ELEM, and ends
2354      at END_ELEM.  The result are written to MBCSET and SBCSET.
2355      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2356      mbcset->range_ends, is a pointer argument sinse we may
2357      update it.  */
2358
2359 static reg_errcode_t
2360 # ifdef RE_ENABLE_I18N
2361 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2362      re_charset_t *mbcset;
2363      int *range_alloc;
2364 # else /* not RE_ENABLE_I18N */
2365 build_range_exp (sbcset, start_elem, end_elem)
2366 # endif /* not RE_ENABLE_I18N */
2367      re_bitset_ptr_t sbcset;
2368      bracket_elem_t *start_elem, *end_elem;
2369 {
2370   unsigned int start_ch, end_ch;
2371   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2372   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2373           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2374           0))
2375     return REG_ERANGE;
2376
2377   /* We can handle no multi character collating elements without libc
2378      support.  */
2379   if (BE ((start_elem->type == COLL_SYM
2380            && strlen ((char *) start_elem->opr.name) > 1)
2381           || (end_elem->type == COLL_SYM
2382               && strlen ((char *) end_elem->opr.name) > 1), 0))
2383     return REG_ECOLLATE;
2384
2385 # ifdef RE_ENABLE_I18N
2386   {
2387     wchar_t wc, start_wc, end_wc;
2388     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2389
2390     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2391                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2392                    : 0));
2393     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2394               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2395                  : 0));
2396     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2397                 ? __btowc (start_ch) : start_elem->opr.wch);
2398     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2399               ? __btowc (end_ch) : end_elem->opr.wch);
2400     cmp_buf[0] = start_wc;
2401     cmp_buf[4] = end_wc;
2402     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2403       return REG_ERANGE;
2404
2405     /* Check the space of the arrays.  */
2406     if (*range_alloc == mbcset->nranges)
2407       {
2408         /* There are not enough space, need realloc.  */
2409         wchar_t *new_array_start, *new_array_end;
2410         int new_nranges;
2411
2412         /* +1 in case of mbcset->nranges is 0.  */
2413         new_nranges = 2 * mbcset->nranges + 1;
2414         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2415            are NULL if *range_alloc == 0.  */
2416         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2417                                       new_nranges);
2418         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2419                                     new_nranges);
2420
2421         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2422           return REG_ESPACE;
2423
2424         mbcset->range_starts = new_array_start;
2425         mbcset->range_ends = new_array_end;
2426         *range_alloc = new_nranges;
2427       }
2428
2429     mbcset->range_starts[mbcset->nranges] = start_wc;
2430     mbcset->range_ends[mbcset->nranges++] = end_wc;
2431
2432     /* Build the table for single byte characters.  */
2433     for (wc = 0; wc <= SBC_MAX; ++wc)
2434       {
2435         cmp_buf[2] = wc;
2436         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2437             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2438           bitset_set (sbcset, wc);
2439       }
2440   }
2441 # else /* not RE_ENABLE_I18N */
2442   {
2443     unsigned int ch;
2444     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2445                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2446                    : 0));
2447     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2448               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2449                  : 0));
2450     if (start_ch > end_ch)
2451       return REG_ERANGE;
2452     /* Build the table for single byte characters.  */
2453     for (ch = 0; ch <= SBC_MAX; ++ch)
2454       if (start_ch <= ch  && ch <= end_ch)
2455         bitset_set (sbcset, ch);
2456   }
2457 # endif /* not RE_ENABLE_I18N */
2458   return REG_NOERROR;
2459 }
2460 #endif /* not _LIBC */
2461
2462 #ifndef _LIBC
2463 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2464    Build the collating element which is represented by NAME.
2465    The result are written to MBCSET and SBCSET.
2466    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2467    pointer argument since we may update it.  */
2468
2469 static reg_errcode_t
2470 # ifdef RE_ENABLE_I18N
2471 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2472      re_charset_t *mbcset;
2473      int *coll_sym_alloc;
2474 # else /* not RE_ENABLE_I18N */
2475 build_collating_symbol (sbcset, name)
2476 # endif /* not RE_ENABLE_I18N */
2477      re_bitset_ptr_t sbcset;
2478      const unsigned char *name;
2479 {
2480   size_t name_len = strlen ((const char *) name);
2481   if (BE (name_len != 1, 0))
2482     return REG_ECOLLATE;
2483   else
2484     {
2485       bitset_set (sbcset, name[0]);
2486       return REG_NOERROR;
2487     }
2488 }
2489 #endif /* not _LIBC */
2490
2491 /* This function parse bracket expression like "[abc]", "[a-c]",
2492    "[[.a-a.]]" etc.  */
2493
2494 static bin_tree_t *
2495 parse_bracket_exp (regexp, dfa, token, syntax, err)
2496      re_string_t *regexp;
2497      re_dfa_t *dfa;
2498      re_token_t *token;
2499      reg_syntax_t syntax;
2500      reg_errcode_t *err;
2501 {
2502 #ifdef _LIBC
2503   const unsigned char *collseqmb;
2504   const char *collseqwc;
2505   uint32_t nrules;
2506   int32_t table_size;
2507   const int32_t *symb_table;
2508   const unsigned char *extra;
2509
2510   /* Local function for parse_bracket_exp used in _LIBC environement.
2511      Seek the collating symbol entry correspondings to NAME.
2512      Return the index of the symbol in the SYMB_TABLE.  */
2513
2514   static inline int32_t
2515   seek_collating_symbol_entry (name, name_len)
2516          const unsigned char *name;
2517          size_t name_len;
2518     {
2519       int32_t hash = elem_hash ((const char *) name, name_len);
2520       int32_t elem = hash % table_size;
2521       int32_t second = hash % (table_size - 2);
2522       while (symb_table[2 * elem] != 0)
2523         {
2524           /* First compare the hashing value.  */
2525           if (symb_table[2 * elem] == hash
2526               /* Compare the length of the name.  */
2527               && name_len == extra[symb_table[2 * elem + 1]]
2528               /* Compare the name.  */
2529               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2530                          name_len) == 0)
2531             {
2532               /* Yep, this is the entry.  */
2533               break;
2534             }
2535
2536           /* Next entry.  */
2537           elem += second;
2538         }
2539       return elem;
2540     }
2541
2542   /* Local function for parse_bracket_exp used in _LIBC environement.
2543      Look up the collation sequence value of BR_ELEM.
2544      Return the value if succeeded, UINT_MAX otherwise.  */
2545
2546   static inline unsigned int
2547   lookup_collation_sequence_value (br_elem)
2548          bracket_elem_t *br_elem;
2549     {
2550       if (br_elem->type == SB_CHAR)
2551         {
2552           /*
2553           if (MB_CUR_MAX == 1)
2554           */
2555           if (nrules == 0)
2556             return collseqmb[br_elem->opr.ch];
2557           else
2558             {
2559               wint_t wc = __btowc (br_elem->opr.ch);
2560               return collseq_table_lookup (collseqwc, wc);
2561             }
2562         }
2563       else if (br_elem->type == MB_CHAR)
2564         {
2565           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2566         }
2567       else if (br_elem->type == COLL_SYM)
2568         {
2569           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2570           if (nrules != 0)
2571             {
2572               int32_t elem, idx;
2573               elem = seek_collating_symbol_entry (br_elem->opr.name,
2574                                                   sym_name_len);
2575               if (symb_table[2 * elem] != 0)
2576                 {
2577                   /* We found the entry.  */
2578                   idx = symb_table[2 * elem + 1];
2579                   /* Skip the name of collating element name.  */
2580                   idx += 1 + extra[idx];
2581                   /* Skip the byte sequence of the collating element.  */
2582                   idx += 1 + extra[idx];
2583                   /* Adjust for the alignment.  */
2584                   idx = (idx + 3) & ~3;
2585                   /* Skip the multibyte collation sequence value.  */
2586                   idx += sizeof (unsigned int);
2587                   /* Skip the wide char sequence of the collating element.  */
2588                   idx += sizeof (unsigned int) *
2589                     (1 + *(unsigned int *) (extra + idx));
2590                   /* Return the collation sequence value.  */
2591                   return *(unsigned int *) (extra + idx);
2592                 }
2593               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2594                 {
2595                   /* No valid character.  Match it as a single byte
2596                      character.  */
2597                   return collseqmb[br_elem->opr.name[0]];
2598                 }
2599             }
2600           else if (sym_name_len == 1)
2601             return collseqmb[br_elem->opr.name[0]];
2602         }
2603       return UINT_MAX;
2604     }
2605
2606   /* Local function for parse_bracket_exp used in _LIBC environement.
2607      Build the range expression which starts from START_ELEM, and ends
2608      at END_ELEM.  The result are written to MBCSET and SBCSET.
2609      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2610      mbcset->range_ends, is a pointer argument sinse we may
2611      update it.  */
2612
2613   static inline reg_errcode_t
2614 # ifdef RE_ENABLE_I18N
2615   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2616          re_charset_t *mbcset;
2617          int *range_alloc;
2618 # else /* not RE_ENABLE_I18N */
2619   build_range_exp (sbcset, start_elem, end_elem)
2620 # endif /* not RE_ENABLE_I18N */
2621          re_bitset_ptr_t sbcset;
2622          bracket_elem_t *start_elem, *end_elem;
2623     {
2624       unsigned int ch;
2625       uint32_t start_collseq;
2626       uint32_t end_collseq;
2627
2628 # ifdef RE_ENABLE_I18N
2629       /* Check the space of the arrays.  */
2630       if (*range_alloc == mbcset->nranges)
2631         {
2632           /* There are not enough space, need realloc.  */
2633           uint32_t *new_array_start;
2634           uint32_t *new_array_end;
2635           int new_nranges;
2636
2637           /* +1 in case of mbcset->nranges is 0.  */
2638           new_nranges = 2 * mbcset->nranges + 1;
2639           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2640              are NULL if *range_alloc == 0.  */
2641           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2642                                         new_nranges);
2643           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2644                                       new_nranges);
2645
2646           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2647             return REG_ESPACE;
2648
2649           mbcset->range_starts = new_array_start;
2650           mbcset->range_ends = new_array_end;
2651           *range_alloc = new_nranges;
2652         }
2653 # endif /* RE_ENABLE_I18N */
2654
2655       /* Equivalence Classes and Character Classes can't be a range
2656          start/end.  */
2657       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2658               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2659               0))
2660         return REG_ERANGE;
2661
2662       start_collseq = lookup_collation_sequence_value (start_elem);
2663       end_collseq = lookup_collation_sequence_value (end_elem);
2664       /* Check start/end collation sequence values.  */
2665       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2666         return REG_ECOLLATE;
2667       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2668         return REG_ERANGE;
2669
2670 # ifdef RE_ENABLE_I18N
2671       /* Got valid collation sequence values, add them as a new entry.  */
2672       mbcset->range_starts[mbcset->nranges] = start_collseq;
2673       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2674 # endif /* RE_ENABLE_I18N */
2675
2676       /* Build the table for single byte characters.  */
2677       for (ch = 0; ch <= SBC_MAX; ch++)
2678         {
2679           uint32_t ch_collseq;
2680           /*
2681           if (MB_CUR_MAX == 1)
2682           */
2683           if (nrules == 0)
2684             ch_collseq = collseqmb[ch];
2685           else
2686             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2687           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2688             bitset_set (sbcset, ch);
2689         }
2690       return REG_NOERROR;
2691     }
2692
2693   /* Local function for parse_bracket_exp used in _LIBC environement.
2694      Build the collating element which is represented by NAME.
2695      The result are written to MBCSET and SBCSET.
2696      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2697      pointer argument sinse we may update it.  */
2698
2699   static inline reg_errcode_t
2700 # ifdef RE_ENABLE_I18N
2701   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2702          re_charset_t *mbcset;
2703          int *coll_sym_alloc;
2704 # else /* not RE_ENABLE_I18N */
2705   build_collating_symbol (sbcset, name)
2706 # endif /* not RE_ENABLE_I18N */
2707          re_bitset_ptr_t sbcset;
2708          const unsigned char *name;
2709     {
2710       int32_t elem, idx;
2711       size_t name_len = strlen ((const char *) name);
2712       if (nrules != 0)
2713         {
2714           elem = seek_collating_symbol_entry (name, name_len);
2715           if (symb_table[2 * elem] != 0)
2716             {
2717               /* We found the entry.  */
2718               idx = symb_table[2 * elem + 1];
2719               /* Skip the name of collating element name.  */
2720               idx += 1 + extra[idx];
2721             }
2722           else if (symb_table[2 * elem] == 0 && name_len == 1)
2723             {
2724               /* No valid character, treat it as a normal
2725                  character.  */
2726               bitset_set (sbcset, name[0]);
2727               return REG_NOERROR;
2728             }
2729           else
2730             return REG_ECOLLATE;
2731
2732 # ifdef RE_ENABLE_I18N
2733           /* Got valid collation sequence, add it as a new entry.  */
2734           /* Check the space of the arrays.  */
2735           if (*coll_sym_alloc == mbcset->ncoll_syms)
2736             {
2737               /* Not enough, realloc it.  */
2738               /* +1 in case of mbcset->ncoll_syms is 0.  */
2739               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2740               /* Use realloc since mbcset->coll_syms is NULL
2741                  if *alloc == 0.  */
2742               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2743                                               *coll_sym_alloc);
2744               if (BE (mbcset->coll_syms == NULL, 0))
2745                 return REG_ESPACE;
2746             }
2747           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2748 # endif /* RE_ENABLE_I18N */
2749           return REG_NOERROR;
2750         }
2751       else
2752         {
2753           if (BE (name_len != 1, 0))
2754             return REG_ECOLLATE;
2755           else
2756             {
2757               bitset_set (sbcset, name[0]);
2758               return REG_NOERROR;
2759             }
2760         }
2761     }
2762 #endif
2763
2764   re_token_t br_token;
2765   re_bitset_ptr_t sbcset;
2766 #ifdef RE_ENABLE_I18N
2767   re_charset_t *mbcset;
2768   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2769   int equiv_class_alloc = 0, char_class_alloc = 0;
2770 #else /* not RE_ENABLE_I18N */
2771   int non_match = 0;
2772 #endif /* not RE_ENABLE_I18N */
2773   bin_tree_t *work_tree;
2774   int token_len, new_idx;
2775 #ifdef _LIBC
2776   collseqmb = (const unsigned char *)
2777     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2778   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2779   if (nrules)
2780     {
2781       /*
2782       if (MB_CUR_MAX > 1)
2783       */
2784         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2785       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2786       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2787                                                   _NL_COLLATE_SYMB_TABLEMB);
2788       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2789                                                    _NL_COLLATE_SYMB_EXTRAMB);
2790     }
2791 #endif
2792   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2793 #ifdef RE_ENABLE_I18N
2794   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2795 #endif /* RE_ENABLE_I18N */
2796 #ifdef RE_ENABLE_I18N
2797   if (BE (sbcset == NULL || mbcset == NULL, 0))
2798 #else
2799   if (BE (sbcset == NULL, 0))
2800 #endif /* RE_ENABLE_I18N */
2801     {
2802       *err = REG_ESPACE;
2803       return NULL;
2804     }
2805
2806   token_len = peek_token_bracket (token, regexp, syntax);
2807   if (BE (token->type == END_OF_RE, 0))
2808     {
2809       *err = REG_BADPAT;
2810       goto parse_bracket_exp_free_return;
2811     }
2812   if (token->type == OP_NON_MATCH_LIST)
2813     {
2814 #ifdef RE_ENABLE_I18N
2815       int i;
2816       mbcset->non_match = 1;
2817 #else /* not RE_ENABLE_I18N */
2818       non_match = 1;
2819 #endif /* not RE_ENABLE_I18N */
2820       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2821         bitset_set (sbcset, '\0');
2822       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2823       token_len = peek_token_bracket (token, regexp, syntax);
2824       if (BE (token->type == END_OF_RE, 0))
2825         {
2826           *err = REG_BADPAT;
2827           goto parse_bracket_exp_free_return;
2828         }
2829 #ifdef RE_ENABLE_I18N
2830       if (MB_CUR_MAX > 1)
2831         for (i = 0; i < SBC_MAX; ++i)
2832           if (__btowc (i) == WEOF)
2833             bitset_set (sbcset, i);
2834 #endif /* RE_ENABLE_I18N */
2835     }
2836
2837   /* We treat the first ']' as a normal character.  */
2838   if (token->type == OP_CLOSE_BRACKET)
2839     token->type = CHARACTER;
2840
2841   while (1)
2842     {
2843       bracket_elem_t start_elem, end_elem;
2844       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2845       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2846       reg_errcode_t ret;
2847       int token_len2 = 0, is_range_exp = 0;
2848       re_token_t token2;
2849
2850       start_elem.opr.name = start_name_buf;
2851       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2852                                    syntax);
2853       if (BE (ret != REG_NOERROR, 0))
2854         {
2855           *err = ret;
2856           goto parse_bracket_exp_free_return;
2857         }
2858
2859       token_len = peek_token_bracket (token, regexp, syntax);
2860       if (BE (token->type == END_OF_RE, 0))
2861         {
2862           *err = REG_BADPAT;
2863           goto parse_bracket_exp_free_return;
2864         }
2865       if (token->type == OP_CHARSET_RANGE)
2866         {
2867           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2868           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2869           if (BE (token->type == END_OF_RE, 0))
2870             {
2871               *err = REG_BADPAT;
2872               goto parse_bracket_exp_free_return;
2873             }
2874           if (token2.type == OP_CLOSE_BRACKET)
2875             {
2876               /* We treat the last '-' as a normal character.  */
2877               re_string_skip_bytes (regexp, -token_len);
2878               token->type = CHARACTER;
2879             }
2880           else
2881             is_range_exp = 1;
2882         }
2883
2884       if (is_range_exp == 1)
2885         {
2886           end_elem.opr.name = end_name_buf;
2887           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2888                                        dfa, syntax);
2889           if (BE (ret != REG_NOERROR, 0))
2890             {
2891               *err = ret;
2892               goto parse_bracket_exp_free_return;
2893             }
2894
2895           token_len = peek_token_bracket (token, regexp, syntax);
2896           if (BE (token->type == END_OF_RE, 0))
2897             {
2898               *err = REG_BADPAT;
2899               goto parse_bracket_exp_free_return;
2900             }
2901           *err = build_range_exp (sbcset,
2902 #ifdef RE_ENABLE_I18N
2903                                   mbcset, &range_alloc,
2904 #endif /* RE_ENABLE_I18N */
2905                                   &start_elem, &end_elem);
2906           if (BE (*err != REG_NOERROR, 0))
2907             goto parse_bracket_exp_free_return;
2908         }
2909       else
2910         {
2911           switch (start_elem.type)
2912             {
2913             case SB_CHAR:
2914               bitset_set (sbcset, start_elem.opr.ch);
2915               break;
2916 #ifdef RE_ENABLE_I18N
2917             case MB_CHAR:
2918               /* Check whether the array has enough space.  */
2919               if (mbchar_alloc == mbcset->nmbchars)
2920                 {
2921                   /* Not enough, realloc it.  */
2922                   /* +1 in case of mbcset->nmbchars is 0.  */
2923                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2924                   /* Use realloc since array is NULL if *alloc == 0.  */
2925                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2926                                                 mbchar_alloc);
2927                   if (BE (mbcset->mbchars == NULL, 0))
2928                     goto parse_bracket_exp_espace;
2929                 }
2930               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2931               break;
2932 #endif /* RE_ENABLE_I18N */
2933             case EQUIV_CLASS:
2934               *err = build_equiv_class (sbcset,
2935 #ifdef RE_ENABLE_I18N
2936                                         mbcset, &equiv_class_alloc,
2937 #endif /* RE_ENABLE_I18N */
2938                                         start_elem.opr.name);
2939               if (BE (*err != REG_NOERROR, 0))
2940                 goto parse_bracket_exp_free_return;
2941               break;
2942             case COLL_SYM:
2943               *err = build_collating_symbol (sbcset,
2944 #ifdef RE_ENABLE_I18N
2945                                              mbcset, &coll_sym_alloc,
2946 #endif /* RE_ENABLE_I18N */
2947                                              start_elem.opr.name);
2948               if (BE (*err != REG_NOERROR, 0))
2949                 goto parse_bracket_exp_free_return;
2950               break;
2951             case CHAR_CLASS:
2952               *err = build_charclass (sbcset,
2953 #ifdef RE_ENABLE_I18N
2954                                       mbcset, &char_class_alloc,
2955 #endif /* RE_ENABLE_I18N */
2956                                       start_elem.opr.name, syntax);
2957               if (BE (*err != REG_NOERROR, 0))
2958                goto parse_bracket_exp_free_return;
2959               break;
2960             default:
2961               assert (0);
2962               break;
2963             }
2964         }
2965       if (token->type == OP_CLOSE_BRACKET)
2966         break;
2967     }
2968
2969   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2970
2971   /* If it is non-matching list.  */
2972 #ifdef RE_ENABLE_I18N
2973   if (mbcset->non_match)
2974 #else /* not RE_ENABLE_I18N */
2975   if (non_match)
2976 #endif /* not RE_ENABLE_I18N */
2977     bitset_not (sbcset);
2978
2979   /* Build a tree for simple bracket.  */
2980   br_token.type = SIMPLE_BRACKET;
2981   br_token.opr.sbcset = sbcset;
2982   new_idx = re_dfa_add_node (dfa, br_token, 0);
2983   work_tree = create_tree (NULL, NULL, 0, new_idx);
2984   if (BE (new_idx == -1 || work_tree == NULL, 0))
2985     goto parse_bracket_exp_espace;
2986
2987 #ifdef RE_ENABLE_I18N
2988   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2989       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2990                                                 || mbcset->non_match)))
2991     {
2992       re_token_t alt_token;
2993       bin_tree_t *mbc_tree;
2994       /* Build a tree for complex bracket.  */
2995       br_token.type = COMPLEX_BRACKET;
2996       br_token.opr.mbcset = mbcset;
2997       dfa->has_mb_node = 1;
2998       new_idx = re_dfa_add_node (dfa, br_token, 0);
2999       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3000       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3001         goto parse_bracket_exp_espace;
3002       /* Then join them by ALT node.  */
3003       dfa->has_plural_match = 1;
3004       alt_token.type = OP_ALT;
3005       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3006       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3007       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3008         return work_tree;
3009     }
3010   else
3011     {
3012       free_charset (mbcset);
3013       return work_tree;
3014     }
3015 #else /* not RE_ENABLE_I18N */
3016   return work_tree;
3017 #endif /* not RE_ENABLE_I18N */
3018
3019  parse_bracket_exp_espace:
3020   *err = REG_ESPACE;
3021  parse_bracket_exp_free_return:
3022   re_free (sbcset);
3023 #ifdef RE_ENABLE_I18N
3024   free_charset (mbcset);
3025 #endif /* RE_ENABLE_I18N */
3026   return NULL;
3027 }
3028
3029 /* Parse an element in the bracket expression.  */
3030
3031 static reg_errcode_t
3032 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3033      bracket_elem_t *elem;
3034      re_string_t *regexp;
3035      re_token_t *token;
3036      int token_len;
3037      re_dfa_t *dfa;
3038      reg_syntax_t syntax;
3039 {
3040 #ifdef RE_ENABLE_I18N
3041   int cur_char_size;
3042   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3043   if (cur_char_size > 1)
3044     {
3045       elem->type = MB_CHAR;
3046       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3047       re_string_skip_bytes (regexp, cur_char_size);
3048       return REG_NOERROR;
3049     }
3050 #endif /* RE_ENABLE_I18N */
3051   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3052   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3053       || token->type == OP_OPEN_EQUIV_CLASS)
3054     return parse_bracket_symbol (elem, regexp, token);
3055   elem->type = SB_CHAR;
3056   elem->opr.ch = token->opr.c;
3057   return REG_NOERROR;
3058 }
3059
3060 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3061    such as [:<character_class>:], [.<collating_element>.], and
3062    [=<equivalent_class>=].  */
3063
3064 static reg_errcode_t
3065 parse_bracket_symbol (elem, regexp, token)
3066      bracket_elem_t *elem;
3067      re_string_t *regexp;
3068      re_token_t *token;
3069 {
3070   unsigned char ch, delim = token->opr.c;
3071   int i = 0;
3072   for (;; ++i)
3073     {
3074       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3075         return REG_EBRACK;
3076       if (token->type == OP_OPEN_CHAR_CLASS)
3077         ch = re_string_fetch_byte_case (regexp);
3078       else
3079         ch = re_string_fetch_byte (regexp);
3080       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3081         break;
3082       elem->opr.name[i] = ch;
3083     }
3084   re_string_skip_bytes (regexp, 1);
3085   elem->opr.name[i] = '\0';
3086   switch (token->type)
3087     {
3088     case OP_OPEN_COLL_ELEM:
3089       elem->type = COLL_SYM;
3090       break;
3091     case OP_OPEN_EQUIV_CLASS:
3092       elem->type = EQUIV_CLASS;
3093       break;
3094     case OP_OPEN_CHAR_CLASS:
3095       elem->type = CHAR_CLASS;
3096       break;
3097     default:
3098       break;
3099     }
3100   return REG_NOERROR;
3101 }
3102
3103   /* Helper function for parse_bracket_exp.
3104      Build the equivalence class which is represented by NAME.
3105      The result are written to MBCSET and SBCSET.
3106      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3107      is a pointer argument sinse we may update it.  */
3108
3109 static reg_errcode_t
3110 #ifdef RE_ENABLE_I18N
3111 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3112      re_charset_t *mbcset;
3113      int *equiv_class_alloc;
3114 #else /* not RE_ENABLE_I18N */
3115 build_equiv_class (sbcset, name)
3116 #endif /* not RE_ENABLE_I18N */
3117      re_bitset_ptr_t sbcset;
3118      const unsigned char *name;
3119 {
3120 #if defined _LIBC && defined RE_ENABLE_I18N
3121   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122   if (nrules != 0)
3123     {
3124       const int32_t *table, *indirect;
3125       const unsigned char *weights, *extra, *cp;
3126       unsigned char char_buf[2];
3127       int32_t idx1, idx2;
3128       unsigned int ch;
3129       size_t len;
3130       /* This #include defines a local function!  */
3131 # include <locale/weight.h>
3132       /* Calculate the index for equivalence class.  */
3133       cp = name;
3134       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3135       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136                                                _NL_COLLATE_WEIGHTMB);
3137       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3138                                                    _NL_COLLATE_EXTRAMB);
3139       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3140                                                 _NL_COLLATE_INDIRECTMB);
3141       idx1 = findidx (&cp);
3142       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3143         /* This isn't a valid character.  */
3144         return REG_ECOLLATE;
3145
3146       /* Build single byte matcing table for this equivalence class.  */
3147       char_buf[1] = (unsigned char) '\0';
3148       len = weights[idx1];
3149       for (ch = 0; ch < SBC_MAX; ++ch)
3150         {
3151           char_buf[0] = ch;
3152           cp = char_buf;
3153           idx2 = findidx (&cp);
3154 /*
3155           idx2 = table[ch];
3156 */
3157           if (idx2 == 0)
3158             /* This isn't a valid character.  */
3159             continue;
3160           if (len == weights[idx2])
3161             {
3162               int cnt = 0;
3163               while (cnt <= len &&
3164                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3165                 ++cnt;
3166
3167               if (cnt > len)
3168                 bitset_set (sbcset, ch);
3169             }
3170         }
3171       /* Check whether the array has enough space.  */
3172       if (*equiv_class_alloc == mbcset->nequiv_classes)
3173         {
3174           /* Not enough, realloc it.  */
3175           /* +1 in case of mbcset->nequiv_classes is 0.  */
3176           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3177           /* Use realloc since the array is NULL if *alloc == 0.  */
3178           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3179                                               *equiv_class_alloc);
3180           if (BE (mbcset->equiv_classes == NULL, 0))
3181             return REG_ESPACE;
3182         }
3183       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3184     }
3185   else
3186 #endif /* _LIBC && RE_ENABLE_I18N */
3187     {
3188       if (BE (strlen ((const char *) name) != 1, 0))
3189         return REG_ECOLLATE;
3190       bitset_set (sbcset, *name);
3191     }
3192   return REG_NOERROR;
3193 }
3194
3195   /* Helper function for parse_bracket_exp.
3196      Build the character class which is represented by NAME.
3197      The result are written to MBCSET and SBCSET.
3198      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3199      is a pointer argument sinse we may update it.  */
3200
3201 static reg_errcode_t
3202 #ifdef RE_ENABLE_I18N
3203 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3204      re_charset_t *mbcset;
3205      int *char_class_alloc;
3206 #else /* not RE_ENABLE_I18N */
3207 build_charclass (sbcset, class_name, syntax)
3208 #endif /* not RE_ENABLE_I18N */
3209      re_bitset_ptr_t sbcset;
3210      const unsigned char *class_name;
3211      reg_syntax_t syntax;
3212 {
3213   int i;
3214   const char *name = (const char *) class_name;
3215
3216   /* In case of REG_ICASE "upper" and "lower" match the both of
3217      upper and lower cases.  */
3218   if ((syntax & RE_ICASE)
3219       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3220     name = "alpha";
3221
3222 #ifdef RE_ENABLE_I18N
3223   /* Check the space of the arrays.  */
3224   if (*char_class_alloc == mbcset->nchar_classes)
3225     {
3226       /* Not enough, realloc it.  */
3227       /* +1 in case of mbcset->nchar_classes is 0.  */
3228       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3229       /* Use realloc since array is NULL if *alloc == 0.  */
3230       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3231                                          *char_class_alloc);
3232       if (BE (mbcset->char_classes == NULL, 0))
3233         return REG_ESPACE;
3234     }
3235   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3236 #endif /* RE_ENABLE_I18N */
3237
3238 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3239     for (i = 0; i < SBC_MAX; ++i)       \
3240       {                                 \
3241         if (ctype_func (i))             \
3242           bitset_set (sbcset, i);       \
3243       }
3244
3245   if (strcmp (name, "alnum") == 0)
3246     BUILD_CHARCLASS_LOOP (isalnum)
3247   else if (strcmp (name, "cntrl") == 0)
3248     BUILD_CHARCLASS_LOOP (iscntrl)
3249   else if (strcmp (name, "lower") == 0)
3250     BUILD_CHARCLASS_LOOP (islower)
3251   else if (strcmp (name, "space") == 0)
3252     BUILD_CHARCLASS_LOOP (isspace)
3253   else if (strcmp (name, "alpha") == 0)
3254     BUILD_CHARCLASS_LOOP (isalpha)
3255   else if (strcmp (name, "digit") == 0)
3256     BUILD_CHARCLASS_LOOP (isdigit)
3257   else if (strcmp (name, "print") == 0)
3258     BUILD_CHARCLASS_LOOP (isprint)
3259   else if (strcmp (name, "upper") == 0)
3260     BUILD_CHARCLASS_LOOP (isupper)
3261   else if (strcmp (name, "blank") == 0)
3262     BUILD_CHARCLASS_LOOP (isblank)
3263   else if (strcmp (name, "graph") == 0)
3264     BUILD_CHARCLASS_LOOP (isgraph)
3265   else if (strcmp (name, "punct") == 0)
3266     BUILD_CHARCLASS_LOOP (ispunct)
3267   else if (strcmp (name, "xdigit") == 0)
3268     BUILD_CHARCLASS_LOOP (isxdigit)
3269   else
3270     return REG_ECTYPE;
3271
3272   return REG_NOERROR;
3273 }
3274
3275 static bin_tree_t *
3276 build_word_op (dfa, not, err)
3277      re_dfa_t *dfa;
3278      int not;
3279      reg_errcode_t *err;
3280 {
3281   re_bitset_ptr_t sbcset;
3282 #ifdef RE_ENABLE_I18N
3283   re_charset_t *mbcset;
3284   int alloc = 0;
3285 #else /* not RE_ENABLE_I18N */
3286   int non_match = 0;
3287 #endif /* not RE_ENABLE_I18N */
3288   reg_errcode_t ret;
3289   re_token_t br_token;
3290   bin_tree_t *tree;
3291   int new_idx;
3292
3293   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3294 #ifdef RE_ENABLE_I18N
3295   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3296 #endif /* RE_ENABLE_I18N */
3297
3298 #ifdef RE_ENABLE_I18N
3299   if (BE (sbcset == NULL || mbcset == NULL, 0))
3300 #else /* not RE_ENABLE_I18N */
3301   if (BE (sbcset == NULL, 0))
3302 #endif /* not RE_ENABLE_I18N */
3303     {
3304       *err = REG_ESPACE;
3305       return NULL;
3306     }
3307
3308   if (not)
3309     {
3310 #ifdef RE_ENABLE_I18N
3311       int i;
3312       /*
3313       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3314         bitset_set(cset->sbcset, '\0');
3315       */
3316       mbcset->non_match = 1;
3317       if (MB_CUR_MAX > 1)
3318         for (i = 0; i < SBC_MAX; ++i)
3319           if (__btowc (i) == WEOF)
3320             bitset_set (sbcset, i);
3321 #else /* not RE_ENABLE_I18N */
3322       non_match = 1;
3323 #endif /* not RE_ENABLE_I18N */
3324     }
3325
3326   /* We don't care the syntax in this case.  */
3327   ret = build_charclass (sbcset,
3328 #ifdef RE_ENABLE_I18N
3329                          mbcset, &alloc,
3330 #endif /* RE_ENABLE_I18N */
3331                          (const unsigned char *) "alpha", 0);
3332
3333   if (BE (ret != REG_NOERROR, 0))
3334     {
3335       re_free (sbcset);
3336 #ifdef RE_ENABLE_I18N
3337       free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3339       *err = ret;
3340       return NULL;
3341     }
3342   /* \w match '_' also.  */
3343   bitset_set (sbcset, '_');
3344
3345   /* If it is non-matching list.  */
3346 #ifdef RE_ENABLE_I18N
3347   if (mbcset->non_match)
3348 #else /* not RE_ENABLE_I18N */
3349   if (non_match)
3350 #endif /* not RE_ENABLE_I18N */
3351     bitset_not (sbcset);
3352
3353   /* Build a tree for simple bracket.  */
3354   br_token.type = SIMPLE_BRACKET;
3355   br_token.opr.sbcset = sbcset;
3356   new_idx = re_dfa_add_node (dfa, br_token, 0);
3357   tree = create_tree (NULL, NULL, 0, new_idx);
3358   if (BE (new_idx == -1 || tree == NULL, 0))
3359     goto build_word_op_espace;
3360
3361 #ifdef RE_ENABLE_I18N
3362   if (MB_CUR_MAX > 1)
3363     {
3364       re_token_t alt_token;
3365       bin_tree_t *mbc_tree;
3366       /* Build a tree for complex bracket.  */
3367       br_token.type = COMPLEX_BRACKET;
3368       br_token.opr.mbcset = mbcset;
3369       dfa->has_mb_node = 1;
3370       new_idx = re_dfa_add_node (dfa, br_token, 0);
3371       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3372       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3373         goto build_word_op_espace;
3374       /* Then join them by ALT node.  */
3375       alt_token.type = OP_ALT;
3376       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3377       tree = create_tree (tree, mbc_tree, 0, new_idx);
3378       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3379         return tree;
3380     }
3381   else
3382     {
3383       free_charset (mbcset);
3384       return tree;
3385     }
3386 #else /* not RE_ENABLE_I18N */
3387   return tree;
3388 #endif /* not RE_ENABLE_I18N */
3389
3390  build_word_op_espace:
3391   re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393   free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395   *err = REG_ESPACE;
3396   return NULL;
3397 }
3398
3399 /* This is intended for the expressions like "a{1,3}".
3400    Fetch a number from `input', and return the number.
3401    Return -1, if the number field is empty like "{,1}".
3402    Return -2, If an error is occured.  */
3403
3404 static int
3405 fetch_number (input, token, syntax)
3406      re_string_t *input;
3407      re_token_t *token;
3408      reg_syntax_t syntax;
3409 {
3410   int num = -1;
3411   unsigned char c;
3412   while (1)
3413     {
3414       *token = fetch_token (input, syntax);
3415       c = token->opr.c;
3416       if (BE (token->type == END_OF_RE, 0))
3417         return -2;
3418       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3419         break;
3420       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3421              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3422       num = (num > RE_DUP_MAX) ? -2 : num;
3423     }
3424   return num;
3425 }
3426 \f
#ifdef RE_ENABLE_I18N
/* Release the arrays owned by CSET, then CSET itself.  The collation
   related arrays are only released in the _LIBC build.  */
static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3442 \f
3443 /* Functions for binary tree operation.  */
3444
3445 /* Create a node of tree.
3446    Note: This function automatically free left and right if malloc fails.  */
3447
3448 static bin_tree_t *
3449 create_tree (left, right, type, index)
3450      bin_tree_t *left;
3451      bin_tree_t *right;
3452      re_token_type_t type;
3453      int index;
3454 {
3455   bin_tree_t *tree;
3456   tree = re_malloc (bin_tree_t, 1);
3457   if (BE (tree == NULL, 0))
3458     {
3459       free_bin_tree (left);
3460       free_bin_tree (right);
3461       return NULL;
3462     }
3463   tree->parent = NULL;
3464   tree->left = left;
3465   tree->right = right;
3466   tree->type = type;
3467   tree->node_idx = index;
3468   tree->first = -1;
3469   tree->next = -1;
3470   re_node_set_init_empty (&tree->eclosure);
3471
3472   if (left != NULL)
3473     left->parent = tree;
3474   if (right != NULL)
3475     right->parent = tree;
3476   return tree;
3477 }
3478
3479 /* Free the sub tree pointed by TREE.  */
3480
3481 static void
3482 free_bin_tree (tree)
3483      bin_tree_t *tree;
3484 {
3485   if (tree == NULL)
3486     return;
3487   /*re_node_set_free (&tree->eclosure);*/
3488   free_bin_tree (tree->left);
3489   free_bin_tree (tree->right);
3490   re_free (tree);
3491 }
3492
3493 /* Duplicate the node SRC, and return new node.  */
3494
3495 static bin_tree_t *
3496 duplicate_tree (src, dfa)
3497      const bin_tree_t *src;
3498      re_dfa_t *dfa;
3499 {
3500   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3501   int new_node_idx;
3502   /* Since node indies must be according to Post-order of the tree,
3503      we must duplicate the left at first.  */
3504   if (src->left != NULL)
3505     {
3506       left = duplicate_tree (src->left, dfa);
3507       if (left == NULL)
3508         return NULL;
3509     }
3510
3511   /* Secondaly, duplicate the right.  */
3512   if (src->right != NULL)
3513     {
3514       right = duplicate_tree (src->right, dfa);
3515       if (right == NULL)
3516         {
3517           free_bin_tree (left);
3518           return NULL;
3519         }
3520     }
3521
3522   /* At last, duplicate itself.  */
3523   if (src->type == NON_TYPE)
3524     {
3525       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3526       dfa->nodes[new_node_idx].duplicated = 1;
3527       if (BE (new_node_idx == -1, 0))
3528         {
3529           free_bin_tree (left);
3530           free_bin_tree (right);
3531           return NULL;
3532         }
3533     }
3534   else
3535     new_node_idx = src->type;
3536
3537   new_tree = create_tree (left, right, src->type, new_node_idx);
3538   if (BE (new_tree == NULL, 0))
3539     {
3540       free_bin_tree (left);
3541       free_bin_tree (right);
3542     }
3543   return new_tree;
3544 }