regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax);
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (reg_syntax_t syntax)
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (struct re_pattern_buffer *bufp)
 269 {
 270   re_dfa_t *dfa = bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute__ ((always_inline))
 290 re_set_fastmap (char *fastmap, bool icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = bufp->buffer;
 305   Idx node_cnt;
 306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       Idx node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char buf[MB_LEN_MAX];
 319               unsigned char *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, false, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           Idx i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      'buffer' to the compiled pattern;
 437      'used' to the length of the compiled pattern;
 438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      'fastmap' to an allocated space for the fastmap;
 443      'fastmap_accurate' to zero;
 444      're_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function never fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 libc_hidden_def (__regcomp)
 520 weak_alias (__regcomp, regcomp)
 521 #endif
 522
 523 /* Returns a message corresponding to an error code, ERRCODE, returned
 524    from either regcomp or regexec.   We don't use PREG here.  */
 525
 526 size_t
 527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
 528           size_t errbuf_size)
 529 {
 530   const char *msg;
 531   size_t msg_size;
 532
 533   if (BE (errcode < 0
 534           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 535                                / sizeof (__re_error_msgid_idx[0])), 0))
 536     /* Only error codes returned by the rest of the code should be passed
 537        to this routine.  If we are given anything else, or if other regex
 538        code generates an invalid error code, then the program has a bug.
 539        Dump core so we can fix it.  */
 540     abort ();
 541
 542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 543
 544   msg_size = strlen (msg) + 1; /* Includes the null.  */
 545
 546   if (BE (errbuf_size != 0, 1))
 547     {
 548       size_t cpy_size = msg_size;
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551           cpy_size = errbuf_size - 1;
 552           errbuf[cpy_size] = '\0';
 553         }
 554       memcpy (errbuf, msg, cpy_size);
 555     }
 556
 557   return msg_size;
 558 }
 559 #ifdef _LIBC
 560 weak_alias (__regerror, regerror)
 561 #endif
 562
 563
 564 #ifdef RE_ENABLE_I18N
 565 /* This static array is used for the map to single-byte characters when
 566    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 567    it the same all the time.  UTF-8 is the preferred encoding so this is
 568    a worthwhile optimization.  */
 569 static const bitset_t utf8_sb_map =
 570 {
 571   /* Set the first 128 bits.  */
 572 # if defined __GNUC__ && !defined __STRICT_ANSI__
 573   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 574 # else
 575 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 576 #   error "bitset_word_t is narrower than 32 bits"
 577 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 578   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 579 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 580   BITSET_WORD_MAX, BITSET_WORD_MAX,
 581 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 582   BITSET_WORD_MAX,
 583 #  endif
 584   (BITSET_WORD_MAX
 585    >> (SBC_MAX % BITSET_WORD_BITS == 0
 586        ? 0
 587        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 588 # endif
 589 };
 590 #endif
 591
 592
 593 static void
 594 free_dfa_content (re_dfa_t *dfa)
 595 {
 596   Idx i, j;
 597
 598   if (dfa->nodes)
 599     for (i = 0; i < dfa->nodes_len; ++i)
 600       free_token (dfa->nodes + i);
 601   re_free (dfa->nexts);
 602   for (i = 0; i < dfa->nodes_len; ++i)
 603     {
 604       if (dfa->eclosures != NULL)
 605         re_node_set_free (dfa->eclosures + i);
 606       if (dfa->inveclosures != NULL)
 607         re_node_set_free (dfa->inveclosures + i);
 608       if (dfa->edests != NULL)
 609         re_node_set_free (dfa->edests + i);
 610     }
 611   re_free (dfa->edests);
 612   re_free (dfa->eclosures);
 613   re_free (dfa->inveclosures);
 614   re_free (dfa->nodes);
 615
 616   if (dfa->state_table)
 617     for (i = 0; i <= dfa->state_hash_mask; ++i)
 618       {
 619         struct re_state_table_entry *entry = dfa->state_table + i;
 620         for (j = 0; j < entry->num; ++j)
 621           {
 622             re_dfastate_t *state = entry->array[j];
 623             free_state (state);
 624           }
 625         re_free (entry->array);
 626       }
 627   re_free (dfa->state_table);
 628 #ifdef RE_ENABLE_I18N
 629   if (dfa->sb_char != utf8_sb_map)
 630     re_free (dfa->sb_char);
 631 #endif
 632   re_free (dfa->subexp_map);
 633 #ifdef DEBUG
 634   re_free (dfa->re_str);
 635 #endif
 636
 637   re_free (dfa);
 638 }
 639
 640
 641 /* Free dynamically allocated space used by PREG.  */
 642
 643 void
 644 regfree (regex_t *preg)
 645 {
 646   re_dfa_t *dfa = preg->buffer;
 647   if (BE (dfa != NULL, 1))
 648     {
 649       lock_fini (dfa->lock);
 650       free_dfa_content (dfa);
 651     }
 652   preg->buffer = NULL;
 653   preg->allocated = 0;
 654
 655   re_free (preg->fastmap);
 656   preg->fastmap = NULL;
 657
 658   re_free (preg->translate);
 659   preg->translate = NULL;
 660 }
 661 #ifdef _LIBC
 662 libc_hidden_def (__regfree)
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (const char *s)
 682 {
 683   reg_errcode_t ret;
 684   char *fastmap;
 685
 686   if (!s)
 687     {
 688       if (!re_comp_buf.buffer)
 689         return gettext ("No previous regular expression");
 690       return 0;
 691     }
 692
 693   if (re_comp_buf.buffer)
 694     {
 695       fastmap = re_comp_buf.fastmap;
 696       re_comp_buf.fastmap = NULL;
 697       __regfree (&re_comp_buf);
 698       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 699       re_comp_buf.fastmap = fastmap;
 700     }
 701
 702   if (re_comp_buf.fastmap == NULL)
 703     {
 704       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 705       if (re_comp_buf.fastmap == NULL)
 706         return (char *) gettext (__re_error_msgid
 707                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 708     }
 709
 710   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 711      don't need to initialize the pattern buffer fields which affect it.  */
 712
 713   /* Match anchors at newlines.  */
 714   re_comp_buf.newline_anchor = 1;
 715
 716   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 717
 718   if (!ret)
 719     return NULL;
 720
 721   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 722   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 723 }
 724
 725 #ifdef _LIBC
 726 libc_freeres_fn (free_mem)
 727 {
 728   __regfree (&re_comp_buf);
 729 }
 730 #endif
 731
 732 #endif /* _REGEX_RE_COMP */
 733 \f
 734 /* Internal entry point.
 735    Compile the regular expression PATTERN, whose length is LENGTH.
 736    SYNTAX indicate regular expression's syntax.  */
 737
 738 static reg_errcode_t
 739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 740                      reg_syntax_t syntax)
 741 {
 742   reg_errcode_t err = REG_NOERROR;
 743   re_dfa_t *dfa;
 744   re_string_t regexp;
 745
 746   /* Initialize the pattern buffer.  */
 747   preg->fastmap_accurate = 0;
 748   preg->syntax = syntax;
 749   preg->not_bol = preg->not_eol = 0;
 750   preg->used = 0;
 751   preg->re_nsub = 0;
 752   preg->can_be_null = 0;
 753   preg->regs_allocated = REGS_UNALLOCATED;
 754
 755   /* Initialize the dfa.  */
 756   dfa = preg->buffer;
 757   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 758     {
 759       /* If zero allocated, but buffer is non-null, try to realloc
 760          enough space.  This loses if buffer's address is bogus, but
 761          that is the user's responsibility.  If ->buffer is NULL this
 762          is a simple allocation.  */
 763       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 764       if (dfa == NULL)
 765         return REG_ESPACE;
 766       preg->allocated = sizeof (re_dfa_t);
 767       preg->buffer = dfa;
 768     }
 769   preg->used = sizeof (re_dfa_t);
 770
 771   err = init_dfa (dfa, length);
 772   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 773     err = REG_ESPACE;
 774   if (BE (err != REG_NOERROR, 0))
 775     {
 776       free_dfa_content (dfa);
 777       preg->buffer = NULL;
 778       preg->allocated = 0;
 779       return err;
 780     }
 781 #ifdef DEBUG
 782   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 783   dfa->re_str = re_malloc (char, length + 1);
 784   strncpy (dfa->re_str, pattern, length + 1);
 785 #endif
 786
 787   err = re_string_construct (&regexp, pattern, length, preg->translate,
 788                              (syntax & RE_ICASE) != 0, dfa);
 789   if (BE (err != REG_NOERROR, 0))
 790     {
 791     re_compile_internal_free_return:
 792       free_workarea_compile (preg);
 793       re_string_destruct (&regexp);
 794       lock_fini (dfa->lock);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807   /* Analyze the tree and create the nfa.  */
 808   err = analyze (preg);
 809   if (BE (err != REG_NOERROR, 0))
 810     goto re_compile_internal_free_return;
 811
 812 #ifdef RE_ENABLE_I18N
 813   /* If possible, do searching in single byte encoding to speed things up.  */
 814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815     optimize_utf8 (dfa);
 816 #endif
 817
 818   /* Then create the initial state of the dfa.  */
 819   err = create_initial_state (dfa);
 820
 821   /* Release work areas.  */
 822   free_workarea_compile (preg);
 823   re_string_destruct (&regexp);
 824
 825   if (BE (err != REG_NOERROR, 0))
 826     {
 827       lock_fini (dfa->lock);
 828       free_dfa_content (dfa);
 829       preg->buffer = NULL;
 830       preg->allocated = 0;
 831     }
 832
 833   return err;
 834 }
 835
 836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837    as the initial length of some arrays.  */
 838
 839 static reg_errcode_t
 840 init_dfa (re_dfa_t *dfa, size_t pat_len)
 841 {
 842   __re_size_t table_size;
 843 #if !defined(_LIBC) && !defined(_REGEX_STANDALONE)
 844   const char *codeset_name;
 845 #endif
 846 #ifdef RE_ENABLE_I18N
 847   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 848 #else
 849   size_t max_i18n_object_size = 0;
 850 #endif
 851   size_t max_object_size =
 852     MAX (sizeof (struct re_state_table_entry),
 853          MAX (sizeof (re_token_t),
 854               MAX (sizeof (re_node_set),
 855                    MAX (sizeof (regmatch_t),
 856                         max_i18n_object_size))));
 857
 858   memset (dfa, '\0', sizeof (re_dfa_t));
 859
 860   /* Force allocation of str_tree_storage the first time.  */
 861   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 862
 863   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 864      calculation below, and for similar doubling calculations
 865      elsewhere.  And it's <= rather than <, because some of the
 866      doubling calculations add 1 afterwards.  */
 867   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 868     return REG_ESPACE;
 869
 870   dfa->nodes_alloc = pat_len + 1;
 871   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 872
 873   /*  table_size = 2 ^ ceil(log pat_len) */
 874   for (table_size = 1; ; table_size <<= 1)
 875     if (table_size > pat_len)
 876       break;
 877
 878   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 879   dfa->state_hash_mask = table_size - 1;
 880
 881   dfa->mb_cur_max = MB_CUR_MAX;
 882 #ifdef _LIBC
 883   if (dfa->mb_cur_max == 6
 884       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 885     dfa->is_utf8 = 1;
 886   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 887                        != 0);
 888 #else
 889 # if !defined(_REGEX_STANDALONE)
 890   codeset_name = nl_langinfo (CODESET);
 891   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 892       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 893       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 894       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 895     dfa->is_utf8 = 1;
 896 # endif
 897
 898   /* We check exhaustively in the loop below if this charset is a
 899      superset of ASCII.  */
 900   dfa->map_notascii = 0;
 901 #endif
 902
 903 #ifdef RE_ENABLE_I18N
 904   if (dfa->mb_cur_max > 1)
 905     {
 906       if (dfa->is_utf8)
 907         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 908       else
 909         {
 910           int i, j, ch;
 911
 912           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 913           if (BE (dfa->sb_char == NULL, 0))
 914             return REG_ESPACE;
 915
 916           /* Set the bits corresponding to single byte chars.  */
 917           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 918             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 919               {
 920                 wint_t wch = __btowc (ch);
 921                 if (wch != WEOF)
 922                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 923 # ifndef _LIBC
 924                 if (isascii (ch) && wch != ch)
 925                   dfa->map_notascii = 1;
 926 # endif
 927               }
 928         }
 929     }
 930 #endif
 931
 932   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 933     return REG_ESPACE;
 934   return REG_NOERROR;
 935 }
 936
 937 /* Initialize WORD_CHAR table, which indicate which character is
 938    "word".  In this case "word" means that it is the word construction
 939    character used by some operators like "\<", "\>", etc.  */
 940
 941 static void
 942 init_word_char (re_dfa_t *dfa)
 943 {
 944   int i = 0;
 945   int j;
 946   int ch = 0;
 947   dfa->word_ops_used = 1;
 948   if (BE (dfa->map_notascii == 0, 1))
 949     {
 950       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 951          them, an issue when this code is used in Gnulib.  */
 952       bitset_word_t bits0 = 0x00000000;
 953       bitset_word_t bits1 = 0x03ff0000;
 954       bitset_word_t bits2 = 0x87fffffe;
 955       bitset_word_t bits3 = 0x07fffffe;
 956       if (BITSET_WORD_BITS == 64)
 957         {
 958           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 959           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 960           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 961           i = 2;
 962         }
 963       else if (BITSET_WORD_BITS == 32)
 964         {
 965           dfa->word_char[0] = bits0;
 966           dfa->word_char[1] = bits1;
 967           dfa->word_char[2] = bits2;
 968           dfa->word_char[3] = bits3;
 969           i = 4;
 970         }
 971       else
 972         goto general_case;
 973       ch = 128;
 974
 975       if (BE (dfa->is_utf8, 1))
 976         {
 977           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 978           return;
 979         }
 980     }
 981
 982  general_case:
 983   for (; i < BITSET_WORDS; ++i)
 984     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 985       if (isalnum (ch) || ch == '_')
 986         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 987 }
 988
 989 /* Free the work area which are only used while compiling.  */
 990
 991 static void
 992 free_workarea_compile (regex_t *preg)
 993 {
 994   re_dfa_t *dfa = preg->buffer;
 995   bin_tree_storage_t *storage, *next;
 996   for (storage = dfa->str_tree_storage; storage; storage = next)
 997     {
 998       next = storage->next;
 999       re_free (storage);
1000     }
1001   dfa->str_tree_storage = NULL;
1002   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1003   dfa->str_tree = NULL;
1004   re_free (dfa->org_indices);
1005   dfa->org_indices = NULL;
1006 }
1007
1008 /* Create initial states for all contexts.  */
1009
1010 static reg_errcode_t
1011 create_initial_state (re_dfa_t *dfa)
1012 {
1013   Idx first, i;
1014   reg_errcode_t err;
1015   re_node_set init_nodes;
1016
1017   /* Initial states have the epsilon closure of the node which is
1018      the first node of the regular expression.  */
1019   first = dfa->str_tree->first->node_idx;
1020   dfa->init_node = first;
1021   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1022   if (BE (err != REG_NOERROR, 0))
1023     return err;
1024
1025   /* The back-references which are in initial states can epsilon transit,
1026      since in this case all of the subexpressions can be null.
1027      Then we add epsilon closures of the nodes which are the next nodes of
1028      the back-references.  */
1029   if (dfa->nbackref > 0)
1030     for (i = 0; i < init_nodes.nelem; ++i)
1031       {
1032         Idx node_idx = init_nodes.elems[i];
1033         re_token_type_t type = dfa->nodes[node_idx].type;
1034
1035         Idx clexp_idx;
1036         if (type != OP_BACK_REF)
1037           continue;
1038         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1039           {
1040             re_token_t *clexp_node;
1041             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1042             if (clexp_node->type == OP_CLOSE_SUBEXP
1043                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1044               break;
1045           }
1046         if (clexp_idx == init_nodes.nelem)
1047           continue;
1048
1049         if (type == OP_BACK_REF)
1050           {
1051             Idx dest_idx = dfa->edests[node_idx].elems[0];
1052             if (!re_node_set_contains (&init_nodes, dest_idx))
1053               {
1054                 reg_errcode_t merge_err
1055                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1056                 if (merge_err != REG_NOERROR)
1057                   return merge_err;
1058                 i = 0;
1059               }
1060           }
1061       }
1062
1063   /* It must be the first time to invoke acquire_state.  */
1064   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1065   /* We don't check ERR here, since the initial state must not be NULL.  */
1066   if (BE (dfa->init_state == NULL, 0))
1067     return err;
1068   if (dfa->init_state->has_constraint)
1069     {
1070       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1071                                                        CONTEXT_WORD);
1072       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1073                                                      CONTEXT_NEWLINE);
1074       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1075                                                          &init_nodes,
1076                                                          CONTEXT_NEWLINE
1077                                                          | CONTEXT_BEGBUF);
1078       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1079               || dfa->init_state_begbuf == NULL, 0))
1080         return err;
1081     }
1082   else
1083     dfa->init_state_word = dfa->init_state_nl
1084       = dfa->init_state_begbuf = dfa->init_state;
1085
1086   re_node_set_free (&init_nodes);
1087   return REG_NOERROR;
1088 }
1089 \f
1090 #ifdef RE_ENABLE_I18N
1091 /* If it is possible to do searching in single byte encoding instead of UTF-8
1092    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1093    DFA nodes where needed.  */
1094
1095 static void
1096 optimize_utf8 (re_dfa_t *dfa)
1097 {
1098   Idx node;
1099   int i;
1100   bool mb_chars = false;
1101   bool has_period = false;
1102
1103   for (node = 0; node < dfa->nodes_len; ++node)
1104     switch (dfa->nodes[node].type)
1105       {
1106       case CHARACTER:
1107         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1108           mb_chars = true;
1109         break;
1110       case ANCHOR:
1111         switch (dfa->nodes[node].opr.ctx_type)
1112           {
1113           case LINE_FIRST:
1114           case LINE_LAST:
1115           case BUF_FIRST:
1116           case BUF_LAST:
1117             break;
1118           default:
1119             /* Word anchors etc. cannot be handled.  It's okay to test
1120                opr.ctx_type since constraints (for all DFA nodes) are
1121                created by ORing one or more opr.ctx_type values.  */
1122             return;
1123           }
1124         break;
1125       case OP_PERIOD:
1126         has_period = true;
1127         break;
1128       case OP_BACK_REF:
1129       case OP_ALT:
1130       case END_OF_RE:
1131       case OP_DUP_ASTERISK:
1132       case OP_OPEN_SUBEXP:
1133       case OP_CLOSE_SUBEXP:
1134         break;
1135       case COMPLEX_BRACKET:
1136         return;
1137       case SIMPLE_BRACKET:
1138         /* Just double check.  */
1139         {
1140           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1141                         ? 0
1142                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1143           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1144             {
1145               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1146                 return;
1147               rshift = 0;
1148             }
1149         }
1150         break;
1151       default:
1152         abort ();
1153       }
1154
1155   if (mb_chars || has_period)
1156     for (node = 0; node < dfa->nodes_len; ++node)
1157       {
1158         if (dfa->nodes[node].type == CHARACTER
1159             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1160           dfa->nodes[node].mb_partial = 0;
1161         else if (dfa->nodes[node].type == OP_PERIOD)
1162           dfa->nodes[node].type = OP_UTF8_PERIOD;
1163       }
1164
1165   /* The search can be in single byte locale.  */
1166   dfa->mb_cur_max = 1;
1167   dfa->is_utf8 = 0;
1168   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1169 }
1170 #endif
1171 \f
1172 /* Analyze the structure tree, and calculate "first", "next", "edest",
1173    "eclosure", and "inveclosure".  */
1174
1175 static reg_errcode_t
1176 analyze (regex_t *preg)
1177 {
1178   re_dfa_t *dfa = preg->buffer;
1179   reg_errcode_t ret;
1180
1181   /* Allocate arrays.  */
1182   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1183   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1184   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1185   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1186   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1187           || dfa->eclosures == NULL, 0))
1188     return REG_ESPACE;
1189
1190   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1191   if (dfa->subexp_map != NULL)
1192     {
1193       Idx i;
1194       for (i = 0; i < preg->re_nsub; i++)
1195         dfa->subexp_map[i] = i;
1196       preorder (dfa->str_tree, optimize_subexps, dfa);
1197       for (i = 0; i < preg->re_nsub; i++)
1198         if (dfa->subexp_map[i] != i)
1199           break;
1200       if (i == preg->re_nsub)
1201         {
1202           re_free (dfa->subexp_map);
1203           dfa->subexp_map = NULL;
1204         }
1205     }
1206
1207   ret = postorder (dfa->str_tree, lower_subexps, preg);
1208   if (BE (ret != REG_NOERROR, 0))
1209     return ret;
1210   ret = postorder (dfa->str_tree, calc_first, dfa);
1211   if (BE (ret != REG_NOERROR, 0))
1212     return ret;
1213   preorder (dfa->str_tree, calc_next, dfa);
1214   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1215   if (BE (ret != REG_NOERROR, 0))
1216     return ret;
1217   ret = calc_eclosure (dfa);
1218   if (BE (ret != REG_NOERROR, 0))
1219     return ret;
1220
1221   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1222      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1223   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1224       || dfa->nbackref)
1225     {
1226       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1227       if (BE (dfa->inveclosures == NULL, 0))
1228         return REG_ESPACE;
1229       ret = calc_inveclosure (dfa);
1230     }
1231
1232   return ret;
1233 }
1234
1235 /* Our parse trees are very unbalanced, so we cannot use a stack to
1236    implement parse tree visits.  Instead, we use parent pointers and
1237    some hairy code in these two functions.  */
1238 static reg_errcode_t
1239 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1240            void *extra)
1241 {
1242   bin_tree_t *node, *prev;
1243
1244   for (node = root; ; )
1245     {
1246       /* Descend down the tree, preferably to the left (or to the right
1247          if that's the only child).  */
1248       while (node->left || node->right)
1249         if (node->left)
1250           node = node->left;
1251         else
1252           node = node->right;
1253
1254       do
1255         {
1256           reg_errcode_t err = fn (extra, node);
1257           if (BE (err != REG_NOERROR, 0))
1258             return err;
1259           if (node->parent == NULL)
1260             return REG_NOERROR;
1261           prev = node;
1262           node = node->parent;
1263         }
1264       /* Go up while we have a node that is reached from the right.  */
1265       while (node->right == prev || node->right == NULL);
1266       node = node->right;
1267     }
1268 }
1269
1270 static reg_errcode_t
1271 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1272           void *extra)
1273 {
1274   bin_tree_t *node;
1275
1276   for (node = root; ; )
1277     {
1278       reg_errcode_t err = fn (extra, node);
1279       if (BE (err != REG_NOERROR, 0))
1280         return err;
1281
1282       /* Go to the left node, or up and to the right.  */
1283       if (node->left)
1284         node = node->left;
1285       else
1286         {
1287           bin_tree_t *prev = NULL;
1288           while (node->right == prev || node->right == NULL)
1289             {
1290               prev = node;
1291               node = node->parent;
1292               if (!node)
1293                 return REG_NOERROR;
1294             }
1295           node = node->right;
1296         }
1297     }
1298 }
1299
1300 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1301    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1302    backreferences as well.  Requires a preorder visit.  */
1303 static reg_errcode_t
1304 optimize_subexps (void *extra, bin_tree_t *node)
1305 {
1306   re_dfa_t *dfa = (re_dfa_t *) extra;
1307
1308   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1309     {
1310       int idx = node->token.opr.idx;
1311       node->token.opr.idx = dfa->subexp_map[idx];
1312       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1313     }
1314
1315   else if (node->token.type == SUBEXP
1316            && node->left && node->left->token.type == SUBEXP)
1317     {
1318       Idx other_idx = node->left->token.opr.idx;
1319
1320       node->left = node->left->left;
1321       if (node->left)
1322         node->left->parent = node;
1323
1324       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1325       if (other_idx < BITSET_WORD_BITS)
1326         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1327     }
1328
1329   return REG_NOERROR;
1330 }
1331
1332 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1333    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1334 static reg_errcode_t
1335 lower_subexps (void *extra, bin_tree_t *node)
1336 {
1337   regex_t *preg = (regex_t *) extra;
1338   reg_errcode_t err = REG_NOERROR;
1339
1340   if (node->left && node->left->token.type == SUBEXP)
1341     {
1342       node->left = lower_subexp (&err, preg, node->left);
1343       if (node->left)
1344         node->left->parent = node;
1345     }
1346   if (node->right && node->right->token.type == SUBEXP)
1347     {
1348       node->right = lower_subexp (&err, preg, node->right);
1349       if (node->right)
1350         node->right->parent = node;
1351     }
1352
1353   return err;
1354 }
1355
1356 static bin_tree_t *
1357 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1358 {
1359   re_dfa_t *dfa = preg->buffer;
1360   bin_tree_t *body = node->left;
1361   bin_tree_t *op, *cls, *tree1, *tree;
1362
1363   if (preg->no_sub
1364       /* We do not optimize empty subexpressions, because otherwise we may
1365          have bad CONCAT nodes with NULL children.  This is obviously not
1366          very common, so we do not lose much.  An example that triggers
1367          this case is the sed "script" /\(\)/x.  */
1368       && node->left != NULL
1369       && (node->token.opr.idx >= BITSET_WORD_BITS
1370           || !(dfa->used_bkref_map
1371                & ((bitset_word_t) 1 << node->token.opr.idx))))
1372     return node->left;
1373
1374   /* Convert the SUBEXP node to the concatenation of an
1375      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1376   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1377   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1378   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1379   tree = create_tree (dfa, op, tree1, CONCAT);
1380   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1381     {
1382       *err = REG_ESPACE;
1383       return NULL;
1384     }
1385
1386   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388   return tree;
1389 }
1390
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392    nodes.  Requires a postorder visit.  */
1393 static reg_errcode_t
1394 calc_first (void *extra, bin_tree_t *node)
1395 {
1396   re_dfa_t *dfa = (re_dfa_t *) extra;
1397   if (node->token.type == CONCAT)
1398     {
1399       node->first = node->left->first;
1400       node->node_idx = node->left->node_idx;
1401     }
1402   else
1403     {
1404       node->first = node;
1405       node->node_idx = re_dfa_add_node (dfa, node->token);
1406       if (BE (node->node_idx == -1, 0))
1407         return REG_ESPACE;
1408       if (node->token.type == ANCHOR)
1409         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1410     }
1411   return REG_NOERROR;
1412 }
1413
1414 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1415 static reg_errcode_t
1416 calc_next (void *extra, bin_tree_t *node)
1417 {
1418   switch (node->token.type)
1419     {
1420     case OP_DUP_ASTERISK:
1421       node->left->next = node;
1422       break;
1423     case CONCAT:
1424       node->left->next = node->right->first;
1425       node->right->next = node->next;
1426       break;
1427     default:
1428       if (node->left)
1429         node->left->next = node->next;
1430       if (node->right)
1431         node->right->next = node->next;
1432       break;
1433     }
1434   return REG_NOERROR;
1435 }
1436
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra, bin_tree_t *node)
1440 {
1441   re_dfa_t *dfa = (re_dfa_t *) extra;
1442   Idx idx = node->node_idx;
1443   reg_errcode_t err = REG_NOERROR;
1444
1445   switch (node->token.type)
1446     {
1447     case CONCAT:
1448       break;
1449
1450     case END_OF_RE:
1451       assert (node->next == NULL);
1452       break;
1453
1454     case OP_DUP_ASTERISK:
1455     case OP_ALT:
1456       {
1457         Idx left, right;
1458         dfa->has_plural_match = 1;
1459         if (node->left != NULL)
1460           left = node->left->first->node_idx;
1461         else
1462           left = node->next->node_idx;
1463         if (node->right != NULL)
1464           right = node->right->first->node_idx;
1465         else
1466           right = node->next->node_idx;
1467         assert (left > -1);
1468         assert (right > -1);
1469         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1470       }
1471       break;
1472
1473     case ANCHOR:
1474     case OP_OPEN_SUBEXP:
1475     case OP_CLOSE_SUBEXP:
1476       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477       break;
1478
1479     case OP_BACK_REF:
1480       dfa->nexts[idx] = node->next->node_idx;
1481       if (node->token.type == OP_BACK_REF)
1482         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483       break;
1484
1485     default:
1486       assert (!IS_EPSILON_NODE (node->token.type));
1487       dfa->nexts[idx] = node->next->node_idx;
1488       break;
1489     }
1490
1491   return err;
1492 }
1493
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496    to their own constraint.  */
1497
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500                         Idx root_node, unsigned int init_constraint)
1501 {
1502   Idx org_node, clone_node;
1503   bool ok;
1504   unsigned int constraint = init_constraint;
1505   for (org_node = top_org_node, clone_node = top_clone_node;;)
1506     {
1507       Idx org_dest, clone_dest;
1508       if (dfa->nodes[org_node].type == OP_BACK_REF)
1509         {
1510           /* If the back reference epsilon-transit, its destination must
1511              also have the constraint.  Then duplicate the epsilon closure
1512              of the destination of the back reference, and store it in
1513              edests of the back reference.  */
1514           org_dest = dfa->nexts[org_node];
1515           re_node_set_empty (dfa->edests + clone_node);
1516           clone_dest = duplicate_node (dfa, org_dest, constraint);
1517           if (BE (clone_dest == -1, 0))
1518             return REG_ESPACE;
1519           dfa->nexts[clone_node] = dfa->nexts[org_node];
1520           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521           if (BE (! ok, 0))
1522             return REG_ESPACE;
1523         }
1524       else if (dfa->edests[org_node].nelem == 0)
1525         {
1526           /* In case of the node can't epsilon-transit, don't duplicate the
1527              destination and store the original destination as the
1528              destination of the node.  */
1529           dfa->nexts[clone_node] = dfa->nexts[org_node];
1530           break;
1531         }
1532       else if (dfa->edests[org_node].nelem == 1)
1533         {
1534           /* In case of the node can epsilon-transit, and it has only one
1535              destination.  */
1536           org_dest = dfa->edests[org_node].elems[0];
1537           re_node_set_empty (dfa->edests + clone_node);
1538           /* If the node is root_node itself, it means the epsilon closure
1539              has a loop.  Then tie it to the destination of the root_node.  */
1540           if (org_node == root_node && clone_node != org_node)
1541             {
1542               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543               if (BE (! ok, 0))
1544                 return REG_ESPACE;
1545               break;
1546             }
1547           /* In case the node has another constraint, append it.  */
1548           constraint |= dfa->nodes[org_node].constraint;
1549           clone_dest = duplicate_node (dfa, org_dest, constraint);
1550           if (BE (clone_dest == -1, 0))
1551             return REG_ESPACE;
1552           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553           if (BE (! ok, 0))
1554             return REG_ESPACE;
1555         }
1556       else /* dfa->edests[org_node].nelem == 2 */
1557         {
1558           /* In case of the node can epsilon-transit, and it has two
1559              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1560           org_dest = dfa->edests[org_node].elems[0];
1561           re_node_set_empty (dfa->edests + clone_node);
1562           /* Search for a duplicated node which satisfies the constraint.  */
1563           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564           if (clone_dest == -1)
1565             {
1566               /* There is no such duplicated node, create a new one.  */
1567               reg_errcode_t err;
1568               clone_dest = duplicate_node (dfa, org_dest, constraint);
1569               if (BE (clone_dest == -1, 0))
1570                 return REG_ESPACE;
1571               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572               if (BE (! ok, 0))
1573                 return REG_ESPACE;
1574               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575                                             root_node, constraint);
1576               if (BE (err != REG_NOERROR, 0))
1577                 return err;
1578             }
1579           else
1580             {
1581               /* There is a duplicated node which satisfies the constraint,
1582                  use it to avoid infinite loop.  */
1583               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584               if (BE (! ok, 0))
1585                 return REG_ESPACE;
1586             }
1587
1588           org_dest = dfa->edests[org_node].elems[1];
1589           clone_dest = duplicate_node (dfa, org_dest, constraint);
1590           if (BE (clone_dest == -1, 0))
1591             return REG_ESPACE;
1592           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593           if (BE (! ok, 0))
1594             return REG_ESPACE;
1595         }
1596       org_node = org_dest;
1597       clone_node = clone_dest;
1598     }
1599   return REG_NOERROR;
1600 }
1601
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603    satisfies the constraint CONSTRAINT.  */
1604
1605 static Idx
1606 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607                         unsigned int constraint)
1608 {
1609   Idx idx;
1610   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1611     {
1612       if (org_node == dfa->org_indices[idx]
1613           && constraint == dfa->nodes[idx].constraint)
1614         return idx; /* Found.  */
1615     }
1616   return -1; /* Not found.  */
1617 }
1618
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620    Return the index of the new node, or -1 if insufficient storage is
1621    available.  */
1622
1623 static Idx
1624 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1625 {
1626   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627   if (BE (dup_idx != -1, 1))
1628     {
1629       dfa->nodes[dup_idx].constraint = constraint;
1630       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1631       dfa->nodes[dup_idx].duplicated = 1;
1632
1633       /* Store the index of the original node.  */
1634       dfa->org_indices[dup_idx] = org_idx;
1635     }
1636   return dup_idx;
1637 }
1638
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t *dfa)
1641 {
1642   Idx src, idx;
1643   bool ok;
1644   for (idx = 0; idx < dfa->nodes_len; ++idx)
1645     re_node_set_init_empty (dfa->inveclosures + idx);
1646
1647   for (src = 0; src < dfa->nodes_len; ++src)
1648     {
1649       Idx *elems = dfa->eclosures[src].elems;
1650       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1651         {
1652           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653           if (BE (! ok, 0))
1654             return REG_ESPACE;
1655         }
1656     }
1657
1658   return REG_NOERROR;
1659 }
1660
1661 /* Calculate "eclosure" for all the node in DFA.  */
1662
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t *dfa)
1665 {
1666   Idx node_idx;
1667   bool incomplete;
1668 #ifdef DEBUG
1669   assert (dfa->nodes_len > 0);
1670 #endif
1671   incomplete = false;
1672   /* For each nodes, calculate epsilon closure.  */
1673   for (node_idx = 0; ; ++node_idx)
1674     {
1675       reg_errcode_t err;
1676       re_node_set eclosure_elem;
1677       if (node_idx == dfa->nodes_len)
1678         {
1679           if (!incomplete)
1680             break;
1681           incomplete = false;
1682           node_idx = 0;
1683         }
1684
1685 #ifdef DEBUG
1686       assert (dfa->eclosures[node_idx].nelem != -1);
1687 #endif
1688
1689       /* If we have already calculated, skip it.  */
1690       if (dfa->eclosures[node_idx].nelem != 0)
1691         continue;
1692       /* Calculate epsilon closure of 'node_idx'.  */
1693       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1694       if (BE (err != REG_NOERROR, 0))
1695         return err;
1696
1697       if (dfa->eclosures[node_idx].nelem == 0)
1698         {
1699           incomplete = true;
1700           re_node_set_free (&eclosure_elem);
1701         }
1702     }
1703   return REG_NOERROR;
1704 }
1705
1706 /* Calculate epsilon closure of NODE.  */
1707
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1710 {
1711   reg_errcode_t err;
1712   Idx i;
1713   re_node_set eclosure;
1714   bool ok;
1715   bool incomplete = false;
1716   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1717   if (BE (err != REG_NOERROR, 0))
1718     return err;
1719
1720   /* This indicates that we are calculating this node now.
1721      We reference this value to avoid infinite loop.  */
1722   dfa->eclosures[node].nelem = -1;
1723
1724   /* If the current node has constraints, duplicate all nodes
1725      since they must inherit the constraints.  */
1726   if (dfa->nodes[node].constraint
1727       && dfa->edests[node].nelem
1728       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1729     {
1730       err = duplicate_node_closure (dfa, node, node, node,
1731                                     dfa->nodes[node].constraint);
1732       if (BE (err != REG_NOERROR, 0))
1733         return err;
1734     }
1735
1736   /* Expand each epsilon destination nodes.  */
1737   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1738     for (i = 0; i < dfa->edests[node].nelem; ++i)
1739       {
1740         re_node_set eclosure_elem;
1741         Idx edest = dfa->edests[node].elems[i];
1742         /* If calculating the epsilon closure of 'edest' is in progress,
1743            return intermediate result.  */
1744         if (dfa->eclosures[edest].nelem == -1)
1745           {
1746             incomplete = true;
1747             continue;
1748           }
1749         /* If we haven't calculated the epsilon closure of 'edest' yet,
1750            calculate now. Otherwise use calculated epsilon closure.  */
1751         if (dfa->eclosures[edest].nelem == 0)
1752           {
1753             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1754             if (BE (err != REG_NOERROR, 0))
1755               return err;
1756           }
1757         else
1758           eclosure_elem = dfa->eclosures[edest];
1759         /* Merge the epsilon closure of 'edest'.  */
1760         err = re_node_set_merge (&eclosure, &eclosure_elem);
1761         if (BE (err != REG_NOERROR, 0))
1762           return err;
1763         /* If the epsilon closure of 'edest' is incomplete,
1764            the epsilon closure of this node is also incomplete.  */
1765         if (dfa->eclosures[edest].nelem == 0)
1766           {
1767             incomplete = true;
1768             re_node_set_free (&eclosure_elem);
1769           }
1770       }
1771
1772   /* An epsilon closure includes itself.  */
1773   ok = re_node_set_insert (&eclosure, node);
1774   if (BE (! ok, 0))
1775     return REG_ESPACE;
1776   if (incomplete && !root)
1777     dfa->eclosures[node].nelem = 0;
1778   else
1779     dfa->eclosures[node] = eclosure;
1780   *new_set = eclosure;
1781   return REG_NOERROR;
1782 }
1783 \f
1784 /* Functions for token which are used in the parser.  */
1785
1786 /* Fetch a token from INPUT.
1787    We must not use this function inside bracket expressions.  */
1788
1789 static void
1790 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1791 {
1792   re_string_skip_bytes (input, peek_token (result, input, syntax));
1793 }
1794
1795 /* Peek a token from INPUT, and return the length of the token.
1796    We must not use this function inside bracket expressions.  */
1797
1798 static int
1799 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1800 {
1801   unsigned char c;
1802
1803   if (re_string_eoi (input))
1804     {
1805       token->type = END_OF_RE;
1806       return 0;
1807     }
1808
1809   c = re_string_peek_byte (input, 0);
1810   token->opr.c = c;
1811
1812   token->word_char = 0;
1813 #ifdef RE_ENABLE_I18N
1814   token->mb_partial = 0;
1815   if (input->mb_cur_max > 1 &&
1816       !re_string_first_byte (input, re_string_cur_idx (input)))
1817     {
1818       token->type = CHARACTER;
1819       token->mb_partial = 1;
1820       return 1;
1821     }
1822 #endif
1823   if (c == '\\')
1824     {
1825       unsigned char c2;
1826       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1827         {
1828           token->type = BACK_SLASH;
1829           return 1;
1830         }
1831
1832       c2 = re_string_peek_byte_case (input, 1);
1833       token->opr.c = c2;
1834       token->type = CHARACTER;
1835 #ifdef RE_ENABLE_I18N
1836       if (input->mb_cur_max > 1)
1837         {
1838           wint_t wc = re_string_wchar_at (input,
1839                                           re_string_cur_idx (input) + 1);
1840           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1841         }
1842       else
1843 #endif
1844         token->word_char = IS_WORD_CHAR (c2) != 0;
1845
1846       switch (c2)
1847         {
1848         case '|':
1849           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1850             token->type = OP_ALT;
1851           break;
1852         case '1': case '2': case '3': case '4': case '5':
1853         case '6': case '7': case '8': case '9':
1854           if (!(syntax & RE_NO_BK_REFS))
1855             {
1856               token->type = OP_BACK_REF;
1857               token->opr.idx = c2 - '1';
1858             }
1859           break;
1860         case '<':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             {
1863               token->type = ANCHOR;
1864               token->opr.ctx_type = WORD_FIRST;
1865             }
1866           break;
1867         case '>':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             {
1870               token->type = ANCHOR;
1871               token->opr.ctx_type = WORD_LAST;
1872             }
1873           break;
1874         case 'b':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             {
1877               token->type = ANCHOR;
1878               token->opr.ctx_type = WORD_DELIM;
1879             }
1880           break;
1881         case 'B':
1882           if (!(syntax & RE_NO_GNU_OPS))
1883             {
1884               token->type = ANCHOR;
1885               token->opr.ctx_type = NOT_WORD_DELIM;
1886             }
1887           break;
1888         case 'w':
1889           if (!(syntax & RE_NO_GNU_OPS))
1890             token->type = OP_WORD;
1891           break;
1892         case 'W':
1893           if (!(syntax & RE_NO_GNU_OPS))
1894             token->type = OP_NOTWORD;
1895           break;
1896         case 's':
1897           if (!(syntax & RE_NO_GNU_OPS))
1898             token->type = OP_SPACE;
1899           break;
1900         case 'S':
1901           if (!(syntax & RE_NO_GNU_OPS))
1902             token->type = OP_NOTSPACE;
1903           break;
1904         case '`':
1905           if (!(syntax & RE_NO_GNU_OPS))
1906             {
1907               token->type = ANCHOR;
1908               token->opr.ctx_type = BUF_FIRST;
1909             }
1910           break;
1911         case '\'':
1912           if (!(syntax & RE_NO_GNU_OPS))
1913             {
1914               token->type = ANCHOR;
1915               token->opr.ctx_type = BUF_LAST;
1916             }
1917           break;
1918         case '(':
1919           if (!(syntax & RE_NO_BK_PARENS))
1920             token->type = OP_OPEN_SUBEXP;
1921           break;
1922         case ')':
1923           if (!(syntax & RE_NO_BK_PARENS))
1924             token->type = OP_CLOSE_SUBEXP;
1925           break;
1926         case '+':
1927           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928             token->type = OP_DUP_PLUS;
1929           break;
1930         case '?':
1931           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1932             token->type = OP_DUP_QUESTION;
1933           break;
1934         case '{':
1935           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936             token->type = OP_OPEN_DUP_NUM;
1937           break;
1938         case '}':
1939           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1940             token->type = OP_CLOSE_DUP_NUM;
1941           break;
1942         default:
1943           break;
1944         }
1945       return 2;
1946     }
1947
1948   token->type = CHARACTER;
1949 #ifdef RE_ENABLE_I18N
1950   if (input->mb_cur_max > 1)
1951     {
1952       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1953       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1954     }
1955   else
1956 #endif
1957     token->word_char = IS_WORD_CHAR (token->opr.c);
1958
1959   switch (c)
1960     {
1961     case '\n':
1962       if (syntax & RE_NEWLINE_ALT)
1963         token->type = OP_ALT;
1964       break;
1965     case '|':
1966       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1967         token->type = OP_ALT;
1968       break;
1969     case '*':
1970       token->type = OP_DUP_ASTERISK;
1971       break;
1972     case '+':
1973       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974         token->type = OP_DUP_PLUS;
1975       break;
1976     case '?':
1977       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1978         token->type = OP_DUP_QUESTION;
1979       break;
1980     case '{':
1981       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982         token->type = OP_OPEN_DUP_NUM;
1983       break;
1984     case '}':
1985       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1986         token->type = OP_CLOSE_DUP_NUM;
1987       break;
1988     case '(':
1989       if (syntax & RE_NO_BK_PARENS)
1990         token->type = OP_OPEN_SUBEXP;
1991       break;
1992     case ')':
1993       if (syntax & RE_NO_BK_PARENS)
1994         token->type = OP_CLOSE_SUBEXP;
1995       break;
1996     case '[':
1997       token->type = OP_OPEN_BRACKET;
1998       break;
1999     case '.':
2000       token->type = OP_PERIOD;
2001       break;
2002     case '^':
2003       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2004           re_string_cur_idx (input) != 0)
2005         {
2006           char prev = re_string_peek_byte (input, -1);
2007           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2008             break;
2009         }
2010       token->type = ANCHOR;
2011       token->opr.ctx_type = LINE_FIRST;
2012       break;
2013     case '$':
2014       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2015           re_string_cur_idx (input) + 1 != re_string_length (input))
2016         {
2017           re_token_t next;
2018           re_string_skip_bytes (input, 1);
2019           peek_token (&next, input, syntax);
2020           re_string_skip_bytes (input, -1);
2021           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2022             break;
2023         }
2024       token->type = ANCHOR;
2025       token->opr.ctx_type = LINE_LAST;
2026       break;
2027     default:
2028       break;
2029     }
2030   return 1;
2031 }
2032
2033 /* Peek a token from INPUT, and return the length of the token.
2034    We must not use this function out of bracket expressions.  */
2035
2036 static int
2037 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2038 {
2039   unsigned char c;
2040   if (re_string_eoi (input))
2041     {
2042       token->type = END_OF_RE;
2043       return 0;
2044     }
2045   c = re_string_peek_byte (input, 0);
2046   token->opr.c = c;
2047
2048 #ifdef RE_ENABLE_I18N
2049   if (input->mb_cur_max > 1 &&
2050       !re_string_first_byte (input, re_string_cur_idx (input)))
2051     {
2052       token->type = CHARACTER;
2053       return 1;
2054     }
2055 #endif /* RE_ENABLE_I18N */
2056
2057   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058       && re_string_cur_idx (input) + 1 < re_string_length (input))
2059     {
2060       /* In this case, '\' escape a character.  */
2061       unsigned char c2;
2062       re_string_skip_bytes (input, 1);
2063       c2 = re_string_peek_byte (input, 0);
2064       token->opr.c = c2;
2065       token->type = CHARACTER;
2066       return 1;
2067     }
2068   if (c == '[') /* '[' is a special char in a bracket exps.  */
2069     {
2070       unsigned char c2;
2071       int token_len;
2072       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073         c2 = re_string_peek_byte (input, 1);
2074       else
2075         c2 = 0;
2076       token->opr.c = c2;
2077       token_len = 2;
2078       switch (c2)
2079         {
2080         case '.':
2081           token->type = OP_OPEN_COLL_ELEM;
2082           break;
2083
2084         case '=':
2085           token->type = OP_OPEN_EQUIV_CLASS;
2086           break;
2087
2088         case ':':
2089           if (syntax & RE_CHAR_CLASSES)
2090             {
2091               token->type = OP_OPEN_CHAR_CLASS;
2092               break;
2093             }
2094           FALLTHROUGH;
2095         default:
2096           token->type = CHARACTER;
2097           token->opr.c = c;
2098           token_len = 1;
2099           break;
2100         }
2101       return token_len;
2102     }
2103   switch (c)
2104     {
2105     case '-':
2106       token->type = OP_CHARSET_RANGE;
2107       break;
2108     case ']':
2109       token->type = OP_CLOSE_BRACKET;
2110       break;
2111     case '^':
2112       token->type = OP_NON_MATCH_LIST;
2113       break;
2114     default:
2115       token->type = CHARACTER;
2116     }
2117   return 1;
2118 }
2119 \f
2120 /* Functions for parser.  */
2121
2122 /* Entry point of the parser.
2123    Parse the regular expression REGEXP and return the structure tree.
2124    If an error occurs, ERR is set by error code, and return NULL.
2125    This function build the following tree, from regular expression <reg_exp>:
2126            CAT
2127            / \
2128           /   \
2129    <reg_exp>  EOR
2130
2131    CAT means concatenation.
2132    EOR means end of regular expression.  */
2133
2134 static bin_tree_t *
2135 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2136        reg_errcode_t *err)
2137 {
2138   re_dfa_t *dfa = preg->buffer;
2139   bin_tree_t *tree, *eor, *root;
2140   re_token_t current_token;
2141   dfa->syntax = syntax;
2142   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2143   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2144   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145     return NULL;
2146   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2147   if (tree != NULL)
2148     root = create_tree (dfa, tree, eor, CONCAT);
2149   else
2150     root = eor;
2151   if (BE (eor == NULL || root == NULL, 0))
2152     {
2153       *err = REG_ESPACE;
2154       return NULL;
2155     }
2156   return root;
2157 }
2158
2159 /* This function build the following tree, from regular expression
2160    <branch1>|<branch2>:
2161            ALT
2162            / \
2163           /   \
2164    <branch1> <branch2>
2165
2166    ALT means alternative, which represents the operator '|'.  */
2167
2168 static bin_tree_t *
2169 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2170                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2171 {
2172   re_dfa_t *dfa = preg->buffer;
2173   bin_tree_t *tree, *branch = NULL;
2174   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2175   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2176   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2177     return NULL;
2178
2179   while (token->type == OP_ALT)
2180     {
2181       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2182       if (token->type != OP_ALT && token->type != END_OF_RE
2183           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2184         {
2185           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2186           dfa->completed_bkref_map = initial_bkref_map;
2187           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2188           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2189             {
2190               if (tree != NULL)
2191                 postorder (tree, free_tree, NULL);
2192               return NULL;
2193             }
2194           dfa->completed_bkref_map |= accumulated_bkref_map;
2195         }
2196       else
2197         branch = NULL;
2198       tree = create_tree (dfa, tree, branch, OP_ALT);
2199       if (BE (tree == NULL, 0))
2200         {
2201           *err = REG_ESPACE;
2202           return NULL;
2203         }
2204     }
2205   return tree;
2206 }
2207
2208 /* This function build the following tree, from regular expression
2209    <exp1><exp2>:
2210         CAT
2211         / \
2212        /   \
2213    <exp1> <exp2>
2214
2215    CAT means concatenation.  */
2216
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221   bin_tree_t *tree, *expr;
2222   re_dfa_t *dfa = preg->buffer;
2223   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2225     return NULL;
2226
2227   while (token->type != OP_ALT && token->type != END_OF_RE
2228          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2229     {
2230       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2232         {
2233           if (tree != NULL)
2234             postorder (tree, free_tree, NULL);
2235           return NULL;
2236         }
2237       if (tree != NULL && expr != NULL)
2238         {
2239           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240           if (newtree == NULL)
2241             {
2242               postorder (expr, free_tree, NULL);
2243               postorder (tree, free_tree, NULL);
2244               *err = REG_ESPACE;
2245               return NULL;
2246             }
2247           tree = newtree;
2248         }
2249       else if (tree == NULL)
2250         tree = expr;
2251       /* Otherwise expr == NULL, we don't need to create new tree.  */
2252     }
2253   return tree;
2254 }
2255
2256 /* This function build the following tree, from regular expression a*:
2257          *
2258          |
2259          a
2260 */
2261
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2265 {
2266   re_dfa_t *dfa = preg->buffer;
2267   bin_tree_t *tree;
2268   switch (token->type)
2269     {
2270     case CHARACTER:
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (BE (tree == NULL, 0))
2273         {
2274           *err = REG_ESPACE;
2275           return NULL;
2276         }
2277 #ifdef RE_ENABLE_I18N
2278       if (dfa->mb_cur_max > 1)
2279         {
2280           while (!re_string_eoi (regexp)
2281                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2282             {
2283               bin_tree_t *mbc_remain;
2284               fetch_token (token, regexp, syntax);
2285               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287               if (BE (mbc_remain == NULL || tree == NULL, 0))
2288                 {
2289                   *err = REG_ESPACE;
2290                   return NULL;
2291                 }
2292             }
2293         }
2294 #endif
2295       break;
2296
2297     case OP_OPEN_SUBEXP:
2298       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2299       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300         return NULL;
2301       break;
2302
2303     case OP_OPEN_BRACKET:
2304       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306         return NULL;
2307       break;
2308
2309     case OP_BACK_REF:
2310       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311         {
2312           *err = REG_ESUBREG;
2313           return NULL;
2314         }
2315       dfa->used_bkref_map |= 1 << token->opr.idx;
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (BE (tree == NULL, 0))
2318         {
2319           *err = REG_ESPACE;
2320           return NULL;
2321         }
2322       ++dfa->nbackref;
2323       dfa->has_mb_node = 1;
2324       break;
2325
2326     case OP_OPEN_DUP_NUM:
2327       if (syntax & RE_CONTEXT_INVALID_DUP)
2328         {
2329           *err = REG_BADRPT;
2330           return NULL;
2331         }
2332       FALLTHROUGH;
2333     case OP_DUP_ASTERISK:
2334     case OP_DUP_PLUS:
2335     case OP_DUP_QUESTION:
2336       if (syntax & RE_CONTEXT_INVALID_OPS)
2337         {
2338           *err = REG_BADRPT;
2339           return NULL;
2340         }
2341       else if (syntax & RE_CONTEXT_INDEP_OPS)
2342         {
2343           fetch_token (token, regexp, syntax);
2344           return parse_expression (regexp, preg, token, syntax, nest, err);
2345         }
2346       FALLTHROUGH;
2347     case OP_CLOSE_SUBEXP:
2348       if ((token->type == OP_CLOSE_SUBEXP) &&
2349           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350         {
2351           *err = REG_ERPAREN;
2352           return NULL;
2353         }
2354       FALLTHROUGH;
2355     case OP_CLOSE_DUP_NUM:
2356       /* We treat it as a normal character.  */
2357
2358       /* Then we can these characters as normal characters.  */
2359       token->type = CHARACTER;
2360       /* mb_partial and word_char bits should be initialized already
2361          by peek_token.  */
2362       tree = create_token_tree (dfa, NULL, NULL, token);
2363       if (BE (tree == NULL, 0))
2364         {
2365           *err = REG_ESPACE;
2366           return NULL;
2367         }
2368       break;
2369
2370     case ANCHOR:
2371       if ((token->opr.ctx_type
2372            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2373           && dfa->word_ops_used == 0)
2374         init_word_char (dfa);
2375       if (token->opr.ctx_type == WORD_DELIM
2376           || token->opr.ctx_type == NOT_WORD_DELIM)
2377         {
2378           bin_tree_t *tree_first, *tree_last;
2379           if (token->opr.ctx_type == WORD_DELIM)
2380             {
2381               token->opr.ctx_type = WORD_FIRST;
2382               tree_first = create_token_tree (dfa, NULL, NULL, token);
2383               token->opr.ctx_type = WORD_LAST;
2384             }
2385           else
2386             {
2387               token->opr.ctx_type = INSIDE_WORD;
2388               tree_first = create_token_tree (dfa, NULL, NULL, token);
2389               token->opr.ctx_type = INSIDE_NOTWORD;
2390             }
2391           tree_last = create_token_tree (dfa, NULL, NULL, token);
2392           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2394             {
2395               *err = REG_ESPACE;
2396               return NULL;
2397             }
2398         }
2399       else
2400         {
2401           tree = create_token_tree (dfa, NULL, NULL, token);
2402           if (BE (tree == NULL, 0))
2403             {
2404               *err = REG_ESPACE;
2405               return NULL;
2406             }
2407         }
2408       /* We must return here, since ANCHORs can't be followed
2409          by repetition operators.
2410          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2411              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2412       fetch_token (token, regexp, syntax);
2413       return tree;
2414
2415     case OP_PERIOD:
2416       tree = create_token_tree (dfa, NULL, NULL, token);
2417       if (BE (tree == NULL, 0))
2418         {
2419           *err = REG_ESPACE;
2420           return NULL;
2421         }
2422       if (dfa->mb_cur_max > 1)
2423         dfa->has_mb_node = 1;
2424       break;
2425
2426     case OP_WORD:
2427     case OP_NOTWORD:
2428       tree = build_charclass_op (dfa, regexp->trans,
2429                                  "alnum",
2430                                  "_",
2431                                  token->type == OP_NOTWORD, err);
2432       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2433         return NULL;
2434       break;
2435
2436     case OP_SPACE:
2437     case OP_NOTSPACE:
2438       tree = build_charclass_op (dfa, regexp->trans,
2439                                  "space",
2440                                  "",
2441                                  token->type == OP_NOTSPACE, err);
2442       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443         return NULL;
2444       break;
2445
2446     case OP_ALT:
2447     case END_OF_RE:
2448       return NULL;
2449
2450     case BACK_SLASH:
2451       *err = REG_EESCAPE;
2452       return NULL;
2453
2454     default:
2455       /* Must not happen?  */
2456 #ifdef DEBUG
2457       assert (0);
2458 #endif
2459       return NULL;
2460     }
2461   fetch_token (token, regexp, syntax);
2462
2463   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2464          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2465     {
2466       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2467                                            syntax, err);
2468       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2469         {
2470           if (tree != NULL)
2471             postorder (tree, free_tree, NULL);
2472           return NULL;
2473         }
2474       tree = dup_tree;
2475       /* In BRE consecutive duplications are not allowed.  */
2476       if ((syntax & RE_CONTEXT_INVALID_DUP)
2477           && (token->type == OP_DUP_ASTERISK
2478               || token->type == OP_OPEN_DUP_NUM))
2479         {
2480           if (tree != NULL)
2481             postorder (tree, free_tree, NULL);
2482           *err = REG_BADRPT;
2483           return NULL;
2484         }
2485     }
2486
2487   return tree;
2488 }
2489
2490 /* This function build the following tree, from regular expression
2491    (<reg_exp>):
2492          SUBEXP
2493             |
2494         <reg_exp>
2495 */
2496
2497 static bin_tree_t *
2498 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2499                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2500 {
2501   re_dfa_t *dfa = preg->buffer;
2502   bin_tree_t *tree;
2503   size_t cur_nsub;
2504   cur_nsub = preg->re_nsub++;
2505
2506   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2507
2508   /* The subexpression may be a null string.  */
2509   if (token->type == OP_CLOSE_SUBEXP)
2510     tree = NULL;
2511   else
2512     {
2513       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2514       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2515         {
2516           if (tree != NULL)
2517             postorder (tree, free_tree, NULL);
2518           *err = REG_EPAREN;
2519         }
2520       if (BE (*err != REG_NOERROR, 0))
2521         return NULL;
2522     }
2523
2524   if (cur_nsub <= '9' - '1')
2525     dfa->completed_bkref_map |= 1 << cur_nsub;
2526
2527   tree = create_tree (dfa, tree, NULL, SUBEXP);
2528   if (BE (tree == NULL, 0))
2529     {
2530       *err = REG_ESPACE;
2531       return NULL;
2532     }
2533   tree->token.opr.idx = cur_nsub;
2534   return tree;
2535 }
2536
2537 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2538
2539 static bin_tree_t *
2540 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2541               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2542 {
2543   bin_tree_t *tree = NULL, *old_tree = NULL;
2544   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2545   re_token_t start_token = *token;
2546
2547   if (token->type == OP_OPEN_DUP_NUM)
2548     {
2549       end = 0;
2550       start = fetch_number (regexp, token, syntax);
2551       if (start == -1)
2552         {
2553           if (token->type == CHARACTER && token->opr.c == ',')
2554             start = 0; /* We treat "{,m}" as "{0,m}".  */
2555           else
2556             {
2557               *err = REG_BADBR; /* <re>{} is invalid.  */
2558               return NULL;
2559             }
2560         }
2561       if (BE (start != -2, 1))
2562         {
2563           /* We treat "{n}" as "{n,n}".  */
2564           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2565                  : ((token->type == CHARACTER && token->opr.c == ',')
2566                     ? fetch_number (regexp, token, syntax) : -2));
2567         }
2568       if (BE (start == -2 || end == -2, 0))
2569         {
2570           /* Invalid sequence.  */
2571           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2572             {
2573               if (token->type == END_OF_RE)
2574                 *err = REG_EBRACE;
2575               else
2576                 *err = REG_BADBR;
2577
2578               return NULL;
2579             }
2580
2581           /* If the syntax bit is set, rollback.  */
2582           re_string_set_index (regexp, start_idx);
2583           *token = start_token;
2584           token->type = CHARACTER;
2585           /* mb_partial and word_char bits should be already initialized by
2586              peek_token.  */
2587           return elem;
2588         }
2589
2590       if (BE ((end != -1 && start > end)
2591               || token->type != OP_CLOSE_DUP_NUM, 0))
2592         {
2593           /* First number greater than second.  */
2594           *err = REG_BADBR;
2595           return NULL;
2596         }
2597
2598       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2599         {
2600           *err = REG_ESIZE;
2601           return NULL;
2602         }
2603     }
2604   else
2605     {
2606       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2607       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2608     }
2609
2610   fetch_token (token, regexp, syntax);
2611
2612   if (BE (elem == NULL, 0))
2613     return NULL;
2614   if (BE (start == 0 && end == 0, 0))
2615     {
2616       postorder (elem, free_tree, NULL);
2617       return NULL;
2618     }
2619
2620   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2621   if (BE (start > 0, 0))
2622     {
2623       tree = elem;
2624       for (i = 2; i <= start; ++i)
2625         {
2626           elem = duplicate_tree (elem, dfa);
2627           tree = create_tree (dfa, tree, elem, CONCAT);
2628           if (BE (elem == NULL || tree == NULL, 0))
2629             goto parse_dup_op_espace;
2630         }
2631
2632       if (start == end)
2633         return tree;
2634
2635       /* Duplicate ELEM before it is marked optional.  */
2636       elem = duplicate_tree (elem, dfa);
2637       if (BE (elem == NULL, 0))
2638         goto parse_dup_op_espace;
2639       old_tree = tree;
2640     }
2641   else
2642     old_tree = NULL;
2643
2644   if (elem->token.type == SUBEXP)
2645     {
2646       uintptr_t subidx = elem->token.opr.idx;
2647       postorder (elem, mark_opt_subexp, (void *) subidx);
2648     }
2649
2650   tree = create_tree (dfa, elem, NULL,
2651                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2652   if (BE (tree == NULL, 0))
2653     goto parse_dup_op_espace;
2654
2655   /* This loop is actually executed only when end != -1,
2656      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2657      already created the start+1-th copy.  */
2658   if (TYPE_SIGNED (Idx) || end != -1)
2659     for (i = start + 2; i <= end; ++i)
2660       {
2661         elem = duplicate_tree (elem, dfa);
2662         tree = create_tree (dfa, tree, elem, CONCAT);
2663         if (BE (elem == NULL || tree == NULL, 0))
2664           goto parse_dup_op_espace;
2665
2666         tree = create_tree (dfa, tree, NULL, OP_ALT);
2667         if (BE (tree == NULL, 0))
2668           goto parse_dup_op_espace;
2669       }
2670
2671   if (old_tree)
2672     tree = create_tree (dfa, old_tree, tree, CONCAT);
2673
2674   return tree;
2675
2676  parse_dup_op_espace:
2677   *err = REG_ESPACE;
2678   return NULL;
2679 }
2680
2681 /* Size of the names for collating symbol/equivalence_class/character_class.
2682    I'm not sure, but maybe enough.  */
2683 #define BRACKET_NAME_BUF_SIZE 32
2684
2685 #ifndef _LIBC
2686
2687 # ifdef RE_ENABLE_I18N
2688 /* Convert the byte B to the corresponding wide character.  In a
2689    unibyte locale, treat B as itself if it is an encoding error.
2690    In a multibyte locale, return WEOF if B is an encoding error.  */
2691 static wint_t
2692 parse_byte (unsigned char b, re_charset_t *mbcset)
2693 {
2694   wint_t wc = __btowc (b);
2695   return wc == WEOF && !mbcset ? b : wc;
2696 }
2697 #endif
2698
2699   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700      Build the range expression which starts from START_ELEM, and ends
2701      at END_ELEM.  The result are written to MBCSET and SBCSET.
2702      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703      mbcset->range_ends, is a pointer argument since we may
2704      update it.  */
2705
2706 static reg_errcode_t
2707 # ifdef RE_ENABLE_I18N
2708 build_range_exp (const reg_syntax_t syntax,
2709                  bitset_t sbcset,
2710                  re_charset_t *mbcset,
2711                  Idx *range_alloc,
2712                  const bracket_elem_t *start_elem,
2713                  const bracket_elem_t *end_elem)
2714 # else /* not RE_ENABLE_I18N */
2715 build_range_exp (const reg_syntax_t syntax,
2716                  bitset_t sbcset,
2717                  const bracket_elem_t *start_elem,
2718                  const bracket_elem_t *end_elem)
2719 # endif /* not RE_ENABLE_I18N */
2720 {
2721   unsigned int start_ch, end_ch;
2722   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2723   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2724           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2725           0))
2726     return REG_ERANGE;
2727
2728   /* We can handle no multi character collating elements without libc
2729      support.  */
2730   if (BE ((start_elem->type == COLL_SYM
2731            && strlen ((char *) start_elem->opr.name) > 1)
2732           || (end_elem->type == COLL_SYM
2733               && strlen ((char *) end_elem->opr.name) > 1), 0))
2734     return REG_ECOLLATE;
2735
2736 # ifdef RE_ENABLE_I18N
2737   {
2738     wchar_t wc;
2739     wint_t start_wc;
2740     wint_t end_wc;
2741
2742     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2743                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2744                    : 0));
2745     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2746               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2747                  : 0));
2748     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2749                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2750     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2751               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2752     if (start_wc == WEOF || end_wc == WEOF)
2753       return REG_ECOLLATE;
2754     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2755       return REG_ERANGE;
2756
2757     /* Got valid collation sequence values, add them as a new entry.
2758        However, for !_LIBC we have no collation elements: if the
2759        character set is single byte, the single byte character set
2760        that we build below suffices.  parse_bracket_exp passes
2761        no MBCSET if dfa->mb_cur_max == 1.  */
2762     if (mbcset)
2763       {
2764         /* Check the space of the arrays.  */
2765         if (BE (*range_alloc == mbcset->nranges, 0))
2766           {
2767             /* There is not enough space, need realloc.  */
2768             wchar_t *new_array_start, *new_array_end;
2769             Idx new_nranges;
2770
2771             /* +1 in case of mbcset->nranges is 0.  */
2772             new_nranges = 2 * mbcset->nranges + 1;
2773             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2774                are NULL if *range_alloc == 0.  */
2775             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2776                                           new_nranges);
2777             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2778                                         new_nranges);
2779
2780             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2781               {
2782                 re_free (new_array_start);
2783                 re_free (new_array_end);
2784                 return REG_ESPACE;
2785               }
2786
2787             mbcset->range_starts = new_array_start;
2788             mbcset->range_ends = new_array_end;
2789             *range_alloc = new_nranges;
2790           }
2791
2792         mbcset->range_starts[mbcset->nranges] = start_wc;
2793         mbcset->range_ends[mbcset->nranges++] = end_wc;
2794       }
2795
2796     /* Build the table for single byte characters.  */
2797     for (wc = 0; wc < SBC_MAX; ++wc)
2798       {
2799         if (start_wc <= wc && wc <= end_wc)
2800           bitset_set (sbcset, wc);
2801       }
2802   }
2803 # else /* not RE_ENABLE_I18N */
2804   {
2805     unsigned int ch;
2806     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2807                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2808                    : 0));
2809     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2810               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2811                  : 0));
2812     if (start_ch > end_ch)
2813       return REG_ERANGE;
2814     /* Build the table for single byte characters.  */
2815     for (ch = 0; ch < SBC_MAX; ++ch)
2816       if (start_ch <= ch  && ch <= end_ch)
2817         bitset_set (sbcset, ch);
2818   }
2819 # endif /* not RE_ENABLE_I18N */
2820   return REG_NOERROR;
2821 }
2822 #endif /* not _LIBC */
2823
2824 #ifndef _LIBC
2825 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2826    Build the collating element which is represented by NAME.
2827    The result are written to MBCSET and SBCSET.
2828    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2829    pointer argument since we may update it.  */
2830
2831 static reg_errcode_t
2832 # ifdef RE_ENABLE_I18N
2833 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2834                         Idx *coll_sym_alloc, const unsigned char *name)
2835 # else /* not RE_ENABLE_I18N */
2836 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2837 # endif /* not RE_ENABLE_I18N */
2838 {
2839   size_t name_len = strlen ((const char *) name);
2840   if (BE (name_len != 1, 0))
2841     return REG_ECOLLATE;
2842   else
2843     {
2844       bitset_set (sbcset, name[0]);
2845       return REG_NOERROR;
2846     }
2847 }
2848 #endif /* not _LIBC */
2849
2850 /* This function parse bracket expression like "[abc]", "[a-c]",
2851    "[[.a-a.]]" etc.  */
2852
2853 static bin_tree_t *
2854 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2855                    reg_syntax_t syntax, reg_errcode_t *err)
2856 {
2857 #ifdef _LIBC
2858   const unsigned char *collseqmb;
2859   const char *collseqwc;
2860   uint32_t nrules;
2861   int32_t table_size;
2862   const int32_t *symb_table;
2863   const unsigned char *extra;
2864
2865   /* Local function for parse_bracket_exp used in _LIBC environment.
2866      Seek the collating symbol entry corresponding to NAME.
2867      Return the index of the symbol in the SYMB_TABLE,
2868      or -1 if not found.  */
2869
2870   auto inline int32_t
2871   __attribute__ ((always_inline))
2872   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2873     {
2874       int32_t elem;
2875
2876       for (elem = 0; elem < table_size; elem++)
2877         if (symb_table[2 * elem] != 0)
2878           {
2879             int32_t idx = symb_table[2 * elem + 1];
2880             /* Skip the name of collating element name.  */
2881             idx += 1 + extra[idx];
2882             if (/* Compare the length of the name.  */
2883                 name_len == extra[idx]
2884                 /* Compare the name.  */
2885                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2886               /* Yep, this is the entry.  */
2887               return elem;
2888           }
2889       return -1;
2890     }
2891
2892   /* Local function for parse_bracket_exp used in _LIBC environment.
2893      Look up the collation sequence value of BR_ELEM.
2894      Return the value if succeeded, UINT_MAX otherwise.  */
2895
2896   auto inline unsigned int
2897   __attribute__ ((always_inline))
2898   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2899     {
2900       if (br_elem->type == SB_CHAR)
2901         {
2902           /*
2903           if (MB_CUR_MAX == 1)
2904           */
2905           if (nrules == 0)
2906             return collseqmb[br_elem->opr.ch];
2907           else
2908             {
2909               wint_t wc = __btowc (br_elem->opr.ch);
2910               return __collseq_table_lookup (collseqwc, wc);
2911             }
2912         }
2913       else if (br_elem->type == MB_CHAR)
2914         {
2915           if (nrules != 0)
2916             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2917         }
2918       else if (br_elem->type == COLL_SYM)
2919         {
2920           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2921           if (nrules != 0)
2922             {
2923               int32_t elem, idx;
2924               elem = seek_collating_symbol_entry (br_elem->opr.name,
2925                                                   sym_name_len);
2926               if (elem != -1)
2927                 {
2928                   /* We found the entry.  */
2929                   idx = symb_table[2 * elem + 1];
2930                   /* Skip the name of collating element name.  */
2931                   idx += 1 + extra[idx];
2932                   /* Skip the byte sequence of the collating element.  */
2933                   idx += 1 + extra[idx];
2934                   /* Adjust for the alignment.  */
2935                   idx = (idx + 3) & ~3;
2936                   /* Skip the multibyte collation sequence value.  */
2937                   idx += sizeof (unsigned int);
2938                   /* Skip the wide char sequence of the collating element.  */
2939                   idx += sizeof (unsigned int) *
2940                     (1 + *(unsigned int *) (extra + idx));
2941                   /* Return the collation sequence value.  */
2942                   return *(unsigned int *) (extra + idx);
2943                 }
2944               else if (sym_name_len == 1)
2945                 {
2946                   /* No valid character.  Match it as a single byte
2947                      character.  */
2948                   return collseqmb[br_elem->opr.name[0]];
2949                 }
2950             }
2951           else if (sym_name_len == 1)
2952             return collseqmb[br_elem->opr.name[0]];
2953         }
2954       return UINT_MAX;
2955     }
2956
2957   /* Local function for parse_bracket_exp used in _LIBC environment.
2958      Build the range expression which starts from START_ELEM, and ends
2959      at END_ELEM.  The result are written to MBCSET and SBCSET.
2960      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2961      mbcset->range_ends, is a pointer argument since we may
2962      update it.  */
2963
2964   auto inline reg_errcode_t
2965   __attribute__ ((always_inline))
2966   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2967                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2968     {
2969       unsigned int ch;
2970       uint32_t start_collseq;
2971       uint32_t end_collseq;
2972
2973       /* Equivalence Classes and Character Classes can't be a range
2974          start/end.  */
2975       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2976               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2977               0))
2978         return REG_ERANGE;
2979
2980       /* FIXME: Implement rational ranges here, too.  */
2981       start_collseq = lookup_collation_sequence_value (start_elem);
2982       end_collseq = lookup_collation_sequence_value (end_elem);
2983       /* Check start/end collation sequence values.  */
2984       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2985         return REG_ECOLLATE;
2986       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2987         return REG_ERANGE;
2988
2989       /* Got valid collation sequence values, add them as a new entry.
2990          However, if we have no collation elements, and the character set
2991          is single byte, the single byte character set that we
2992          build below suffices. */
2993       if (nrules > 0 || dfa->mb_cur_max > 1)
2994         {
2995           /* Check the space of the arrays.  */
2996           if (BE (*range_alloc == mbcset->nranges, 0))
2997             {
2998               /* There is not enough space, need realloc.  */
2999               uint32_t *new_array_start;
3000               uint32_t *new_array_end;
3001               Idx new_nranges;
3002
3003               /* +1 in case of mbcset->nranges is 0.  */
3004               new_nranges = 2 * mbcset->nranges + 1;
3005               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3006                                             new_nranges);
3007               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3008                                           new_nranges);
3009
3010               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3011                 return REG_ESPACE;
3012
3013               mbcset->range_starts = new_array_start;
3014               mbcset->range_ends = new_array_end;
3015               *range_alloc = new_nranges;
3016             }
3017
3018           mbcset->range_starts[mbcset->nranges] = start_collseq;
3019           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3020         }
3021
3022       /* Build the table for single byte characters.  */
3023       for (ch = 0; ch < SBC_MAX; ch++)
3024         {
3025           uint32_t ch_collseq;
3026           /*
3027           if (MB_CUR_MAX == 1)
3028           */
3029           if (nrules == 0)
3030             ch_collseq = collseqmb[ch];
3031           else
3032             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3033           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3034             bitset_set (sbcset, ch);
3035         }
3036       return REG_NOERROR;
3037     }
3038
3039   /* Local function for parse_bracket_exp used in _LIBC environment.
3040      Build the collating element which is represented by NAME.
3041      The result are written to MBCSET and SBCSET.
3042      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3043      pointer argument since we may update it.  */
3044
3045   auto inline reg_errcode_t
3046   __attribute__ ((always_inline))
3047   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3048                           Idx *coll_sym_alloc, const unsigned char *name)
3049     {
3050       int32_t elem, idx;
3051       size_t name_len = strlen ((const char *) name);
3052       if (nrules != 0)
3053         {
3054           elem = seek_collating_symbol_entry (name, name_len);
3055           if (elem != -1)
3056             {
3057               /* We found the entry.  */
3058               idx = symb_table[2 * elem + 1];
3059               /* Skip the name of collating element name.  */
3060               idx += 1 + extra[idx];
3061             }
3062           else if (name_len == 1)
3063             {
3064               /* No valid character, treat it as a normal
3065                  character.  */
3066               bitset_set (sbcset, name[0]);
3067               return REG_NOERROR;
3068             }
3069           else
3070             return REG_ECOLLATE;
3071
3072           /* Got valid collation sequence, add it as a new entry.  */
3073           /* Check the space of the arrays.  */
3074           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3075             {
3076               /* Not enough, realloc it.  */
3077               /* +1 in case of mbcset->ncoll_syms is 0.  */
3078               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3079               /* Use realloc since mbcset->coll_syms is NULL
3080                  if *alloc == 0.  */
3081               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3082                                                    new_coll_sym_alloc);
3083               if (BE (new_coll_syms == NULL, 0))
3084                 return REG_ESPACE;
3085               mbcset->coll_syms = new_coll_syms;
3086               *coll_sym_alloc = new_coll_sym_alloc;
3087             }
3088           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3089           return REG_NOERROR;
3090         }
3091       else
3092         {
3093           if (BE (name_len != 1, 0))
3094             return REG_ECOLLATE;
3095           else
3096             {
3097               bitset_set (sbcset, name[0]);
3098               return REG_NOERROR;
3099             }
3100         }
3101     }
3102 #endif
3103
3104   re_token_t br_token;
3105   re_bitset_ptr_t sbcset;
3106 #ifdef RE_ENABLE_I18N
3107   re_charset_t *mbcset;
3108   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3109   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3110 #endif /* not RE_ENABLE_I18N */
3111   bool non_match = false;
3112   bin_tree_t *work_tree;
3113   int token_len;
3114   bool first_round = true;
3115 #ifdef _LIBC
3116   collseqmb = (const unsigned char *)
3117     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3118   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3119   if (nrules)
3120     {
3121       /*
3122       if (MB_CUR_MAX > 1)
3123       */
3124       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3125       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3126       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3127                                                   _NL_COLLATE_SYMB_TABLEMB);
3128       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3129                                                    _NL_COLLATE_SYMB_EXTRAMB);
3130     }
3131 #endif
3132   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3133 #ifdef RE_ENABLE_I18N
3134   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3135 #endif /* RE_ENABLE_I18N */
3136 #ifdef RE_ENABLE_I18N
3137   if (BE (sbcset == NULL || mbcset == NULL, 0))
3138 #else
3139   if (BE (sbcset == NULL, 0))
3140 #endif /* RE_ENABLE_I18N */
3141     {
3142       re_free (sbcset);
3143 #ifdef RE_ENABLE_I18N
3144       re_free (mbcset);
3145 #endif
3146       *err = REG_ESPACE;
3147       return NULL;
3148     }
3149
3150   token_len = peek_token_bracket (token, regexp, syntax);
3151   if (BE (token->type == END_OF_RE, 0))
3152     {
3153       *err = REG_BADPAT;
3154       goto parse_bracket_exp_free_return;
3155     }
3156   if (token->type == OP_NON_MATCH_LIST)
3157     {
3158 #ifdef RE_ENABLE_I18N
3159       mbcset->non_match = 1;
3160 #endif /* not RE_ENABLE_I18N */
3161       non_match = true;
3162       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3163         bitset_set (sbcset, '\n');
3164       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3165       token_len = peek_token_bracket (token, regexp, syntax);
3166       if (BE (token->type == END_OF_RE, 0))
3167         {
3168           *err = REG_BADPAT;
3169           goto parse_bracket_exp_free_return;
3170         }
3171     }
3172
3173   /* We treat the first ']' as a normal character.  */
3174   if (token->type == OP_CLOSE_BRACKET)
3175     token->type = CHARACTER;
3176
3177   while (1)
3178     {
3179       bracket_elem_t start_elem, end_elem;
3180       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3181       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3182       reg_errcode_t ret;
3183       int token_len2 = 0;
3184       bool is_range_exp = false;
3185       re_token_t token2;
3186
3187       start_elem.opr.name = start_name_buf;
3188       start_elem.type = COLL_SYM;
3189       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3190                                    syntax, first_round);
3191       if (BE (ret != REG_NOERROR, 0))
3192         {
3193           *err = ret;
3194           goto parse_bracket_exp_free_return;
3195         }
3196       first_round = false;
3197
3198       /* Get information about the next token.  We need it in any case.  */
3199       token_len = peek_token_bracket (token, regexp, syntax);
3200
3201       /* Do not check for ranges if we know they are not allowed.  */
3202       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3203         {
3204           if (BE (token->type == END_OF_RE, 0))
3205             {
3206               *err = REG_EBRACK;
3207               goto parse_bracket_exp_free_return;
3208             }
3209           if (token->type == OP_CHARSET_RANGE)
3210             {
3211               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3212               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3213               if (BE (token2.type == END_OF_RE, 0))
3214                 {
3215                   *err = REG_EBRACK;
3216                   goto parse_bracket_exp_free_return;
3217                 }
3218               if (token2.type == OP_CLOSE_BRACKET)
3219                 {
3220                   /* We treat the last '-' as a normal character.  */
3221                   re_string_skip_bytes (regexp, -token_len);
3222                   token->type = CHARACTER;
3223                 }
3224               else
3225                 is_range_exp = true;
3226             }
3227         }
3228
3229       if (is_range_exp == true)
3230         {
3231           end_elem.opr.name = end_name_buf;
3232           end_elem.type = COLL_SYM;
3233           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3234                                        dfa, syntax, true);
3235           if (BE (ret != REG_NOERROR, 0))
3236             {
3237               *err = ret;
3238               goto parse_bracket_exp_free_return;
3239             }
3240
3241           token_len = peek_token_bracket (token, regexp, syntax);
3242
3243 #ifdef _LIBC
3244           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3245                                   &start_elem, &end_elem);
3246 #else
3247 # ifdef RE_ENABLE_I18N
3248           *err = build_range_exp (syntax, sbcset,
3249                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3250                                   &range_alloc, &start_elem, &end_elem);
3251 # else
3252           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3253 # endif
3254 #endif /* RE_ENABLE_I18N */
3255           if (BE (*err != REG_NOERROR, 0))
3256             goto parse_bracket_exp_free_return;
3257         }
3258       else
3259         {
3260           switch (start_elem.type)
3261             {
3262             case SB_CHAR:
3263               bitset_set (sbcset, start_elem.opr.ch);
3264               break;
3265 #ifdef RE_ENABLE_I18N
3266             case MB_CHAR:
3267               /* Check whether the array has enough space.  */
3268               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3269                 {
3270                   wchar_t *new_mbchars;
3271                   /* Not enough, realloc it.  */
3272                   /* +1 in case of mbcset->nmbchars is 0.  */
3273                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3274                   /* Use realloc since array is NULL if *alloc == 0.  */
3275                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3276                                             mbchar_alloc);
3277                   if (BE (new_mbchars == NULL, 0))
3278                     goto parse_bracket_exp_espace;
3279                   mbcset->mbchars = new_mbchars;
3280                 }
3281               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3282               break;
3283 #endif /* RE_ENABLE_I18N */
3284             case EQUIV_CLASS:
3285               *err = build_equiv_class (sbcset,
3286 #ifdef RE_ENABLE_I18N
3287                                         mbcset, &equiv_class_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289                                         start_elem.opr.name);
3290               if (BE (*err != REG_NOERROR, 0))
3291                 goto parse_bracket_exp_free_return;
3292               break;
3293             case COLL_SYM:
3294               *err = build_collating_symbol (sbcset,
3295 #ifdef RE_ENABLE_I18N
3296                                              mbcset, &coll_sym_alloc,
3297 #endif /* RE_ENABLE_I18N */
3298                                              start_elem.opr.name);
3299               if (BE (*err != REG_NOERROR, 0))
3300                 goto parse_bracket_exp_free_return;
3301               break;
3302             case CHAR_CLASS:
3303               *err = build_charclass (regexp->trans, sbcset,
3304 #ifdef RE_ENABLE_I18N
3305                                       mbcset, &char_class_alloc,
3306 #endif /* RE_ENABLE_I18N */
3307                                       (const char *) start_elem.opr.name,
3308                                       syntax);
3309               if (BE (*err != REG_NOERROR, 0))
3310                goto parse_bracket_exp_free_return;
3311               break;
3312             default:
3313               assert (0);
3314               break;
3315             }
3316         }
3317       if (BE (token->type == END_OF_RE, 0))
3318         {
3319           *err = REG_EBRACK;
3320           goto parse_bracket_exp_free_return;
3321         }
3322       if (token->type == OP_CLOSE_BRACKET)
3323         break;
3324     }
3325
3326   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3327
3328   /* If it is non-matching list.  */
3329   if (non_match)
3330     bitset_not (sbcset);
3331
3332 #ifdef RE_ENABLE_I18N
3333   /* Ensure only single byte characters are set.  */
3334   if (dfa->mb_cur_max > 1)
3335     bitset_mask (sbcset, dfa->sb_char);
3336
3337   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3338       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3339                                                      || mbcset->non_match)))
3340     {
3341       bin_tree_t *mbc_tree;
3342       int sbc_idx;
3343       /* Build a tree for complex bracket.  */
3344       dfa->has_mb_node = 1;
3345       br_token.type = COMPLEX_BRACKET;
3346       br_token.opr.mbcset = mbcset;
3347       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3348       if (BE (mbc_tree == NULL, 0))
3349         goto parse_bracket_exp_espace;
3350       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3351         if (sbcset[sbc_idx])
3352           break;
3353       /* If there are no bits set in sbcset, there is no point
3354          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3355       if (sbc_idx < BITSET_WORDS)
3356         {
3357           /* Build a tree for simple bracket.  */
3358           br_token.type = SIMPLE_BRACKET;
3359           br_token.opr.sbcset = sbcset;
3360           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361           if (BE (work_tree == NULL, 0))
3362             goto parse_bracket_exp_espace;
3363
3364           /* Then join them by ALT node.  */
3365           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3366           if (BE (work_tree == NULL, 0))
3367             goto parse_bracket_exp_espace;
3368         }
3369       else
3370         {
3371           re_free (sbcset);
3372           work_tree = mbc_tree;
3373         }
3374     }
3375   else
3376 #endif /* not RE_ENABLE_I18N */
3377     {
3378 #ifdef RE_ENABLE_I18N
3379       free_charset (mbcset);
3380 #endif
3381       /* Build a tree for simple bracket.  */
3382       br_token.type = SIMPLE_BRACKET;
3383       br_token.opr.sbcset = sbcset;
3384       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3385       if (BE (work_tree == NULL, 0))
3386         goto parse_bracket_exp_espace;
3387     }
3388   return work_tree;
3389
3390  parse_bracket_exp_espace:
3391   *err = REG_ESPACE;
3392  parse_bracket_exp_free_return:
3393   re_free (sbcset);
3394 #ifdef RE_ENABLE_I18N
3395   free_charset (mbcset);
3396 #endif /* RE_ENABLE_I18N */
3397   return NULL;
3398 }
3399
3400 /* Parse an element in the bracket expression.  */
3401
3402 static reg_errcode_t
3403 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3404                        re_token_t *token, int token_len, re_dfa_t *dfa,
3405                        reg_syntax_t syntax, bool accept_hyphen)
3406 {
3407 #ifdef RE_ENABLE_I18N
3408   int cur_char_size;
3409   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3410   if (cur_char_size > 1)
3411     {
3412       elem->type = MB_CHAR;
3413       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3414       re_string_skip_bytes (regexp, cur_char_size);
3415       return REG_NOERROR;
3416     }
3417 #endif /* RE_ENABLE_I18N */
3418   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3419   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3420       || token->type == OP_OPEN_EQUIV_CLASS)
3421     return parse_bracket_symbol (elem, regexp, token);
3422   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3423     {
3424       /* A '-' must only appear as anything but a range indicator before
3425          the closing bracket.  Everything else is an error.  */
3426       re_token_t token2;
3427       (void) peek_token_bracket (&token2, regexp, syntax);
3428       if (token2.type != OP_CLOSE_BRACKET)
3429         /* The actual error value is not standardized since this whole
3430            case is undefined.  But ERANGE makes good sense.  */
3431         return REG_ERANGE;
3432     }
3433   elem->type = SB_CHAR;
3434   elem->opr.ch = token->opr.c;
3435   return REG_NOERROR;
3436 }
3437
3438 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3439    such as [:<character_class>:], [.<collating_element>.], and
3440    [=<equivalent_class>=].  */
3441
3442 static reg_errcode_t
3443 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3444                       re_token_t *token)
3445 {
3446   unsigned char ch, delim = token->opr.c;
3447   int i = 0;
3448   if (re_string_eoi(regexp))
3449     return REG_EBRACK;
3450   for (;; ++i)
3451     {
3452       if (i >= BRACKET_NAME_BUF_SIZE)
3453         return REG_EBRACK;
3454       if (token->type == OP_OPEN_CHAR_CLASS)
3455         ch = re_string_fetch_byte_case (regexp);
3456       else
3457         ch = re_string_fetch_byte (regexp);
3458       if (re_string_eoi(regexp))
3459         return REG_EBRACK;
3460       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3461         break;
3462       elem->opr.name[i] = ch;
3463     }
3464   re_string_skip_bytes (regexp, 1);
3465   elem->opr.name[i] = '\0';
3466   switch (token->type)
3467     {
3468     case OP_OPEN_COLL_ELEM:
3469       elem->type = COLL_SYM;
3470       break;
3471     case OP_OPEN_EQUIV_CLASS:
3472       elem->type = EQUIV_CLASS;
3473       break;
3474     case OP_OPEN_CHAR_CLASS:
3475       elem->type = CHAR_CLASS;
3476       break;
3477     default:
3478       break;
3479     }
3480   return REG_NOERROR;
3481 }
3482
3483   /* Helper function for parse_bracket_exp.
3484      Build the equivalence class which is represented by NAME.
3485      The result are written to MBCSET and SBCSET.
3486      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3487      is a pointer argument since we may update it.  */
3488
3489 static reg_errcode_t
3490 #ifdef RE_ENABLE_I18N
3491 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3492                    Idx *equiv_class_alloc, const unsigned char *name)
3493 #else /* not RE_ENABLE_I18N */
3494 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3495 #endif /* not RE_ENABLE_I18N */
3496 {
3497 #ifdef _LIBC
3498   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3499   if (nrules != 0)
3500     {
3501       const int32_t *table, *indirect;
3502       const unsigned char *weights, *extra, *cp;
3503       unsigned char char_buf[2];
3504       int32_t idx1, idx2;
3505       unsigned int ch;
3506       size_t len;
3507       /* Calculate the index for equivalence class.  */
3508       cp = name;
3509       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3510       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511                                                _NL_COLLATE_WEIGHTMB);
3512       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3513                                                    _NL_COLLATE_EXTRAMB);
3514       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3515                                                 _NL_COLLATE_INDIRECTMB);
3516       idx1 = findidx (table, indirect, extra, &cp, -1);
3517       if (BE (idx1 == 0 || *cp != '\0', 0))
3518         /* This isn't a valid character.  */
3519         return REG_ECOLLATE;
3520
3521       /* Build single byte matching table for this equivalence class.  */
3522       len = weights[idx1 & 0xffffff];
3523       for (ch = 0; ch < SBC_MAX; ++ch)
3524         {
3525           char_buf[0] = ch;
3526           cp = char_buf;
3527           idx2 = findidx (table, indirect, extra, &cp, 1);
3528 /*
3529           idx2 = table[ch];
3530 */
3531           if (idx2 == 0)
3532             /* This isn't a valid character.  */
3533             continue;
3534           /* Compare only if the length matches and the collation rule
3535              index is the same.  */
3536           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3537               && memcmp (weights + (idx1 & 0xffffff) + 1,
3538                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3539             bitset_set (sbcset, ch);
3540         }
3541       /* Check whether the array has enough space.  */
3542       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3543         {
3544           /* Not enough, realloc it.  */
3545           /* +1 in case of mbcset->nequiv_classes is 0.  */
3546           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3547           /* Use realloc since the array is NULL if *alloc == 0.  */
3548           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3549                                                    int32_t,
3550                                                    new_equiv_class_alloc);
3551           if (BE (new_equiv_classes == NULL, 0))
3552             return REG_ESPACE;
3553           mbcset->equiv_classes = new_equiv_classes;
3554           *equiv_class_alloc = new_equiv_class_alloc;
3555         }
3556       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3557     }
3558   else
3559 #endif /* _LIBC */
3560     {
3561       if (BE (strlen ((const char *) name) != 1, 0))
3562         return REG_ECOLLATE;
3563       bitset_set (sbcset, *name);
3564     }
3565   return REG_NOERROR;
3566 }
3567
3568   /* Helper function for parse_bracket_exp.
3569      Build the character class which is represented by NAME.
3570      The result are written to MBCSET and SBCSET.
3571      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3572      is a pointer argument since we may update it.  */
3573
3574 static reg_errcode_t
3575 #ifdef RE_ENABLE_I18N
3576 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577                  re_charset_t *mbcset, Idx *char_class_alloc,
3578                  const char *class_name, reg_syntax_t syntax)
3579 #else /* not RE_ENABLE_I18N */
3580 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3581                  const char *class_name, reg_syntax_t syntax)
3582 #endif /* not RE_ENABLE_I18N */
3583 {
3584   int i;
3585   const char *name = class_name;
3586
3587   /* In case of REG_ICASE "upper" and "lower" match the both of
3588      upper and lower cases.  */
3589   if ((syntax & RE_ICASE)
3590       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3591     name = "alpha";
3592
3593 #ifdef RE_ENABLE_I18N
3594   /* Check the space of the arrays.  */
3595   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3596     {
3597       /* Not enough, realloc it.  */
3598       /* +1 in case of mbcset->nchar_classes is 0.  */
3599       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3600       /* Use realloc since array is NULL if *alloc == 0.  */
3601       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3602                                                new_char_class_alloc);
3603       if (BE (new_char_classes == NULL, 0))
3604         return REG_ESPACE;
3605       mbcset->char_classes = new_char_classes;
3606       *char_class_alloc = new_char_class_alloc;
3607     }
3608   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3609 #endif /* RE_ENABLE_I18N */
3610
3611 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3612   do {                                          \
3613     if (BE (trans != NULL, 0))                  \
3614       {                                         \
3615         for (i = 0; i < SBC_MAX; ++i)           \
3616           if (ctype_func (i))                   \
3617             bitset_set (sbcset, trans[i]);      \
3618       }                                         \
3619     else                                        \
3620       {                                         \
3621         for (i = 0; i < SBC_MAX; ++i)           \
3622           if (ctype_func (i))                   \
3623             bitset_set (sbcset, i);             \
3624       }                                         \
3625   } while (0)
3626
3627   if (strcmp (name, "alnum") == 0)
3628     BUILD_CHARCLASS_LOOP (isalnum);
3629   else if (strcmp (name, "cntrl") == 0)
3630     BUILD_CHARCLASS_LOOP (iscntrl);
3631   else if (strcmp (name, "lower") == 0)
3632     BUILD_CHARCLASS_LOOP (islower);
3633   else if (strcmp (name, "space") == 0)
3634     BUILD_CHARCLASS_LOOP (isspace);
3635   else if (strcmp (name, "alpha") == 0)
3636     BUILD_CHARCLASS_LOOP (isalpha);
3637   else if (strcmp (name, "digit") == 0)
3638     BUILD_CHARCLASS_LOOP (isdigit);
3639   else if (strcmp (name, "print") == 0)
3640     BUILD_CHARCLASS_LOOP (isprint);
3641   else if (strcmp (name, "upper") == 0)
3642     BUILD_CHARCLASS_LOOP (isupper);
3643   else if (strcmp (name, "blank") == 0)
3644     BUILD_CHARCLASS_LOOP (isblank);
3645   else if (strcmp (name, "graph") == 0)
3646     BUILD_CHARCLASS_LOOP (isgraph);
3647   else if (strcmp (name, "punct") == 0)
3648     BUILD_CHARCLASS_LOOP (ispunct);
3649   else if (strcmp (name, "xdigit") == 0)
3650     BUILD_CHARCLASS_LOOP (isxdigit);
3651   else
3652     return REG_ECTYPE;
3653
3654   return REG_NOERROR;
3655 }
3656
3657 static bin_tree_t *
3658 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3659                     const char *class_name,
3660                     const char *extra, bool non_match,
3661                     reg_errcode_t *err)
3662 {
3663   re_bitset_ptr_t sbcset;
3664 #ifdef RE_ENABLE_I18N
3665   re_charset_t *mbcset;
3666   Idx alloc = 0;
3667 #endif /* not RE_ENABLE_I18N */
3668   reg_errcode_t ret;
3669   re_token_t br_token;
3670   bin_tree_t *tree;
3671
3672   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3673   if (BE (sbcset == NULL, 0))
3674     {
3675       *err = REG_ESPACE;
3676       return NULL;
3677     }
3678 #ifdef RE_ENABLE_I18N
3679   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3680   if (BE (mbcset == NULL, 0))
3681     {
3682       re_free (sbcset);
3683       *err = REG_ESPACE;
3684       return NULL;
3685     }
3686   mbcset->non_match = non_match;
3687 #endif /* RE_ENABLE_I18N */
3688
3689   /* We don't care the syntax in this case.  */
3690   ret = build_charclass (trans, sbcset,
3691 #ifdef RE_ENABLE_I18N
3692                          mbcset, &alloc,
3693 #endif /* RE_ENABLE_I18N */
3694                          class_name, 0);
3695
3696   if (BE (ret != REG_NOERROR, 0))
3697     {
3698       re_free (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700       free_charset (mbcset);
3701 #endif /* RE_ENABLE_I18N */
3702       *err = ret;
3703       return NULL;
3704     }
3705   /* \w match '_' also.  */
3706   for (; *extra; extra++)
3707     bitset_set (sbcset, *extra);
3708
3709   /* If it is non-matching list.  */
3710   if (non_match)
3711     bitset_not (sbcset);
3712
3713 #ifdef RE_ENABLE_I18N
3714   /* Ensure only single byte characters are set.  */
3715   if (dfa->mb_cur_max > 1)
3716     bitset_mask (sbcset, dfa->sb_char);
3717 #endif
3718
3719   /* Build a tree for simple bracket.  */
3720 #if defined GCC_LINT || defined lint
3721   memset (&br_token, 0, sizeof br_token);
3722 #endif
3723   br_token.type = SIMPLE_BRACKET;
3724   br_token.opr.sbcset = sbcset;
3725   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726   if (BE (tree == NULL, 0))
3727     goto build_word_op_espace;
3728
3729 #ifdef RE_ENABLE_I18N
3730   if (dfa->mb_cur_max > 1)
3731     {
3732       bin_tree_t *mbc_tree;
3733       /* Build a tree for complex bracket.  */
3734       br_token.type = COMPLEX_BRACKET;
3735       br_token.opr.mbcset = mbcset;
3736       dfa->has_mb_node = 1;
3737       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738       if (BE (mbc_tree == NULL, 0))
3739         goto build_word_op_espace;
3740       /* Then join them by ALT node.  */
3741       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3742       if (BE (mbc_tree != NULL, 1))
3743         return tree;
3744     }
3745   else
3746     {
3747       free_charset (mbcset);
3748       return tree;
3749     }
3750 #else /* not RE_ENABLE_I18N */
3751   return tree;
3752 #endif /* not RE_ENABLE_I18N */
3753
3754  build_word_op_espace:
3755   re_free (sbcset);
3756 #ifdef RE_ENABLE_I18N
3757   free_charset (mbcset);
3758 #endif /* RE_ENABLE_I18N */
3759   *err = REG_ESPACE;
3760   return NULL;
3761 }
3762
3763 /* This is intended for the expressions like "a{1,3}".
3764    Fetch a number from 'input', and return the number.
3765    Return -1 if the number field is empty like "{,1}".
3766    Return RE_DUP_MAX + 1 if the number field is too large.
3767    Return -2 if an error occurred.  */
3768
3769 static Idx
3770 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3771 {
3772   Idx num = -1;
3773   unsigned char c;
3774   while (1)
3775     {
3776       fetch_token (token, input, syntax);
3777       c = token->opr.c;
3778       if (BE (token->type == END_OF_RE, 0))
3779         return -2;
3780       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3781         break;
3782       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3783              ? -2
3784              : num == -1
3785              ? c - '0'
3786              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3787     }
3788   return num;
3789 }
3790 \f
3791 #ifdef RE_ENABLE_I18N
3792 static void
3793 free_charset (re_charset_t *cset)
3794 {
3795   re_free (cset->mbchars);
3796 # ifdef _LIBC
3797   re_free (cset->coll_syms);
3798   re_free (cset->equiv_classes);
3799   re_free (cset->range_starts);
3800   re_free (cset->range_ends);
3801 # endif
3802   re_free (cset->char_classes);
3803   re_free (cset);
3804 }
3805 #endif /* RE_ENABLE_I18N */
3806 \f
3807 /* Functions for binary tree operation.  */
3808
3809 /* Create a tree node.  */
3810
3811 static bin_tree_t *
3812 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3813              re_token_type_t type)
3814 {
3815   re_token_t t;
3816 #if defined GCC_LINT || defined lint
3817   memset (&t, 0, sizeof t);
3818 #endif
3819   t.type = type;
3820   return create_token_tree (dfa, left, right, &t);
3821 }
3822
3823 static bin_tree_t *
3824 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3825                    const re_token_t *token)
3826 {
3827   bin_tree_t *tree;
3828   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3829     {
3830       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3831
3832       if (storage == NULL)
3833         return NULL;
3834       storage->next = dfa->str_tree_storage;
3835       dfa->str_tree_storage = storage;
3836       dfa->str_tree_storage_idx = 0;
3837     }
3838   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3839
3840   tree->parent = NULL;
3841   tree->left = left;
3842   tree->right = right;
3843   tree->token = *token;
3844   tree->token.duplicated = 0;
3845   tree->token.opt_subexp = 0;
3846   tree->first = NULL;
3847   tree->next = NULL;
3848   tree->node_idx = -1;
3849
3850   if (left != NULL)
3851     left->parent = tree;
3852   if (right != NULL)
3853     right->parent = tree;
3854   return tree;
3855 }
3856
3857 /* Mark the tree SRC as an optional subexpression.
3858    To be called from preorder or postorder.  */
3859
3860 static reg_errcode_t
3861 mark_opt_subexp (void *extra, bin_tree_t *node)
3862 {
3863   Idx idx = (uintptr_t) extra;
3864   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3865     node->token.opt_subexp = 1;
3866
3867   return REG_NOERROR;
3868 }
3869
3870 /* Free the allocated memory inside NODE. */
3871
3872 static void
3873 free_token (re_token_t *node)
3874 {
3875 #ifdef RE_ENABLE_I18N
3876   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3877     free_charset (node->opr.mbcset);
3878   else
3879 #endif /* RE_ENABLE_I18N */
3880     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3881       re_free (node->opr.sbcset);
3882 }
3883
3884 /* Worker function for tree walking.  Free the allocated memory inside NODE
3885    and its children. */
3886
3887 static reg_errcode_t
3888 free_tree (void *extra, bin_tree_t *node)
3889 {
3890   free_token (&node->token);
3891   return REG_NOERROR;
3892 }
3893
3894
3895 /* Duplicate the node SRC, and return new node.  This is a preorder
3896    visit similar to the one implemented by the generic visitor, but
3897    we need more infrastructure to maintain two parallel trees --- so,
3898    it's easier to duplicate.  */
3899
3900 static bin_tree_t *
3901 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3902 {
3903   const bin_tree_t *node;
3904   bin_tree_t *dup_root;
3905   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3906
3907   for (node = root; ; )
3908     {
3909       /* Create a new tree and link it back to the current parent.  */
3910       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3911       if (*p_new == NULL)
3912         return NULL;
3913       (*p_new)->parent = dup_node;
3914       (*p_new)->token.duplicated = 1;
3915       dup_node = *p_new;
3916
3917       /* Go to the left node, or up and to the right.  */
3918       if (node->left)
3919         {
3920           node = node->left;
3921           p_new = &dup_node->left;
3922         }
3923       else
3924         {
3925           const bin_tree_t *prev = NULL;
3926           while (node->right == prev || node->right == NULL)
3927             {
3928               prev = node;
3929               node = node->parent;
3930               dup_node = dup_node->parent;
3931               if (!node)
3932                 return dup_root;
3933             }
3934           node = node->right;
3935           p_new = &dup_node->right;
3936         }
3937     }
3938 }