]> git.proxmox.com Git - mirror_smartmontools-debian.git/blob - regex/regcomp.c
import smartmontools 7.0
[mirror_smartmontools-debian.git] / regex / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 \f
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
132 static const char __re_error_msgid[] =
133 {
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
186 static const size_t __re_error_msgid_idx[] =
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206 \f
207 /* Entry points for GNU code. */
208
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
212
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
215
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
219 {
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
246
247
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
269 {
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
272
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
283 }
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
291 {
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
295 }
296
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
299
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
303 {
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308 {
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
311
312 if (type == CHARACTER)
313 {
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 {
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
355
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372 # endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
383 ))
384 {
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428 }
429 \f
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466 int
467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468 {
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
472
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
476
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
481
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
491 }
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
496
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
503
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
510 {
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
514 }
515
516 return (int) ret;
517 }
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
520 weak_alias (__regcomp, regcomp)
521 #endif
522
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
525
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 size_t errbuf_size)
529 {
530 const char *msg;
531 size_t msg_size;
532
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
546 if (BE (errbuf_size != 0, 1))
547 {
548 size_t cpy_size = msg_size;
549 if (BE (msg_size > errbuf_size, 0))
550 {
551 cpy_size = errbuf_size - 1;
552 errbuf[cpy_size] = '\0';
553 }
554 memcpy (errbuf, msg, cpy_size);
555 }
556
557 return msg_size;
558 }
559 #ifdef _LIBC
560 weak_alias (__regerror, regerror)
561 #endif
562
563
564 #ifdef RE_ENABLE_I18N
565 /* This static array is used for the map to single-byte characters when
566 UTF-8 is used. Otherwise we would allocate memory just to initialize
567 it the same all the time. UTF-8 is the preferred encoding so this is
568 a worthwhile optimization. */
569 static const bitset_t utf8_sb_map =
570 {
571 /* Set the first 128 bits. */
572 # if defined __GNUC__ && !defined __STRICT_ANSI__
573 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
574 # else
575 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
576 # error "bitset_word_t is narrower than 32 bits"
577 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
579 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX, BITSET_WORD_MAX,
581 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
582 BITSET_WORD_MAX,
583 # endif
584 (BITSET_WORD_MAX
585 >> (SBC_MAX % BITSET_WORD_BITS == 0
586 ? 0
587 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588 # endif
589 };
590 #endif
591
592
593 static void
594 free_dfa_content (re_dfa_t *dfa)
595 {
596 Idx i, j;
597
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
600 free_token (dfa->nodes + i);
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
603 {
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
610 }
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
615
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 {
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
621 {
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
624 }
625 re_free (entry->array);
626 }
627 re_free (dfa->state_table);
628 #ifdef RE_ENABLE_I18N
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
631 #endif
632 re_free (dfa->subexp_map);
633 #ifdef DEBUG
634 re_free (dfa->re_str);
635 #endif
636
637 re_free (dfa);
638 }
639
640
641 /* Free dynamically allocated space used by PREG. */
642
643 void
644 regfree (regex_t *preg)
645 {
646 re_dfa_t *dfa = preg->buffer;
647 if (BE (dfa != NULL, 1))
648 {
649 lock_fini (dfa->lock);
650 free_dfa_content (dfa);
651 }
652 preg->buffer = NULL;
653 preg->allocated = 0;
654
655 re_free (preg->fastmap);
656 preg->fastmap = NULL;
657
658 re_free (preg->translate);
659 preg->translate = NULL;
660 }
661 #ifdef _LIBC
662 libc_hidden_def (__regfree)
663 weak_alias (__regfree, regfree)
664 #endif
665 \f
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669 #if defined _REGEX_RE_COMP || defined _LIBC
670
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
673
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (const char *s)
682 {
683 reg_errcode_t ret;
684 char *fastmap;
685
686 if (!s)
687 {
688 if (!re_comp_buf.buffer)
689 return gettext ("No previous regular expression");
690 return 0;
691 }
692
693 if (re_comp_buf.buffer)
694 {
695 fastmap = re_comp_buf.fastmap;
696 re_comp_buf.fastmap = NULL;
697 __regfree (&re_comp_buf);
698 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
699 re_comp_buf.fastmap = fastmap;
700 }
701
702 if (re_comp_buf.fastmap == NULL)
703 {
704 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
705 if (re_comp_buf.fastmap == NULL)
706 return (char *) gettext (__re_error_msgid
707 + __re_error_msgid_idx[(int) REG_ESPACE]);
708 }
709
710 /* Since 're_exec' always passes NULL for the 'regs' argument, we
711 don't need to initialize the pattern buffer fields which affect it. */
712
713 /* Match anchors at newlines. */
714 re_comp_buf.newline_anchor = 1;
715
716 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717
718 if (!ret)
719 return NULL;
720
721 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
722 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
723 }
724
725 #ifdef _LIBC
726 libc_freeres_fn (free_mem)
727 {
728 __regfree (&re_comp_buf);
729 }
730 #endif
731
732 #endif /* _REGEX_RE_COMP */
733 \f
734 /* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
737
738 static reg_errcode_t
739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
741 {
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
745
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
754
755 /* Initialize the dfa. */
756 dfa = preg->buffer;
757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758 {
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
766 preg->allocated = sizeof (re_dfa_t);
767 preg->buffer = dfa;
768 }
769 preg->used = sizeof (re_dfa_t);
770
771 err = init_dfa (dfa, length);
772 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773 err = REG_ESPACE;
774 if (BE (err != REG_NOERROR, 0))
775 {
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
780 }
781 #ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785 #endif
786
787 err = re_string_construct (&regexp, pattern, length, preg->translate,
788 (syntax & RE_ICASE) != 0, dfa);
789 if (BE (err != REG_NOERROR, 0))
790 {
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
794 lock_fini (dfa->lock);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
806
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
817
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
820
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
824
825 if (BE (err != REG_NOERROR, 0))
826 {
827 lock_fini (dfa->lock);
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
831 }
832
833 return err;
834 }
835
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839 static reg_errcode_t
840 init_dfa (re_dfa_t *dfa, size_t pat_len)
841 {
842 __re_size_t table_size;
843 #if !defined(_LIBC) && !defined(_REGEX_STANDALONE)
844 const char *codeset_name;
845 #endif
846 #ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848 #else
849 size_t max_i18n_object_size = 0;
850 #endif
851 size_t max_object_size =
852 MAX (sizeof (struct re_state_table_entry),
853 MAX (sizeof (re_token_t),
854 MAX (sizeof (re_node_set),
855 MAX (sizeof (regmatch_t),
856 max_i18n_object_size))));
857
858 memset (dfa, '\0', sizeof (re_dfa_t));
859
860 /* Force allocation of str_tree_storage the first time. */
861 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
868 return REG_ESPACE;
869
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size = 1; ; table_size <<= 1)
875 if (table_size > pat_len)
876 break;
877
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
880
881 dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883 if (dfa->mb_cur_max == 6
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
888 #else
889 # if !defined(_REGEX_STANDALONE)
890 codeset_name = nl_langinfo (CODESET);
891 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
892 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
893 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
894 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
895 dfa->is_utf8 = 1;
896 # endif
897
898 /* We check exhaustively in the loop below if this charset is a
899 superset of ASCII. */
900 dfa->map_notascii = 0;
901 #endif
902
903 #ifdef RE_ENABLE_I18N
904 if (dfa->mb_cur_max > 1)
905 {
906 if (dfa->is_utf8)
907 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
908 else
909 {
910 int i, j, ch;
911
912 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
913 if (BE (dfa->sb_char == NULL, 0))
914 return REG_ESPACE;
915
916 /* Set the bits corresponding to single byte chars. */
917 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
918 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
919 {
920 wint_t wch = __btowc (ch);
921 if (wch != WEOF)
922 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
923 # ifndef _LIBC
924 if (isascii (ch) && wch != ch)
925 dfa->map_notascii = 1;
926 # endif
927 }
928 }
929 }
930 #endif
931
932 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
933 return REG_ESPACE;
934 return REG_NOERROR;
935 }
936
937 /* Initialize WORD_CHAR table, which indicate which character is
938 "word". In this case "word" means that it is the word construction
939 character used by some operators like "\<", "\>", etc. */
940
941 static void
942 init_word_char (re_dfa_t *dfa)
943 {
944 int i = 0;
945 int j;
946 int ch = 0;
947 dfa->word_ops_used = 1;
948 if (BE (dfa->map_notascii == 0, 1))
949 {
950 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
951 them, an issue when this code is used in Gnulib. */
952 bitset_word_t bits0 = 0x00000000;
953 bitset_word_t bits1 = 0x03ff0000;
954 bitset_word_t bits2 = 0x87fffffe;
955 bitset_word_t bits3 = 0x07fffffe;
956 if (BITSET_WORD_BITS == 64)
957 {
958 /* Pacify gcc -Woverflow on 32-bit platformns. */
959 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
960 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
961 i = 2;
962 }
963 else if (BITSET_WORD_BITS == 32)
964 {
965 dfa->word_char[0] = bits0;
966 dfa->word_char[1] = bits1;
967 dfa->word_char[2] = bits2;
968 dfa->word_char[3] = bits3;
969 i = 4;
970 }
971 else
972 goto general_case;
973 ch = 128;
974
975 if (BE (dfa->is_utf8, 1))
976 {
977 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
978 return;
979 }
980 }
981
982 general_case:
983 for (; i < BITSET_WORDS; ++i)
984 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
985 if (isalnum (ch) || ch == '_')
986 dfa->word_char[i] |= (bitset_word_t) 1 << j;
987 }
988
989 /* Free the work area which are only used while compiling. */
990
991 static void
992 free_workarea_compile (regex_t *preg)
993 {
994 re_dfa_t *dfa = preg->buffer;
995 bin_tree_storage_t *storage, *next;
996 for (storage = dfa->str_tree_storage; storage; storage = next)
997 {
998 next = storage->next;
999 re_free (storage);
1000 }
1001 dfa->str_tree_storage = NULL;
1002 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1003 dfa->str_tree = NULL;
1004 re_free (dfa->org_indices);
1005 dfa->org_indices = NULL;
1006 }
1007
1008 /* Create initial states for all contexts. */
1009
1010 static reg_errcode_t
1011 create_initial_state (re_dfa_t *dfa)
1012 {
1013 Idx first, i;
1014 reg_errcode_t err;
1015 re_node_set init_nodes;
1016
1017 /* Initial states have the epsilon closure of the node which is
1018 the first node of the regular expression. */
1019 first = dfa->str_tree->first->node_idx;
1020 dfa->init_node = first;
1021 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1022 if (BE (err != REG_NOERROR, 0))
1023 return err;
1024
1025 /* The back-references which are in initial states can epsilon transit,
1026 since in this case all of the subexpressions can be null.
1027 Then we add epsilon closures of the nodes which are the next nodes of
1028 the back-references. */
1029 if (dfa->nbackref > 0)
1030 for (i = 0; i < init_nodes.nelem; ++i)
1031 {
1032 Idx node_idx = init_nodes.elems[i];
1033 re_token_type_t type = dfa->nodes[node_idx].type;
1034
1035 Idx clexp_idx;
1036 if (type != OP_BACK_REF)
1037 continue;
1038 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1039 {
1040 re_token_t *clexp_node;
1041 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1042 if (clexp_node->type == OP_CLOSE_SUBEXP
1043 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1044 break;
1045 }
1046 if (clexp_idx == init_nodes.nelem)
1047 continue;
1048
1049 if (type == OP_BACK_REF)
1050 {
1051 Idx dest_idx = dfa->edests[node_idx].elems[0];
1052 if (!re_node_set_contains (&init_nodes, dest_idx))
1053 {
1054 reg_errcode_t merge_err
1055 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1056 if (merge_err != REG_NOERROR)
1057 return merge_err;
1058 i = 0;
1059 }
1060 }
1061 }
1062
1063 /* It must be the first time to invoke acquire_state. */
1064 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1065 /* We don't check ERR here, since the initial state must not be NULL. */
1066 if (BE (dfa->init_state == NULL, 0))
1067 return err;
1068 if (dfa->init_state->has_constraint)
1069 {
1070 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1071 CONTEXT_WORD);
1072 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1073 CONTEXT_NEWLINE);
1074 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1075 &init_nodes,
1076 CONTEXT_NEWLINE
1077 | CONTEXT_BEGBUF);
1078 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1079 || dfa->init_state_begbuf == NULL, 0))
1080 return err;
1081 }
1082 else
1083 dfa->init_state_word = dfa->init_state_nl
1084 = dfa->init_state_begbuf = dfa->init_state;
1085
1086 re_node_set_free (&init_nodes);
1087 return REG_NOERROR;
1088 }
1089 \f
1090 #ifdef RE_ENABLE_I18N
1091 /* If it is possible to do searching in single byte encoding instead of UTF-8
1092 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1093 DFA nodes where needed. */
1094
1095 static void
1096 optimize_utf8 (re_dfa_t *dfa)
1097 {
1098 Idx node;
1099 int i;
1100 bool mb_chars = false;
1101 bool has_period = false;
1102
1103 for (node = 0; node < dfa->nodes_len; ++node)
1104 switch (dfa->nodes[node].type)
1105 {
1106 case CHARACTER:
1107 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1108 mb_chars = true;
1109 break;
1110 case ANCHOR:
1111 switch (dfa->nodes[node].opr.ctx_type)
1112 {
1113 case LINE_FIRST:
1114 case LINE_LAST:
1115 case BUF_FIRST:
1116 case BUF_LAST:
1117 break;
1118 default:
1119 /* Word anchors etc. cannot be handled. It's okay to test
1120 opr.ctx_type since constraints (for all DFA nodes) are
1121 created by ORing one or more opr.ctx_type values. */
1122 return;
1123 }
1124 break;
1125 case OP_PERIOD:
1126 has_period = true;
1127 break;
1128 case OP_BACK_REF:
1129 case OP_ALT:
1130 case END_OF_RE:
1131 case OP_DUP_ASTERISK:
1132 case OP_OPEN_SUBEXP:
1133 case OP_CLOSE_SUBEXP:
1134 break;
1135 case COMPLEX_BRACKET:
1136 return;
1137 case SIMPLE_BRACKET:
1138 /* Just double check. */
1139 {
1140 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1141 ? 0
1142 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1143 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1144 {
1145 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1146 return;
1147 rshift = 0;
1148 }
1149 }
1150 break;
1151 default:
1152 abort ();
1153 }
1154
1155 if (mb_chars || has_period)
1156 for (node = 0; node < dfa->nodes_len; ++node)
1157 {
1158 if (dfa->nodes[node].type == CHARACTER
1159 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1160 dfa->nodes[node].mb_partial = 0;
1161 else if (dfa->nodes[node].type == OP_PERIOD)
1162 dfa->nodes[node].type = OP_UTF8_PERIOD;
1163 }
1164
1165 /* The search can be in single byte locale. */
1166 dfa->mb_cur_max = 1;
1167 dfa->is_utf8 = 0;
1168 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1169 }
1170 #endif
1171 \f
1172 /* Analyze the structure tree, and calculate "first", "next", "edest",
1173 "eclosure", and "inveclosure". */
1174
1175 static reg_errcode_t
1176 analyze (regex_t *preg)
1177 {
1178 re_dfa_t *dfa = preg->buffer;
1179 reg_errcode_t ret;
1180
1181 /* Allocate arrays. */
1182 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1183 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1184 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1185 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1186 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1187 || dfa->eclosures == NULL, 0))
1188 return REG_ESPACE;
1189
1190 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1191 if (dfa->subexp_map != NULL)
1192 {
1193 Idx i;
1194 for (i = 0; i < preg->re_nsub; i++)
1195 dfa->subexp_map[i] = i;
1196 preorder (dfa->str_tree, optimize_subexps, dfa);
1197 for (i = 0; i < preg->re_nsub; i++)
1198 if (dfa->subexp_map[i] != i)
1199 break;
1200 if (i == preg->re_nsub)
1201 {
1202 re_free (dfa->subexp_map);
1203 dfa->subexp_map = NULL;
1204 }
1205 }
1206
1207 ret = postorder (dfa->str_tree, lower_subexps, preg);
1208 if (BE (ret != REG_NOERROR, 0))
1209 return ret;
1210 ret = postorder (dfa->str_tree, calc_first, dfa);
1211 if (BE (ret != REG_NOERROR, 0))
1212 return ret;
1213 preorder (dfa->str_tree, calc_next, dfa);
1214 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1215 if (BE (ret != REG_NOERROR, 0))
1216 return ret;
1217 ret = calc_eclosure (dfa);
1218 if (BE (ret != REG_NOERROR, 0))
1219 return ret;
1220
1221 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1222 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1223 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1224 || dfa->nbackref)
1225 {
1226 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1227 if (BE (dfa->inveclosures == NULL, 0))
1228 return REG_ESPACE;
1229 ret = calc_inveclosure (dfa);
1230 }
1231
1232 return ret;
1233 }
1234
1235 /* Our parse trees are very unbalanced, so we cannot use a stack to
1236 implement parse tree visits. Instead, we use parent pointers and
1237 some hairy code in these two functions. */
1238 static reg_errcode_t
1239 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1240 void *extra)
1241 {
1242 bin_tree_t *node, *prev;
1243
1244 for (node = root; ; )
1245 {
1246 /* Descend down the tree, preferably to the left (or to the right
1247 if that's the only child). */
1248 while (node->left || node->right)
1249 if (node->left)
1250 node = node->left;
1251 else
1252 node = node->right;
1253
1254 do
1255 {
1256 reg_errcode_t err = fn (extra, node);
1257 if (BE (err != REG_NOERROR, 0))
1258 return err;
1259 if (node->parent == NULL)
1260 return REG_NOERROR;
1261 prev = node;
1262 node = node->parent;
1263 }
1264 /* Go up while we have a node that is reached from the right. */
1265 while (node->right == prev || node->right == NULL);
1266 node = node->right;
1267 }
1268 }
1269
1270 static reg_errcode_t
1271 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1272 void *extra)
1273 {
1274 bin_tree_t *node;
1275
1276 for (node = root; ; )
1277 {
1278 reg_errcode_t err = fn (extra, node);
1279 if (BE (err != REG_NOERROR, 0))
1280 return err;
1281
1282 /* Go to the left node, or up and to the right. */
1283 if (node->left)
1284 node = node->left;
1285 else
1286 {
1287 bin_tree_t *prev = NULL;
1288 while (node->right == prev || node->right == NULL)
1289 {
1290 prev = node;
1291 node = node->parent;
1292 if (!node)
1293 return REG_NOERROR;
1294 }
1295 node = node->right;
1296 }
1297 }
1298 }
1299
1300 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1301 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1302 backreferences as well. Requires a preorder visit. */
1303 static reg_errcode_t
1304 optimize_subexps (void *extra, bin_tree_t *node)
1305 {
1306 re_dfa_t *dfa = (re_dfa_t *) extra;
1307
1308 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1309 {
1310 int idx = node->token.opr.idx;
1311 node->token.opr.idx = dfa->subexp_map[idx];
1312 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1313 }
1314
1315 else if (node->token.type == SUBEXP
1316 && node->left && node->left->token.type == SUBEXP)
1317 {
1318 Idx other_idx = node->left->token.opr.idx;
1319
1320 node->left = node->left->left;
1321 if (node->left)
1322 node->left->parent = node;
1323
1324 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1325 if (other_idx < BITSET_WORD_BITS)
1326 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1327 }
1328
1329 return REG_NOERROR;
1330 }
1331
1332 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1333 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1334 static reg_errcode_t
1335 lower_subexps (void *extra, bin_tree_t *node)
1336 {
1337 regex_t *preg = (regex_t *) extra;
1338 reg_errcode_t err = REG_NOERROR;
1339
1340 if (node->left && node->left->token.type == SUBEXP)
1341 {
1342 node->left = lower_subexp (&err, preg, node->left);
1343 if (node->left)
1344 node->left->parent = node;
1345 }
1346 if (node->right && node->right->token.type == SUBEXP)
1347 {
1348 node->right = lower_subexp (&err, preg, node->right);
1349 if (node->right)
1350 node->right->parent = node;
1351 }
1352
1353 return err;
1354 }
1355
1356 static bin_tree_t *
1357 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1358 {
1359 re_dfa_t *dfa = preg->buffer;
1360 bin_tree_t *body = node->left;
1361 bin_tree_t *op, *cls, *tree1, *tree;
1362
1363 if (preg->no_sub
1364 /* We do not optimize empty subexpressions, because otherwise we may
1365 have bad CONCAT nodes with NULL children. This is obviously not
1366 very common, so we do not lose much. An example that triggers
1367 this case is the sed "script" /\(\)/x. */
1368 && node->left != NULL
1369 && (node->token.opr.idx >= BITSET_WORD_BITS
1370 || !(dfa->used_bkref_map
1371 & ((bitset_word_t) 1 << node->token.opr.idx))))
1372 return node->left;
1373
1374 /* Convert the SUBEXP node to the concatenation of an
1375 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1376 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1377 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1378 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1379 tree = create_tree (dfa, op, tree1, CONCAT);
1380 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1381 {
1382 *err = REG_ESPACE;
1383 return NULL;
1384 }
1385
1386 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388 return tree;
1389 }
1390
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392 nodes. Requires a postorder visit. */
1393 static reg_errcode_t
1394 calc_first (void *extra, bin_tree_t *node)
1395 {
1396 re_dfa_t *dfa = (re_dfa_t *) extra;
1397 if (node->token.type == CONCAT)
1398 {
1399 node->first = node->left->first;
1400 node->node_idx = node->left->node_idx;
1401 }
1402 else
1403 {
1404 node->first = node;
1405 node->node_idx = re_dfa_add_node (dfa, node->token);
1406 if (BE (node->node_idx == -1, 0))
1407 return REG_ESPACE;
1408 if (node->token.type == ANCHOR)
1409 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1410 }
1411 return REG_NOERROR;
1412 }
1413
1414 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1415 static reg_errcode_t
1416 calc_next (void *extra, bin_tree_t *node)
1417 {
1418 switch (node->token.type)
1419 {
1420 case OP_DUP_ASTERISK:
1421 node->left->next = node;
1422 break;
1423 case CONCAT:
1424 node->left->next = node->right->first;
1425 node->right->next = node->next;
1426 break;
1427 default:
1428 if (node->left)
1429 node->left->next = node->next;
1430 if (node->right)
1431 node->right->next = node->next;
1432 break;
1433 }
1434 return REG_NOERROR;
1435 }
1436
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra, bin_tree_t *node)
1440 {
1441 re_dfa_t *dfa = (re_dfa_t *) extra;
1442 Idx idx = node->node_idx;
1443 reg_errcode_t err = REG_NOERROR;
1444
1445 switch (node->token.type)
1446 {
1447 case CONCAT:
1448 break;
1449
1450 case END_OF_RE:
1451 assert (node->next == NULL);
1452 break;
1453
1454 case OP_DUP_ASTERISK:
1455 case OP_ALT:
1456 {
1457 Idx left, right;
1458 dfa->has_plural_match = 1;
1459 if (node->left != NULL)
1460 left = node->left->first->node_idx;
1461 else
1462 left = node->next->node_idx;
1463 if (node->right != NULL)
1464 right = node->right->first->node_idx;
1465 else
1466 right = node->next->node_idx;
1467 assert (left > -1);
1468 assert (right > -1);
1469 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1470 }
1471 break;
1472
1473 case ANCHOR:
1474 case OP_OPEN_SUBEXP:
1475 case OP_CLOSE_SUBEXP:
1476 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477 break;
1478
1479 case OP_BACK_REF:
1480 dfa->nexts[idx] = node->next->node_idx;
1481 if (node->token.type == OP_BACK_REF)
1482 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483 break;
1484
1485 default:
1486 assert (!IS_EPSILON_NODE (node->token.type));
1487 dfa->nexts[idx] = node->next->node_idx;
1488 break;
1489 }
1490
1491 return err;
1492 }
1493
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496 to their own constraint. */
1497
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500 Idx root_node, unsigned int init_constraint)
1501 {
1502 Idx org_node, clone_node;
1503 bool ok;
1504 unsigned int constraint = init_constraint;
1505 for (org_node = top_org_node, clone_node = top_clone_node;;)
1506 {
1507 Idx org_dest, clone_dest;
1508 if (dfa->nodes[org_node].type == OP_BACK_REF)
1509 {
1510 /* If the back reference epsilon-transit, its destination must
1511 also have the constraint. Then duplicate the epsilon closure
1512 of the destination of the back reference, and store it in
1513 edests of the back reference. */
1514 org_dest = dfa->nexts[org_node];
1515 re_node_set_empty (dfa->edests + clone_node);
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (BE (clone_dest == -1, 0))
1518 return REG_ESPACE;
1519 dfa->nexts[clone_node] = dfa->nexts[org_node];
1520 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 if (BE (! ok, 0))
1522 return REG_ESPACE;
1523 }
1524 else if (dfa->edests[org_node].nelem == 0)
1525 {
1526 /* In case of the node can't epsilon-transit, don't duplicate the
1527 destination and store the original destination as the
1528 destination of the node. */
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 break;
1531 }
1532 else if (dfa->edests[org_node].nelem == 1)
1533 {
1534 /* In case of the node can epsilon-transit, and it has only one
1535 destination. */
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
1538 /* If the node is root_node itself, it means the epsilon closure
1539 has a loop. Then tie it to the destination of the root_node. */
1540 if (org_node == root_node && clone_node != org_node)
1541 {
1542 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543 if (BE (! ok, 0))
1544 return REG_ESPACE;
1545 break;
1546 }
1547 /* In case the node has another constraint, append it. */
1548 constraint |= dfa->nodes[org_node].constraint;
1549 clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 if (BE (clone_dest == -1, 0))
1551 return REG_ESPACE;
1552 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553 if (BE (! ok, 0))
1554 return REG_ESPACE;
1555 }
1556 else /* dfa->edests[org_node].nelem == 2 */
1557 {
1558 /* In case of the node can epsilon-transit, and it has two
1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1560 org_dest = dfa->edests[org_node].elems[0];
1561 re_node_set_empty (dfa->edests + clone_node);
1562 /* Search for a duplicated node which satisfies the constraint. */
1563 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564 if (clone_dest == -1)
1565 {
1566 /* There is no such duplicated node, create a new one. */
1567 reg_errcode_t err;
1568 clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 if (BE (clone_dest == -1, 0))
1570 return REG_ESPACE;
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (BE (! ok, 0))
1573 return REG_ESPACE;
1574 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575 root_node, constraint);
1576 if (BE (err != REG_NOERROR, 0))
1577 return err;
1578 }
1579 else
1580 {
1581 /* There is a duplicated node which satisfies the constraint,
1582 use it to avoid infinite loop. */
1583 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584 if (BE (! ok, 0))
1585 return REG_ESPACE;
1586 }
1587
1588 org_dest = dfa->edests[org_node].elems[1];
1589 clone_dest = duplicate_node (dfa, org_dest, constraint);
1590 if (BE (clone_dest == -1, 0))
1591 return REG_ESPACE;
1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 if (BE (! ok, 0))
1594 return REG_ESPACE;
1595 }
1596 org_node = org_dest;
1597 clone_node = clone_dest;
1598 }
1599 return REG_NOERROR;
1600 }
1601
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603 satisfies the constraint CONSTRAINT. */
1604
1605 static Idx
1606 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607 unsigned int constraint)
1608 {
1609 Idx idx;
1610 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1611 {
1612 if (org_node == dfa->org_indices[idx]
1613 && constraint == dfa->nodes[idx].constraint)
1614 return idx; /* Found. */
1615 }
1616 return -1; /* Not found. */
1617 }
1618
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620 Return the index of the new node, or -1 if insufficient storage is
1621 available. */
1622
1623 static Idx
1624 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1625 {
1626 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627 if (BE (dup_idx != -1, 1))
1628 {
1629 dfa->nodes[dup_idx].constraint = constraint;
1630 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1631 dfa->nodes[dup_idx].duplicated = 1;
1632
1633 /* Store the index of the original node. */
1634 dfa->org_indices[dup_idx] = org_idx;
1635 }
1636 return dup_idx;
1637 }
1638
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t *dfa)
1641 {
1642 Idx src, idx;
1643 bool ok;
1644 for (idx = 0; idx < dfa->nodes_len; ++idx)
1645 re_node_set_init_empty (dfa->inveclosures + idx);
1646
1647 for (src = 0; src < dfa->nodes_len; ++src)
1648 {
1649 Idx *elems = dfa->eclosures[src].elems;
1650 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1651 {
1652 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653 if (BE (! ok, 0))
1654 return REG_ESPACE;
1655 }
1656 }
1657
1658 return REG_NOERROR;
1659 }
1660
1661 /* Calculate "eclosure" for all the node in DFA. */
1662
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t *dfa)
1665 {
1666 Idx node_idx;
1667 bool incomplete;
1668 #ifdef DEBUG
1669 assert (dfa->nodes_len > 0);
1670 #endif
1671 incomplete = false;
1672 /* For each nodes, calculate epsilon closure. */
1673 for (node_idx = 0; ; ++node_idx)
1674 {
1675 reg_errcode_t err;
1676 re_node_set eclosure_elem;
1677 if (node_idx == dfa->nodes_len)
1678 {
1679 if (!incomplete)
1680 break;
1681 incomplete = false;
1682 node_idx = 0;
1683 }
1684
1685 #ifdef DEBUG
1686 assert (dfa->eclosures[node_idx].nelem != -1);
1687 #endif
1688
1689 /* If we have already calculated, skip it. */
1690 if (dfa->eclosures[node_idx].nelem != 0)
1691 continue;
1692 /* Calculate epsilon closure of 'node_idx'. */
1693 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1696
1697 if (dfa->eclosures[node_idx].nelem == 0)
1698 {
1699 incomplete = true;
1700 re_node_set_free (&eclosure_elem);
1701 }
1702 }
1703 return REG_NOERROR;
1704 }
1705
1706 /* Calculate epsilon closure of NODE. */
1707
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1710 {
1711 reg_errcode_t err;
1712 Idx i;
1713 re_node_set eclosure;
1714 bool ok;
1715 bool incomplete = false;
1716 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1717 if (BE (err != REG_NOERROR, 0))
1718 return err;
1719
1720 /* This indicates that we are calculating this node now.
1721 We reference this value to avoid infinite loop. */
1722 dfa->eclosures[node].nelem = -1;
1723
1724 /* If the current node has constraints, duplicate all nodes
1725 since they must inherit the constraints. */
1726 if (dfa->nodes[node].constraint
1727 && dfa->edests[node].nelem
1728 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1729 {
1730 err = duplicate_node_closure (dfa, node, node, node,
1731 dfa->nodes[node].constraint);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 }
1735
1736 /* Expand each epsilon destination nodes. */
1737 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1738 for (i = 0; i < dfa->edests[node].nelem; ++i)
1739 {
1740 re_node_set eclosure_elem;
1741 Idx edest = dfa->edests[node].elems[i];
1742 /* If calculating the epsilon closure of 'edest' is in progress,
1743 return intermediate result. */
1744 if (dfa->eclosures[edest].nelem == -1)
1745 {
1746 incomplete = true;
1747 continue;
1748 }
1749 /* If we haven't calculated the epsilon closure of 'edest' yet,
1750 calculate now. Otherwise use calculated epsilon closure. */
1751 if (dfa->eclosures[edest].nelem == 0)
1752 {
1753 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1754 if (BE (err != REG_NOERROR, 0))
1755 return err;
1756 }
1757 else
1758 eclosure_elem = dfa->eclosures[edest];
1759 /* Merge the epsilon closure of 'edest'. */
1760 err = re_node_set_merge (&eclosure, &eclosure_elem);
1761 if (BE (err != REG_NOERROR, 0))
1762 return err;
1763 /* If the epsilon closure of 'edest' is incomplete,
1764 the epsilon closure of this node is also incomplete. */
1765 if (dfa->eclosures[edest].nelem == 0)
1766 {
1767 incomplete = true;
1768 re_node_set_free (&eclosure_elem);
1769 }
1770 }
1771
1772 /* An epsilon closure includes itself. */
1773 ok = re_node_set_insert (&eclosure, node);
1774 if (BE (! ok, 0))
1775 return REG_ESPACE;
1776 if (incomplete && !root)
1777 dfa->eclosures[node].nelem = 0;
1778 else
1779 dfa->eclosures[node] = eclosure;
1780 *new_set = eclosure;
1781 return REG_NOERROR;
1782 }
1783 \f
1784 /* Functions for token which are used in the parser. */
1785
1786 /* Fetch a token from INPUT.
1787 We must not use this function inside bracket expressions. */
1788
1789 static void
1790 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1791 {
1792 re_string_skip_bytes (input, peek_token (result, input, syntax));
1793 }
1794
1795 /* Peek a token from INPUT, and return the length of the token.
1796 We must not use this function inside bracket expressions. */
1797
1798 static int
1799 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1800 {
1801 unsigned char c;
1802
1803 if (re_string_eoi (input))
1804 {
1805 token->type = END_OF_RE;
1806 return 0;
1807 }
1808
1809 c = re_string_peek_byte (input, 0);
1810 token->opr.c = c;
1811
1812 token->word_char = 0;
1813 #ifdef RE_ENABLE_I18N
1814 token->mb_partial = 0;
1815 if (input->mb_cur_max > 1 &&
1816 !re_string_first_byte (input, re_string_cur_idx (input)))
1817 {
1818 token->type = CHARACTER;
1819 token->mb_partial = 1;
1820 return 1;
1821 }
1822 #endif
1823 if (c == '\\')
1824 {
1825 unsigned char c2;
1826 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1827 {
1828 token->type = BACK_SLASH;
1829 return 1;
1830 }
1831
1832 c2 = re_string_peek_byte_case (input, 1);
1833 token->opr.c = c2;
1834 token->type = CHARACTER;
1835 #ifdef RE_ENABLE_I18N
1836 if (input->mb_cur_max > 1)
1837 {
1838 wint_t wc = re_string_wchar_at (input,
1839 re_string_cur_idx (input) + 1);
1840 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1841 }
1842 else
1843 #endif
1844 token->word_char = IS_WORD_CHAR (c2) != 0;
1845
1846 switch (c2)
1847 {
1848 case '|':
1849 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1850 token->type = OP_ALT;
1851 break;
1852 case '1': case '2': case '3': case '4': case '5':
1853 case '6': case '7': case '8': case '9':
1854 if (!(syntax & RE_NO_BK_REFS))
1855 {
1856 token->type = OP_BACK_REF;
1857 token->opr.idx = c2 - '1';
1858 }
1859 break;
1860 case '<':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 {
1863 token->type = ANCHOR;
1864 token->opr.ctx_type = WORD_FIRST;
1865 }
1866 break;
1867 case '>':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 {
1870 token->type = ANCHOR;
1871 token->opr.ctx_type = WORD_LAST;
1872 }
1873 break;
1874 case 'b':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 {
1877 token->type = ANCHOR;
1878 token->opr.ctx_type = WORD_DELIM;
1879 }
1880 break;
1881 case 'B':
1882 if (!(syntax & RE_NO_GNU_OPS))
1883 {
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = NOT_WORD_DELIM;
1886 }
1887 break;
1888 case 'w':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = OP_WORD;
1891 break;
1892 case 'W':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = OP_NOTWORD;
1895 break;
1896 case 's':
1897 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = OP_SPACE;
1899 break;
1900 case 'S':
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 token->type = OP_NOTSPACE;
1903 break;
1904 case '`':
1905 if (!(syntax & RE_NO_GNU_OPS))
1906 {
1907 token->type = ANCHOR;
1908 token->opr.ctx_type = BUF_FIRST;
1909 }
1910 break;
1911 case '\'':
1912 if (!(syntax & RE_NO_GNU_OPS))
1913 {
1914 token->type = ANCHOR;
1915 token->opr.ctx_type = BUF_LAST;
1916 }
1917 break;
1918 case '(':
1919 if (!(syntax & RE_NO_BK_PARENS))
1920 token->type = OP_OPEN_SUBEXP;
1921 break;
1922 case ')':
1923 if (!(syntax & RE_NO_BK_PARENS))
1924 token->type = OP_CLOSE_SUBEXP;
1925 break;
1926 case '+':
1927 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928 token->type = OP_DUP_PLUS;
1929 break;
1930 case '?':
1931 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1932 token->type = OP_DUP_QUESTION;
1933 break;
1934 case '{':
1935 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936 token->type = OP_OPEN_DUP_NUM;
1937 break;
1938 case '}':
1939 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1940 token->type = OP_CLOSE_DUP_NUM;
1941 break;
1942 default:
1943 break;
1944 }
1945 return 2;
1946 }
1947
1948 token->type = CHARACTER;
1949 #ifdef RE_ENABLE_I18N
1950 if (input->mb_cur_max > 1)
1951 {
1952 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1953 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1954 }
1955 else
1956 #endif
1957 token->word_char = IS_WORD_CHAR (token->opr.c);
1958
1959 switch (c)
1960 {
1961 case '\n':
1962 if (syntax & RE_NEWLINE_ALT)
1963 token->type = OP_ALT;
1964 break;
1965 case '|':
1966 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1967 token->type = OP_ALT;
1968 break;
1969 case '*':
1970 token->type = OP_DUP_ASTERISK;
1971 break;
1972 case '+':
1973 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974 token->type = OP_DUP_PLUS;
1975 break;
1976 case '?':
1977 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1978 token->type = OP_DUP_QUESTION;
1979 break;
1980 case '{':
1981 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982 token->type = OP_OPEN_DUP_NUM;
1983 break;
1984 case '}':
1985 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1986 token->type = OP_CLOSE_DUP_NUM;
1987 break;
1988 case '(':
1989 if (syntax & RE_NO_BK_PARENS)
1990 token->type = OP_OPEN_SUBEXP;
1991 break;
1992 case ')':
1993 if (syntax & RE_NO_BK_PARENS)
1994 token->type = OP_CLOSE_SUBEXP;
1995 break;
1996 case '[':
1997 token->type = OP_OPEN_BRACKET;
1998 break;
1999 case '.':
2000 token->type = OP_PERIOD;
2001 break;
2002 case '^':
2003 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2004 re_string_cur_idx (input) != 0)
2005 {
2006 char prev = re_string_peek_byte (input, -1);
2007 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2008 break;
2009 }
2010 token->type = ANCHOR;
2011 token->opr.ctx_type = LINE_FIRST;
2012 break;
2013 case '$':
2014 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2015 re_string_cur_idx (input) + 1 != re_string_length (input))
2016 {
2017 re_token_t next;
2018 re_string_skip_bytes (input, 1);
2019 peek_token (&next, input, syntax);
2020 re_string_skip_bytes (input, -1);
2021 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2022 break;
2023 }
2024 token->type = ANCHOR;
2025 token->opr.ctx_type = LINE_LAST;
2026 break;
2027 default:
2028 break;
2029 }
2030 return 1;
2031 }
2032
2033 /* Peek a token from INPUT, and return the length of the token.
2034 We must not use this function out of bracket expressions. */
2035
2036 static int
2037 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2038 {
2039 unsigned char c;
2040 if (re_string_eoi (input))
2041 {
2042 token->type = END_OF_RE;
2043 return 0;
2044 }
2045 c = re_string_peek_byte (input, 0);
2046 token->opr.c = c;
2047
2048 #ifdef RE_ENABLE_I18N
2049 if (input->mb_cur_max > 1 &&
2050 !re_string_first_byte (input, re_string_cur_idx (input)))
2051 {
2052 token->type = CHARACTER;
2053 return 1;
2054 }
2055 #endif /* RE_ENABLE_I18N */
2056
2057 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058 && re_string_cur_idx (input) + 1 < re_string_length (input))
2059 {
2060 /* In this case, '\' escape a character. */
2061 unsigned char c2;
2062 re_string_skip_bytes (input, 1);
2063 c2 = re_string_peek_byte (input, 0);
2064 token->opr.c = c2;
2065 token->type = CHARACTER;
2066 return 1;
2067 }
2068 if (c == '[') /* '[' is a special char in a bracket exps. */
2069 {
2070 unsigned char c2;
2071 int token_len;
2072 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073 c2 = re_string_peek_byte (input, 1);
2074 else
2075 c2 = 0;
2076 token->opr.c = c2;
2077 token_len = 2;
2078 switch (c2)
2079 {
2080 case '.':
2081 token->type = OP_OPEN_COLL_ELEM;
2082 break;
2083
2084 case '=':
2085 token->type = OP_OPEN_EQUIV_CLASS;
2086 break;
2087
2088 case ':':
2089 if (syntax & RE_CHAR_CLASSES)
2090 {
2091 token->type = OP_OPEN_CHAR_CLASS;
2092 break;
2093 }
2094 FALLTHROUGH;
2095 default:
2096 token->type = CHARACTER;
2097 token->opr.c = c;
2098 token_len = 1;
2099 break;
2100 }
2101 return token_len;
2102 }
2103 switch (c)
2104 {
2105 case '-':
2106 token->type = OP_CHARSET_RANGE;
2107 break;
2108 case ']':
2109 token->type = OP_CLOSE_BRACKET;
2110 break;
2111 case '^':
2112 token->type = OP_NON_MATCH_LIST;
2113 break;
2114 default:
2115 token->type = CHARACTER;
2116 }
2117 return 1;
2118 }
2119 \f
2120 /* Functions for parser. */
2121
2122 /* Entry point of the parser.
2123 Parse the regular expression REGEXP and return the structure tree.
2124 If an error occurs, ERR is set by error code, and return NULL.
2125 This function build the following tree, from regular expression <reg_exp>:
2126 CAT
2127 / \
2128 / \
2129 <reg_exp> EOR
2130
2131 CAT means concatenation.
2132 EOR means end of regular expression. */
2133
2134 static bin_tree_t *
2135 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2136 reg_errcode_t *err)
2137 {
2138 re_dfa_t *dfa = preg->buffer;
2139 bin_tree_t *tree, *eor, *root;
2140 re_token_t current_token;
2141 dfa->syntax = syntax;
2142 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2143 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2144 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145 return NULL;
2146 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2147 if (tree != NULL)
2148 root = create_tree (dfa, tree, eor, CONCAT);
2149 else
2150 root = eor;
2151 if (BE (eor == NULL || root == NULL, 0))
2152 {
2153 *err = REG_ESPACE;
2154 return NULL;
2155 }
2156 return root;
2157 }
2158
2159 /* This function build the following tree, from regular expression
2160 <branch1>|<branch2>:
2161 ALT
2162 / \
2163 / \
2164 <branch1> <branch2>
2165
2166 ALT means alternative, which represents the operator '|'. */
2167
2168 static bin_tree_t *
2169 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2170 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2171 {
2172 re_dfa_t *dfa = preg->buffer;
2173 bin_tree_t *tree, *branch = NULL;
2174 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2175 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2176 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2177 return NULL;
2178
2179 while (token->type == OP_ALT)
2180 {
2181 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2182 if (token->type != OP_ALT && token->type != END_OF_RE
2183 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2184 {
2185 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2186 dfa->completed_bkref_map = initial_bkref_map;
2187 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2189 {
2190 if (tree != NULL)
2191 postorder (tree, free_tree, NULL);
2192 return NULL;
2193 }
2194 dfa->completed_bkref_map |= accumulated_bkref_map;
2195 }
2196 else
2197 branch = NULL;
2198 tree = create_tree (dfa, tree, branch, OP_ALT);
2199 if (BE (tree == NULL, 0))
2200 {
2201 *err = REG_ESPACE;
2202 return NULL;
2203 }
2204 }
2205 return tree;
2206 }
2207
2208 /* This function build the following tree, from regular expression
2209 <exp1><exp2>:
2210 CAT
2211 / \
2212 / \
2213 <exp1> <exp2>
2214
2215 CAT means concatenation. */
2216
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221 bin_tree_t *tree, *expr;
2222 re_dfa_t *dfa = preg->buffer;
2223 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2225 return NULL;
2226
2227 while (token->type != OP_ALT && token->type != END_OF_RE
2228 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2229 {
2230 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2232 {
2233 if (tree != NULL)
2234 postorder (tree, free_tree, NULL);
2235 return NULL;
2236 }
2237 if (tree != NULL && expr != NULL)
2238 {
2239 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240 if (newtree == NULL)
2241 {
2242 postorder (expr, free_tree, NULL);
2243 postorder (tree, free_tree, NULL);
2244 *err = REG_ESPACE;
2245 return NULL;
2246 }
2247 tree = newtree;
2248 }
2249 else if (tree == NULL)
2250 tree = expr;
2251 /* Otherwise expr == NULL, we don't need to create new tree. */
2252 }
2253 return tree;
2254 }
2255
2256 /* This function build the following tree, from regular expression a*:
2257 *
2258 |
2259 a
2260 */
2261
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2265 {
2266 re_dfa_t *dfa = preg->buffer;
2267 bin_tree_t *tree;
2268 switch (token->type)
2269 {
2270 case CHARACTER:
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (BE (tree == NULL, 0))
2273 {
2274 *err = REG_ESPACE;
2275 return NULL;
2276 }
2277 #ifdef RE_ENABLE_I18N
2278 if (dfa->mb_cur_max > 1)
2279 {
2280 while (!re_string_eoi (regexp)
2281 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2282 {
2283 bin_tree_t *mbc_remain;
2284 fetch_token (token, regexp, syntax);
2285 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287 if (BE (mbc_remain == NULL || tree == NULL, 0))
2288 {
2289 *err = REG_ESPACE;
2290 return NULL;
2291 }
2292 }
2293 }
2294 #endif
2295 break;
2296
2297 case OP_OPEN_SUBEXP:
2298 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2299 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 return NULL;
2301 break;
2302
2303 case OP_OPEN_BRACKET:
2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306 return NULL;
2307 break;
2308
2309 case OP_BACK_REF:
2310 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311 {
2312 *err = REG_ESUBREG;
2313 return NULL;
2314 }
2315 dfa->used_bkref_map |= 1 << token->opr.idx;
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 ++dfa->nbackref;
2323 dfa->has_mb_node = 1;
2324 break;
2325
2326 case OP_OPEN_DUP_NUM:
2327 if (syntax & RE_CONTEXT_INVALID_DUP)
2328 {
2329 *err = REG_BADRPT;
2330 return NULL;
2331 }
2332 FALLTHROUGH;
2333 case OP_DUP_ASTERISK:
2334 case OP_DUP_PLUS:
2335 case OP_DUP_QUESTION:
2336 if (syntax & RE_CONTEXT_INVALID_OPS)
2337 {
2338 *err = REG_BADRPT;
2339 return NULL;
2340 }
2341 else if (syntax & RE_CONTEXT_INDEP_OPS)
2342 {
2343 fetch_token (token, regexp, syntax);
2344 return parse_expression (regexp, preg, token, syntax, nest, err);
2345 }
2346 FALLTHROUGH;
2347 case OP_CLOSE_SUBEXP:
2348 if ((token->type == OP_CLOSE_SUBEXP) &&
2349 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350 {
2351 *err = REG_ERPAREN;
2352 return NULL;
2353 }
2354 FALLTHROUGH;
2355 case OP_CLOSE_DUP_NUM:
2356 /* We treat it as a normal character. */
2357
2358 /* Then we can these characters as normal characters. */
2359 token->type = CHARACTER;
2360 /* mb_partial and word_char bits should be initialized already
2361 by peek_token. */
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2364 {
2365 *err = REG_ESPACE;
2366 return NULL;
2367 }
2368 break;
2369
2370 case ANCHOR:
2371 if ((token->opr.ctx_type
2372 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2373 && dfa->word_ops_used == 0)
2374 init_word_char (dfa);
2375 if (token->opr.ctx_type == WORD_DELIM
2376 || token->opr.ctx_type == NOT_WORD_DELIM)
2377 {
2378 bin_tree_t *tree_first, *tree_last;
2379 if (token->opr.ctx_type == WORD_DELIM)
2380 {
2381 token->opr.ctx_type = WORD_FIRST;
2382 tree_first = create_token_tree (dfa, NULL, NULL, token);
2383 token->opr.ctx_type = WORD_LAST;
2384 }
2385 else
2386 {
2387 token->opr.ctx_type = INSIDE_WORD;
2388 tree_first = create_token_tree (dfa, NULL, NULL, token);
2389 token->opr.ctx_type = INSIDE_NOTWORD;
2390 }
2391 tree_last = create_token_tree (dfa, NULL, NULL, token);
2392 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2394 {
2395 *err = REG_ESPACE;
2396 return NULL;
2397 }
2398 }
2399 else
2400 {
2401 tree = create_token_tree (dfa, NULL, NULL, token);
2402 if (BE (tree == NULL, 0))
2403 {
2404 *err = REG_ESPACE;
2405 return NULL;
2406 }
2407 }
2408 /* We must return here, since ANCHORs can't be followed
2409 by repetition operators.
2410 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2411 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2412 fetch_token (token, regexp, syntax);
2413 return tree;
2414
2415 case OP_PERIOD:
2416 tree = create_token_tree (dfa, NULL, NULL, token);
2417 if (BE (tree == NULL, 0))
2418 {
2419 *err = REG_ESPACE;
2420 return NULL;
2421 }
2422 if (dfa->mb_cur_max > 1)
2423 dfa->has_mb_node = 1;
2424 break;
2425
2426 case OP_WORD:
2427 case OP_NOTWORD:
2428 tree = build_charclass_op (dfa, regexp->trans,
2429 "alnum",
2430 "_",
2431 token->type == OP_NOTWORD, err);
2432 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2433 return NULL;
2434 break;
2435
2436 case OP_SPACE:
2437 case OP_NOTSPACE:
2438 tree = build_charclass_op (dfa, regexp->trans,
2439 "space",
2440 "",
2441 token->type == OP_NOTSPACE, err);
2442 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443 return NULL;
2444 break;
2445
2446 case OP_ALT:
2447 case END_OF_RE:
2448 return NULL;
2449
2450 case BACK_SLASH:
2451 *err = REG_EESCAPE;
2452 return NULL;
2453
2454 default:
2455 /* Must not happen? */
2456 #ifdef DEBUG
2457 assert (0);
2458 #endif
2459 return NULL;
2460 }
2461 fetch_token (token, regexp, syntax);
2462
2463 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2464 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2465 {
2466 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2467 syntax, err);
2468 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2469 {
2470 if (tree != NULL)
2471 postorder (tree, free_tree, NULL);
2472 return NULL;
2473 }
2474 tree = dup_tree;
2475 /* In BRE consecutive duplications are not allowed. */
2476 if ((syntax & RE_CONTEXT_INVALID_DUP)
2477 && (token->type == OP_DUP_ASTERISK
2478 || token->type == OP_OPEN_DUP_NUM))
2479 {
2480 if (tree != NULL)
2481 postorder (tree, free_tree, NULL);
2482 *err = REG_BADRPT;
2483 return NULL;
2484 }
2485 }
2486
2487 return tree;
2488 }
2489
2490 /* This function build the following tree, from regular expression
2491 (<reg_exp>):
2492 SUBEXP
2493 |
2494 <reg_exp>
2495 */
2496
2497 static bin_tree_t *
2498 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2499 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2500 {
2501 re_dfa_t *dfa = preg->buffer;
2502 bin_tree_t *tree;
2503 size_t cur_nsub;
2504 cur_nsub = preg->re_nsub++;
2505
2506 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2507
2508 /* The subexpression may be a null string. */
2509 if (token->type == OP_CLOSE_SUBEXP)
2510 tree = NULL;
2511 else
2512 {
2513 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2514 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2515 {
2516 if (tree != NULL)
2517 postorder (tree, free_tree, NULL);
2518 *err = REG_EPAREN;
2519 }
2520 if (BE (*err != REG_NOERROR, 0))
2521 return NULL;
2522 }
2523
2524 if (cur_nsub <= '9' - '1')
2525 dfa->completed_bkref_map |= 1 << cur_nsub;
2526
2527 tree = create_tree (dfa, tree, NULL, SUBEXP);
2528 if (BE (tree == NULL, 0))
2529 {
2530 *err = REG_ESPACE;
2531 return NULL;
2532 }
2533 tree->token.opr.idx = cur_nsub;
2534 return tree;
2535 }
2536
2537 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2538
2539 static bin_tree_t *
2540 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2541 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2542 {
2543 bin_tree_t *tree = NULL, *old_tree = NULL;
2544 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2545 re_token_t start_token = *token;
2546
2547 if (token->type == OP_OPEN_DUP_NUM)
2548 {
2549 end = 0;
2550 start = fetch_number (regexp, token, syntax);
2551 if (start == -1)
2552 {
2553 if (token->type == CHARACTER && token->opr.c == ',')
2554 start = 0; /* We treat "{,m}" as "{0,m}". */
2555 else
2556 {
2557 *err = REG_BADBR; /* <re>{} is invalid. */
2558 return NULL;
2559 }
2560 }
2561 if (BE (start != -2, 1))
2562 {
2563 /* We treat "{n}" as "{n,n}". */
2564 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2565 : ((token->type == CHARACTER && token->opr.c == ',')
2566 ? fetch_number (regexp, token, syntax) : -2));
2567 }
2568 if (BE (start == -2 || end == -2, 0))
2569 {
2570 /* Invalid sequence. */
2571 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2572 {
2573 if (token->type == END_OF_RE)
2574 *err = REG_EBRACE;
2575 else
2576 *err = REG_BADBR;
2577
2578 return NULL;
2579 }
2580
2581 /* If the syntax bit is set, rollback. */
2582 re_string_set_index (regexp, start_idx);
2583 *token = start_token;
2584 token->type = CHARACTER;
2585 /* mb_partial and word_char bits should be already initialized by
2586 peek_token. */
2587 return elem;
2588 }
2589
2590 if (BE ((end != -1 && start > end)
2591 || token->type != OP_CLOSE_DUP_NUM, 0))
2592 {
2593 /* First number greater than second. */
2594 *err = REG_BADBR;
2595 return NULL;
2596 }
2597
2598 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2599 {
2600 *err = REG_ESIZE;
2601 return NULL;
2602 }
2603 }
2604 else
2605 {
2606 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2607 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2608 }
2609
2610 fetch_token (token, regexp, syntax);
2611
2612 if (BE (elem == NULL, 0))
2613 return NULL;
2614 if (BE (start == 0 && end == 0, 0))
2615 {
2616 postorder (elem, free_tree, NULL);
2617 return NULL;
2618 }
2619
2620 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2621 if (BE (start > 0, 0))
2622 {
2623 tree = elem;
2624 for (i = 2; i <= start; ++i)
2625 {
2626 elem = duplicate_tree (elem, dfa);
2627 tree = create_tree (dfa, tree, elem, CONCAT);
2628 if (BE (elem == NULL || tree == NULL, 0))
2629 goto parse_dup_op_espace;
2630 }
2631
2632 if (start == end)
2633 return tree;
2634
2635 /* Duplicate ELEM before it is marked optional. */
2636 elem = duplicate_tree (elem, dfa);
2637 if (BE (elem == NULL, 0))
2638 goto parse_dup_op_espace;
2639 old_tree = tree;
2640 }
2641 else
2642 old_tree = NULL;
2643
2644 if (elem->token.type == SUBEXP)
2645 {
2646 uintptr_t subidx = elem->token.opr.idx;
2647 postorder (elem, mark_opt_subexp, (void *) subidx);
2648 }
2649
2650 tree = create_tree (dfa, elem, NULL,
2651 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2652 if (BE (tree == NULL, 0))
2653 goto parse_dup_op_espace;
2654
2655 /* This loop is actually executed only when end != -1,
2656 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2657 already created the start+1-th copy. */
2658 if (TYPE_SIGNED (Idx) || end != -1)
2659 for (i = start + 2; i <= end; ++i)
2660 {
2661 elem = duplicate_tree (elem, dfa);
2662 tree = create_tree (dfa, tree, elem, CONCAT);
2663 if (BE (elem == NULL || tree == NULL, 0))
2664 goto parse_dup_op_espace;
2665
2666 tree = create_tree (dfa, tree, NULL, OP_ALT);
2667 if (BE (tree == NULL, 0))
2668 goto parse_dup_op_espace;
2669 }
2670
2671 if (old_tree)
2672 tree = create_tree (dfa, old_tree, tree, CONCAT);
2673
2674 return tree;
2675
2676 parse_dup_op_espace:
2677 *err = REG_ESPACE;
2678 return NULL;
2679 }
2680
2681 /* Size of the names for collating symbol/equivalence_class/character_class.
2682 I'm not sure, but maybe enough. */
2683 #define BRACKET_NAME_BUF_SIZE 32
2684
2685 #ifndef _LIBC
2686
2687 # ifdef RE_ENABLE_I18N
2688 /* Convert the byte B to the corresponding wide character. In a
2689 unibyte locale, treat B as itself if it is an encoding error.
2690 In a multibyte locale, return WEOF if B is an encoding error. */
2691 static wint_t
2692 parse_byte (unsigned char b, re_charset_t *mbcset)
2693 {
2694 wint_t wc = __btowc (b);
2695 return wc == WEOF && !mbcset ? b : wc;
2696 }
2697 #endif
2698
2699 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700 Build the range expression which starts from START_ELEM, and ends
2701 at END_ELEM. The result are written to MBCSET and SBCSET.
2702 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703 mbcset->range_ends, is a pointer argument since we may
2704 update it. */
2705
2706 static reg_errcode_t
2707 # ifdef RE_ENABLE_I18N
2708 build_range_exp (const reg_syntax_t syntax,
2709 bitset_t sbcset,
2710 re_charset_t *mbcset,
2711 Idx *range_alloc,
2712 const bracket_elem_t *start_elem,
2713 const bracket_elem_t *end_elem)
2714 # else /* not RE_ENABLE_I18N */
2715 build_range_exp (const reg_syntax_t syntax,
2716 bitset_t sbcset,
2717 const bracket_elem_t *start_elem,
2718 const bracket_elem_t *end_elem)
2719 # endif /* not RE_ENABLE_I18N */
2720 {
2721 unsigned int start_ch, end_ch;
2722 /* Equivalence Classes and Character Classes can't be a range start/end. */
2723 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2724 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2725 0))
2726 return REG_ERANGE;
2727
2728 /* We can handle no multi character collating elements without libc
2729 support. */
2730 if (BE ((start_elem->type == COLL_SYM
2731 && strlen ((char *) start_elem->opr.name) > 1)
2732 || (end_elem->type == COLL_SYM
2733 && strlen ((char *) end_elem->opr.name) > 1), 0))
2734 return REG_ECOLLATE;
2735
2736 # ifdef RE_ENABLE_I18N
2737 {
2738 wchar_t wc;
2739 wint_t start_wc;
2740 wint_t end_wc;
2741
2742 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2743 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2744 : 0));
2745 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2746 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2747 : 0));
2748 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2749 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2750 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2751 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2752 if (start_wc == WEOF || end_wc == WEOF)
2753 return REG_ECOLLATE;
2754 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2755 return REG_ERANGE;
2756
2757 /* Got valid collation sequence values, add them as a new entry.
2758 However, for !_LIBC we have no collation elements: if the
2759 character set is single byte, the single byte character set
2760 that we build below suffices. parse_bracket_exp passes
2761 no MBCSET if dfa->mb_cur_max == 1. */
2762 if (mbcset)
2763 {
2764 /* Check the space of the arrays. */
2765 if (BE (*range_alloc == mbcset->nranges, 0))
2766 {
2767 /* There is not enough space, need realloc. */
2768 wchar_t *new_array_start, *new_array_end;
2769 Idx new_nranges;
2770
2771 /* +1 in case of mbcset->nranges is 0. */
2772 new_nranges = 2 * mbcset->nranges + 1;
2773 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2774 are NULL if *range_alloc == 0. */
2775 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2776 new_nranges);
2777 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2778 new_nranges);
2779
2780 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2781 {
2782 re_free (new_array_start);
2783 re_free (new_array_end);
2784 return REG_ESPACE;
2785 }
2786
2787 mbcset->range_starts = new_array_start;
2788 mbcset->range_ends = new_array_end;
2789 *range_alloc = new_nranges;
2790 }
2791
2792 mbcset->range_starts[mbcset->nranges] = start_wc;
2793 mbcset->range_ends[mbcset->nranges++] = end_wc;
2794 }
2795
2796 /* Build the table for single byte characters. */
2797 for (wc = 0; wc < SBC_MAX; ++wc)
2798 {
2799 if (start_wc <= wc && wc <= end_wc)
2800 bitset_set (sbcset, wc);
2801 }
2802 }
2803 # else /* not RE_ENABLE_I18N */
2804 {
2805 unsigned int ch;
2806 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2807 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2808 : 0));
2809 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2810 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2811 : 0));
2812 if (start_ch > end_ch)
2813 return REG_ERANGE;
2814 /* Build the table for single byte characters. */
2815 for (ch = 0; ch < SBC_MAX; ++ch)
2816 if (start_ch <= ch && ch <= end_ch)
2817 bitset_set (sbcset, ch);
2818 }
2819 # endif /* not RE_ENABLE_I18N */
2820 return REG_NOERROR;
2821 }
2822 #endif /* not _LIBC */
2823
2824 #ifndef _LIBC
2825 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2826 Build the collating element which is represented by NAME.
2827 The result are written to MBCSET and SBCSET.
2828 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2829 pointer argument since we may update it. */
2830
2831 static reg_errcode_t
2832 # ifdef RE_ENABLE_I18N
2833 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2834 Idx *coll_sym_alloc, const unsigned char *name)
2835 # else /* not RE_ENABLE_I18N */
2836 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2837 # endif /* not RE_ENABLE_I18N */
2838 {
2839 size_t name_len = strlen ((const char *) name);
2840 if (BE (name_len != 1, 0))
2841 return REG_ECOLLATE;
2842 else
2843 {
2844 bitset_set (sbcset, name[0]);
2845 return REG_NOERROR;
2846 }
2847 }
2848 #endif /* not _LIBC */
2849
2850 /* This function parse bracket expression like "[abc]", "[a-c]",
2851 "[[.a-a.]]" etc. */
2852
2853 static bin_tree_t *
2854 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2855 reg_syntax_t syntax, reg_errcode_t *err)
2856 {
2857 #ifdef _LIBC
2858 const unsigned char *collseqmb;
2859 const char *collseqwc;
2860 uint32_t nrules;
2861 int32_t table_size;
2862 const int32_t *symb_table;
2863 const unsigned char *extra;
2864
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866 Seek the collating symbol entry corresponding to NAME.
2867 Return the index of the symbol in the SYMB_TABLE,
2868 or -1 if not found. */
2869
2870 auto inline int32_t
2871 __attribute__ ((always_inline))
2872 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2873 {
2874 int32_t elem;
2875
2876 for (elem = 0; elem < table_size; elem++)
2877 if (symb_table[2 * elem] != 0)
2878 {
2879 int32_t idx = symb_table[2 * elem + 1];
2880 /* Skip the name of collating element name. */
2881 idx += 1 + extra[idx];
2882 if (/* Compare the length of the name. */
2883 name_len == extra[idx]
2884 /* Compare the name. */
2885 && memcmp (name, &extra[idx + 1], name_len) == 0)
2886 /* Yep, this is the entry. */
2887 return elem;
2888 }
2889 return -1;
2890 }
2891
2892 /* Local function for parse_bracket_exp used in _LIBC environment.
2893 Look up the collation sequence value of BR_ELEM.
2894 Return the value if succeeded, UINT_MAX otherwise. */
2895
2896 auto inline unsigned int
2897 __attribute__ ((always_inline))
2898 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2899 {
2900 if (br_elem->type == SB_CHAR)
2901 {
2902 /*
2903 if (MB_CUR_MAX == 1)
2904 */
2905 if (nrules == 0)
2906 return collseqmb[br_elem->opr.ch];
2907 else
2908 {
2909 wint_t wc = __btowc (br_elem->opr.ch);
2910 return __collseq_table_lookup (collseqwc, wc);
2911 }
2912 }
2913 else if (br_elem->type == MB_CHAR)
2914 {
2915 if (nrules != 0)
2916 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2917 }
2918 else if (br_elem->type == COLL_SYM)
2919 {
2920 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2921 if (nrules != 0)
2922 {
2923 int32_t elem, idx;
2924 elem = seek_collating_symbol_entry (br_elem->opr.name,
2925 sym_name_len);
2926 if (elem != -1)
2927 {
2928 /* We found the entry. */
2929 idx = symb_table[2 * elem + 1];
2930 /* Skip the name of collating element name. */
2931 idx += 1 + extra[idx];
2932 /* Skip the byte sequence of the collating element. */
2933 idx += 1 + extra[idx];
2934 /* Adjust for the alignment. */
2935 idx = (idx + 3) & ~3;
2936 /* Skip the multibyte collation sequence value. */
2937 idx += sizeof (unsigned int);
2938 /* Skip the wide char sequence of the collating element. */
2939 idx += sizeof (unsigned int) *
2940 (1 + *(unsigned int *) (extra + idx));
2941 /* Return the collation sequence value. */
2942 return *(unsigned int *) (extra + idx);
2943 }
2944 else if (sym_name_len == 1)
2945 {
2946 /* No valid character. Match it as a single byte
2947 character. */
2948 return collseqmb[br_elem->opr.name[0]];
2949 }
2950 }
2951 else if (sym_name_len == 1)
2952 return collseqmb[br_elem->opr.name[0]];
2953 }
2954 return UINT_MAX;
2955 }
2956
2957 /* Local function for parse_bracket_exp used in _LIBC environment.
2958 Build the range expression which starts from START_ELEM, and ends
2959 at END_ELEM. The result are written to MBCSET and SBCSET.
2960 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2961 mbcset->range_ends, is a pointer argument since we may
2962 update it. */
2963
2964 auto inline reg_errcode_t
2965 __attribute__ ((always_inline))
2966 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2967 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2968 {
2969 unsigned int ch;
2970 uint32_t start_collseq;
2971 uint32_t end_collseq;
2972
2973 /* Equivalence Classes and Character Classes can't be a range
2974 start/end. */
2975 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2976 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2977 0))
2978 return REG_ERANGE;
2979
2980 /* FIXME: Implement rational ranges here, too. */
2981 start_collseq = lookup_collation_sequence_value (start_elem);
2982 end_collseq = lookup_collation_sequence_value (end_elem);
2983 /* Check start/end collation sequence values. */
2984 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2985 return REG_ECOLLATE;
2986 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2987 return REG_ERANGE;
2988
2989 /* Got valid collation sequence values, add them as a new entry.
2990 However, if we have no collation elements, and the character set
2991 is single byte, the single byte character set that we
2992 build below suffices. */
2993 if (nrules > 0 || dfa->mb_cur_max > 1)
2994 {
2995 /* Check the space of the arrays. */
2996 if (BE (*range_alloc == mbcset->nranges, 0))
2997 {
2998 /* There is not enough space, need realloc. */
2999 uint32_t *new_array_start;
3000 uint32_t *new_array_end;
3001 Idx new_nranges;
3002
3003 /* +1 in case of mbcset->nranges is 0. */
3004 new_nranges = 2 * mbcset->nranges + 1;
3005 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3006 new_nranges);
3007 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3008 new_nranges);
3009
3010 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3011 return REG_ESPACE;
3012
3013 mbcset->range_starts = new_array_start;
3014 mbcset->range_ends = new_array_end;
3015 *range_alloc = new_nranges;
3016 }
3017
3018 mbcset->range_starts[mbcset->nranges] = start_collseq;
3019 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3020 }
3021
3022 /* Build the table for single byte characters. */
3023 for (ch = 0; ch < SBC_MAX; ch++)
3024 {
3025 uint32_t ch_collseq;
3026 /*
3027 if (MB_CUR_MAX == 1)
3028 */
3029 if (nrules == 0)
3030 ch_collseq = collseqmb[ch];
3031 else
3032 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3033 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3034 bitset_set (sbcset, ch);
3035 }
3036 return REG_NOERROR;
3037 }
3038
3039 /* Local function for parse_bracket_exp used in _LIBC environment.
3040 Build the collating element which is represented by NAME.
3041 The result are written to MBCSET and SBCSET.
3042 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3043 pointer argument since we may update it. */
3044
3045 auto inline reg_errcode_t
3046 __attribute__ ((always_inline))
3047 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3048 Idx *coll_sym_alloc, const unsigned char *name)
3049 {
3050 int32_t elem, idx;
3051 size_t name_len = strlen ((const char *) name);
3052 if (nrules != 0)
3053 {
3054 elem = seek_collating_symbol_entry (name, name_len);
3055 if (elem != -1)
3056 {
3057 /* We found the entry. */
3058 idx = symb_table[2 * elem + 1];
3059 /* Skip the name of collating element name. */
3060 idx += 1 + extra[idx];
3061 }
3062 else if (name_len == 1)
3063 {
3064 /* No valid character, treat it as a normal
3065 character. */
3066 bitset_set (sbcset, name[0]);
3067 return REG_NOERROR;
3068 }
3069 else
3070 return REG_ECOLLATE;
3071
3072 /* Got valid collation sequence, add it as a new entry. */
3073 /* Check the space of the arrays. */
3074 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3075 {
3076 /* Not enough, realloc it. */
3077 /* +1 in case of mbcset->ncoll_syms is 0. */
3078 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3079 /* Use realloc since mbcset->coll_syms is NULL
3080 if *alloc == 0. */
3081 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3082 new_coll_sym_alloc);
3083 if (BE (new_coll_syms == NULL, 0))
3084 return REG_ESPACE;
3085 mbcset->coll_syms = new_coll_syms;
3086 *coll_sym_alloc = new_coll_sym_alloc;
3087 }
3088 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3089 return REG_NOERROR;
3090 }
3091 else
3092 {
3093 if (BE (name_len != 1, 0))
3094 return REG_ECOLLATE;
3095 else
3096 {
3097 bitset_set (sbcset, name[0]);
3098 return REG_NOERROR;
3099 }
3100 }
3101 }
3102 #endif
3103
3104 re_token_t br_token;
3105 re_bitset_ptr_t sbcset;
3106 #ifdef RE_ENABLE_I18N
3107 re_charset_t *mbcset;
3108 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3109 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3110 #endif /* not RE_ENABLE_I18N */
3111 bool non_match = false;
3112 bin_tree_t *work_tree;
3113 int token_len;
3114 bool first_round = true;
3115 #ifdef _LIBC
3116 collseqmb = (const unsigned char *)
3117 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3118 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3119 if (nrules)
3120 {
3121 /*
3122 if (MB_CUR_MAX > 1)
3123 */
3124 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3125 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3126 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3127 _NL_COLLATE_SYMB_TABLEMB);
3128 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3129 _NL_COLLATE_SYMB_EXTRAMB);
3130 }
3131 #endif
3132 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3133 #ifdef RE_ENABLE_I18N
3134 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3135 #endif /* RE_ENABLE_I18N */
3136 #ifdef RE_ENABLE_I18N
3137 if (BE (sbcset == NULL || mbcset == NULL, 0))
3138 #else
3139 if (BE (sbcset == NULL, 0))
3140 #endif /* RE_ENABLE_I18N */
3141 {
3142 re_free (sbcset);
3143 #ifdef RE_ENABLE_I18N
3144 re_free (mbcset);
3145 #endif
3146 *err = REG_ESPACE;
3147 return NULL;
3148 }
3149
3150 token_len = peek_token_bracket (token, regexp, syntax);
3151 if (BE (token->type == END_OF_RE, 0))
3152 {
3153 *err = REG_BADPAT;
3154 goto parse_bracket_exp_free_return;
3155 }
3156 if (token->type == OP_NON_MATCH_LIST)
3157 {
3158 #ifdef RE_ENABLE_I18N
3159 mbcset->non_match = 1;
3160 #endif /* not RE_ENABLE_I18N */
3161 non_match = true;
3162 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3163 bitset_set (sbcset, '\n');
3164 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3165 token_len = peek_token_bracket (token, regexp, syntax);
3166 if (BE (token->type == END_OF_RE, 0))
3167 {
3168 *err = REG_BADPAT;
3169 goto parse_bracket_exp_free_return;
3170 }
3171 }
3172
3173 /* We treat the first ']' as a normal character. */
3174 if (token->type == OP_CLOSE_BRACKET)
3175 token->type = CHARACTER;
3176
3177 while (1)
3178 {
3179 bracket_elem_t start_elem, end_elem;
3180 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3181 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3182 reg_errcode_t ret;
3183 int token_len2 = 0;
3184 bool is_range_exp = false;
3185 re_token_t token2;
3186
3187 start_elem.opr.name = start_name_buf;
3188 start_elem.type = COLL_SYM;
3189 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3190 syntax, first_round);
3191 if (BE (ret != REG_NOERROR, 0))
3192 {
3193 *err = ret;
3194 goto parse_bracket_exp_free_return;
3195 }
3196 first_round = false;
3197
3198 /* Get information about the next token. We need it in any case. */
3199 token_len = peek_token_bracket (token, regexp, syntax);
3200
3201 /* Do not check for ranges if we know they are not allowed. */
3202 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3203 {
3204 if (BE (token->type == END_OF_RE, 0))
3205 {
3206 *err = REG_EBRACK;
3207 goto parse_bracket_exp_free_return;
3208 }
3209 if (token->type == OP_CHARSET_RANGE)
3210 {
3211 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3212 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3213 if (BE (token2.type == END_OF_RE, 0))
3214 {
3215 *err = REG_EBRACK;
3216 goto parse_bracket_exp_free_return;
3217 }
3218 if (token2.type == OP_CLOSE_BRACKET)
3219 {
3220 /* We treat the last '-' as a normal character. */
3221 re_string_skip_bytes (regexp, -token_len);
3222 token->type = CHARACTER;
3223 }
3224 else
3225 is_range_exp = true;
3226 }
3227 }
3228
3229 if (is_range_exp == true)
3230 {
3231 end_elem.opr.name = end_name_buf;
3232 end_elem.type = COLL_SYM;
3233 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3234 dfa, syntax, true);
3235 if (BE (ret != REG_NOERROR, 0))
3236 {
3237 *err = ret;
3238 goto parse_bracket_exp_free_return;
3239 }
3240
3241 token_len = peek_token_bracket (token, regexp, syntax);
3242
3243 #ifdef _LIBC
3244 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3245 &start_elem, &end_elem);
3246 #else
3247 # ifdef RE_ENABLE_I18N
3248 *err = build_range_exp (syntax, sbcset,
3249 dfa->mb_cur_max > 1 ? mbcset : NULL,
3250 &range_alloc, &start_elem, &end_elem);
3251 # else
3252 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3253 # endif
3254 #endif /* RE_ENABLE_I18N */
3255 if (BE (*err != REG_NOERROR, 0))
3256 goto parse_bracket_exp_free_return;
3257 }
3258 else
3259 {
3260 switch (start_elem.type)
3261 {
3262 case SB_CHAR:
3263 bitset_set (sbcset, start_elem.opr.ch);
3264 break;
3265 #ifdef RE_ENABLE_I18N
3266 case MB_CHAR:
3267 /* Check whether the array has enough space. */
3268 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3269 {
3270 wchar_t *new_mbchars;
3271 /* Not enough, realloc it. */
3272 /* +1 in case of mbcset->nmbchars is 0. */
3273 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3274 /* Use realloc since array is NULL if *alloc == 0. */
3275 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3276 mbchar_alloc);
3277 if (BE (new_mbchars == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279 mbcset->mbchars = new_mbchars;
3280 }
3281 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3282 break;
3283 #endif /* RE_ENABLE_I18N */
3284 case EQUIV_CLASS:
3285 *err = build_equiv_class (sbcset,
3286 #ifdef RE_ENABLE_I18N
3287 mbcset, &equiv_class_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289 start_elem.opr.name);
3290 if (BE (*err != REG_NOERROR, 0))
3291 goto parse_bracket_exp_free_return;
3292 break;
3293 case COLL_SYM:
3294 *err = build_collating_symbol (sbcset,
3295 #ifdef RE_ENABLE_I18N
3296 mbcset, &coll_sym_alloc,
3297 #endif /* RE_ENABLE_I18N */
3298 start_elem.opr.name);
3299 if (BE (*err != REG_NOERROR, 0))
3300 goto parse_bracket_exp_free_return;
3301 break;
3302 case CHAR_CLASS:
3303 *err = build_charclass (regexp->trans, sbcset,
3304 #ifdef RE_ENABLE_I18N
3305 mbcset, &char_class_alloc,
3306 #endif /* RE_ENABLE_I18N */
3307 (const char *) start_elem.opr.name,
3308 syntax);
3309 if (BE (*err != REG_NOERROR, 0))
3310 goto parse_bracket_exp_free_return;
3311 break;
3312 default:
3313 assert (0);
3314 break;
3315 }
3316 }
3317 if (BE (token->type == END_OF_RE, 0))
3318 {
3319 *err = REG_EBRACK;
3320 goto parse_bracket_exp_free_return;
3321 }
3322 if (token->type == OP_CLOSE_BRACKET)
3323 break;
3324 }
3325
3326 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3327
3328 /* If it is non-matching list. */
3329 if (non_match)
3330 bitset_not (sbcset);
3331
3332 #ifdef RE_ENABLE_I18N
3333 /* Ensure only single byte characters are set. */
3334 if (dfa->mb_cur_max > 1)
3335 bitset_mask (sbcset, dfa->sb_char);
3336
3337 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3338 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3339 || mbcset->non_match)))
3340 {
3341 bin_tree_t *mbc_tree;
3342 int sbc_idx;
3343 /* Build a tree for complex bracket. */
3344 dfa->has_mb_node = 1;
3345 br_token.type = COMPLEX_BRACKET;
3346 br_token.opr.mbcset = mbcset;
3347 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3348 if (BE (mbc_tree == NULL, 0))
3349 goto parse_bracket_exp_espace;
3350 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3351 if (sbcset[sbc_idx])
3352 break;
3353 /* If there are no bits set in sbcset, there is no point
3354 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3355 if (sbc_idx < BITSET_WORDS)
3356 {
3357 /* Build a tree for simple bracket. */
3358 br_token.type = SIMPLE_BRACKET;
3359 br_token.opr.sbcset = sbcset;
3360 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361 if (BE (work_tree == NULL, 0))
3362 goto parse_bracket_exp_espace;
3363
3364 /* Then join them by ALT node. */
3365 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3366 if (BE (work_tree == NULL, 0))
3367 goto parse_bracket_exp_espace;
3368 }
3369 else
3370 {
3371 re_free (sbcset);
3372 work_tree = mbc_tree;
3373 }
3374 }
3375 else
3376 #endif /* not RE_ENABLE_I18N */
3377 {
3378 #ifdef RE_ENABLE_I18N
3379 free_charset (mbcset);
3380 #endif
3381 /* Build a tree for simple bracket. */
3382 br_token.type = SIMPLE_BRACKET;
3383 br_token.opr.sbcset = sbcset;
3384 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3385 if (BE (work_tree == NULL, 0))
3386 goto parse_bracket_exp_espace;
3387 }
3388 return work_tree;
3389
3390 parse_bracket_exp_espace:
3391 *err = REG_ESPACE;
3392 parse_bracket_exp_free_return:
3393 re_free (sbcset);
3394 #ifdef RE_ENABLE_I18N
3395 free_charset (mbcset);
3396 #endif /* RE_ENABLE_I18N */
3397 return NULL;
3398 }
3399
3400 /* Parse an element in the bracket expression. */
3401
3402 static reg_errcode_t
3403 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3404 re_token_t *token, int token_len, re_dfa_t *dfa,
3405 reg_syntax_t syntax, bool accept_hyphen)
3406 {
3407 #ifdef RE_ENABLE_I18N
3408 int cur_char_size;
3409 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3410 if (cur_char_size > 1)
3411 {
3412 elem->type = MB_CHAR;
3413 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3414 re_string_skip_bytes (regexp, cur_char_size);
3415 return REG_NOERROR;
3416 }
3417 #endif /* RE_ENABLE_I18N */
3418 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3419 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3420 || token->type == OP_OPEN_EQUIV_CLASS)
3421 return parse_bracket_symbol (elem, regexp, token);
3422 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3423 {
3424 /* A '-' must only appear as anything but a range indicator before
3425 the closing bracket. Everything else is an error. */
3426 re_token_t token2;
3427 (void) peek_token_bracket (&token2, regexp, syntax);
3428 if (token2.type != OP_CLOSE_BRACKET)
3429 /* The actual error value is not standardized since this whole
3430 case is undefined. But ERANGE makes good sense. */
3431 return REG_ERANGE;
3432 }
3433 elem->type = SB_CHAR;
3434 elem->opr.ch = token->opr.c;
3435 return REG_NOERROR;
3436 }
3437
3438 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3439 such as [:<character_class>:], [.<collating_element>.], and
3440 [=<equivalent_class>=]. */
3441
3442 static reg_errcode_t
3443 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3444 re_token_t *token)
3445 {
3446 unsigned char ch, delim = token->opr.c;
3447 int i = 0;
3448 if (re_string_eoi(regexp))
3449 return REG_EBRACK;
3450 for (;; ++i)
3451 {
3452 if (i >= BRACKET_NAME_BUF_SIZE)
3453 return REG_EBRACK;
3454 if (token->type == OP_OPEN_CHAR_CLASS)
3455 ch = re_string_fetch_byte_case (regexp);
3456 else
3457 ch = re_string_fetch_byte (regexp);
3458 if (re_string_eoi(regexp))
3459 return REG_EBRACK;
3460 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3461 break;
3462 elem->opr.name[i] = ch;
3463 }
3464 re_string_skip_bytes (regexp, 1);
3465 elem->opr.name[i] = '\0';
3466 switch (token->type)
3467 {
3468 case OP_OPEN_COLL_ELEM:
3469 elem->type = COLL_SYM;
3470 break;
3471 case OP_OPEN_EQUIV_CLASS:
3472 elem->type = EQUIV_CLASS;
3473 break;
3474 case OP_OPEN_CHAR_CLASS:
3475 elem->type = CHAR_CLASS;
3476 break;
3477 default:
3478 break;
3479 }
3480 return REG_NOERROR;
3481 }
3482
3483 /* Helper function for parse_bracket_exp.
3484 Build the equivalence class which is represented by NAME.
3485 The result are written to MBCSET and SBCSET.
3486 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3487 is a pointer argument since we may update it. */
3488
3489 static reg_errcode_t
3490 #ifdef RE_ENABLE_I18N
3491 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3492 Idx *equiv_class_alloc, const unsigned char *name)
3493 #else /* not RE_ENABLE_I18N */
3494 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3495 #endif /* not RE_ENABLE_I18N */
3496 {
3497 #ifdef _LIBC
3498 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3499 if (nrules != 0)
3500 {
3501 const int32_t *table, *indirect;
3502 const unsigned char *weights, *extra, *cp;
3503 unsigned char char_buf[2];
3504 int32_t idx1, idx2;
3505 unsigned int ch;
3506 size_t len;
3507 /* Calculate the index for equivalence class. */
3508 cp = name;
3509 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3510 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_WEIGHTMB);
3512 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_EXTRAMB);
3514 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3515 _NL_COLLATE_INDIRECTMB);
3516 idx1 = findidx (table, indirect, extra, &cp, -1);
3517 if (BE (idx1 == 0 || *cp != '\0', 0))
3518 /* This isn't a valid character. */
3519 return REG_ECOLLATE;
3520
3521 /* Build single byte matching table for this equivalence class. */
3522 len = weights[idx1 & 0xffffff];
3523 for (ch = 0; ch < SBC_MAX; ++ch)
3524 {
3525 char_buf[0] = ch;
3526 cp = char_buf;
3527 idx2 = findidx (table, indirect, extra, &cp, 1);
3528 /*
3529 idx2 = table[ch];
3530 */
3531 if (idx2 == 0)
3532 /* This isn't a valid character. */
3533 continue;
3534 /* Compare only if the length matches and the collation rule
3535 index is the same. */
3536 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3537 && memcmp (weights + (idx1 & 0xffffff) + 1,
3538 weights + (idx2 & 0xffffff) + 1, len) == 0)
3539 bitset_set (sbcset, ch);
3540 }
3541 /* Check whether the array has enough space. */
3542 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3543 {
3544 /* Not enough, realloc it. */
3545 /* +1 in case of mbcset->nequiv_classes is 0. */
3546 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3547 /* Use realloc since the array is NULL if *alloc == 0. */
3548 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3549 int32_t,
3550 new_equiv_class_alloc);
3551 if (BE (new_equiv_classes == NULL, 0))
3552 return REG_ESPACE;
3553 mbcset->equiv_classes = new_equiv_classes;
3554 *equiv_class_alloc = new_equiv_class_alloc;
3555 }
3556 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3557 }
3558 else
3559 #endif /* _LIBC */
3560 {
3561 if (BE (strlen ((const char *) name) != 1, 0))
3562 return REG_ECOLLATE;
3563 bitset_set (sbcset, *name);
3564 }
3565 return REG_NOERROR;
3566 }
3567
3568 /* Helper function for parse_bracket_exp.
3569 Build the character class which is represented by NAME.
3570 The result are written to MBCSET and SBCSET.
3571 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3572 is a pointer argument since we may update it. */
3573
3574 static reg_errcode_t
3575 #ifdef RE_ENABLE_I18N
3576 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577 re_charset_t *mbcset, Idx *char_class_alloc,
3578 const char *class_name, reg_syntax_t syntax)
3579 #else /* not RE_ENABLE_I18N */
3580 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3581 const char *class_name, reg_syntax_t syntax)
3582 #endif /* not RE_ENABLE_I18N */
3583 {
3584 int i;
3585 const char *name = class_name;
3586
3587 /* In case of REG_ICASE "upper" and "lower" match the both of
3588 upper and lower cases. */
3589 if ((syntax & RE_ICASE)
3590 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3591 name = "alpha";
3592
3593 #ifdef RE_ENABLE_I18N
3594 /* Check the space of the arrays. */
3595 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3596 {
3597 /* Not enough, realloc it. */
3598 /* +1 in case of mbcset->nchar_classes is 0. */
3599 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3600 /* Use realloc since array is NULL if *alloc == 0. */
3601 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3602 new_char_class_alloc);
3603 if (BE (new_char_classes == NULL, 0))
3604 return REG_ESPACE;
3605 mbcset->char_classes = new_char_classes;
3606 *char_class_alloc = new_char_class_alloc;
3607 }
3608 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3609 #endif /* RE_ENABLE_I18N */
3610
3611 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3612 do { \
3613 if (BE (trans != NULL, 0)) \
3614 { \
3615 for (i = 0; i < SBC_MAX; ++i) \
3616 if (ctype_func (i)) \
3617 bitset_set (sbcset, trans[i]); \
3618 } \
3619 else \
3620 { \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, i); \
3624 } \
3625 } while (0)
3626
3627 if (strcmp (name, "alnum") == 0)
3628 BUILD_CHARCLASS_LOOP (isalnum);
3629 else if (strcmp (name, "cntrl") == 0)
3630 BUILD_CHARCLASS_LOOP (iscntrl);
3631 else if (strcmp (name, "lower") == 0)
3632 BUILD_CHARCLASS_LOOP (islower);
3633 else if (strcmp (name, "space") == 0)
3634 BUILD_CHARCLASS_LOOP (isspace);
3635 else if (strcmp (name, "alpha") == 0)
3636 BUILD_CHARCLASS_LOOP (isalpha);
3637 else if (strcmp (name, "digit") == 0)
3638 BUILD_CHARCLASS_LOOP (isdigit);
3639 else if (strcmp (name, "print") == 0)
3640 BUILD_CHARCLASS_LOOP (isprint);
3641 else if (strcmp (name, "upper") == 0)
3642 BUILD_CHARCLASS_LOOP (isupper);
3643 else if (strcmp (name, "blank") == 0)
3644 BUILD_CHARCLASS_LOOP (isblank);
3645 else if (strcmp (name, "graph") == 0)
3646 BUILD_CHARCLASS_LOOP (isgraph);
3647 else if (strcmp (name, "punct") == 0)
3648 BUILD_CHARCLASS_LOOP (ispunct);
3649 else if (strcmp (name, "xdigit") == 0)
3650 BUILD_CHARCLASS_LOOP (isxdigit);
3651 else
3652 return REG_ECTYPE;
3653
3654 return REG_NOERROR;
3655 }
3656
3657 static bin_tree_t *
3658 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3659 const char *class_name,
3660 const char *extra, bool non_match,
3661 reg_errcode_t *err)
3662 {
3663 re_bitset_ptr_t sbcset;
3664 #ifdef RE_ENABLE_I18N
3665 re_charset_t *mbcset;
3666 Idx alloc = 0;
3667 #endif /* not RE_ENABLE_I18N */
3668 reg_errcode_t ret;
3669 re_token_t br_token;
3670 bin_tree_t *tree;
3671
3672 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3673 if (BE (sbcset == NULL, 0))
3674 {
3675 *err = REG_ESPACE;
3676 return NULL;
3677 }
3678 #ifdef RE_ENABLE_I18N
3679 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3680 if (BE (mbcset == NULL, 0))
3681 {
3682 re_free (sbcset);
3683 *err = REG_ESPACE;
3684 return NULL;
3685 }
3686 mbcset->non_match = non_match;
3687 #endif /* RE_ENABLE_I18N */
3688
3689 /* We don't care the syntax in this case. */
3690 ret = build_charclass (trans, sbcset,
3691 #ifdef RE_ENABLE_I18N
3692 mbcset, &alloc,
3693 #endif /* RE_ENABLE_I18N */
3694 class_name, 0);
3695
3696 if (BE (ret != REG_NOERROR, 0))
3697 {
3698 re_free (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700 free_charset (mbcset);
3701 #endif /* RE_ENABLE_I18N */
3702 *err = ret;
3703 return NULL;
3704 }
3705 /* \w match '_' also. */
3706 for (; *extra; extra++)
3707 bitset_set (sbcset, *extra);
3708
3709 /* If it is non-matching list. */
3710 if (non_match)
3711 bitset_not (sbcset);
3712
3713 #ifdef RE_ENABLE_I18N
3714 /* Ensure only single byte characters are set. */
3715 if (dfa->mb_cur_max > 1)
3716 bitset_mask (sbcset, dfa->sb_char);
3717 #endif
3718
3719 /* Build a tree for simple bracket. */
3720 #if defined GCC_LINT || defined lint
3721 memset (&br_token, 0, sizeof br_token);
3722 #endif
3723 br_token.type = SIMPLE_BRACKET;
3724 br_token.opr.sbcset = sbcset;
3725 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726 if (BE (tree == NULL, 0))
3727 goto build_word_op_espace;
3728
3729 #ifdef RE_ENABLE_I18N
3730 if (dfa->mb_cur_max > 1)
3731 {
3732 bin_tree_t *mbc_tree;
3733 /* Build a tree for complex bracket. */
3734 br_token.type = COMPLEX_BRACKET;
3735 br_token.opr.mbcset = mbcset;
3736 dfa->has_mb_node = 1;
3737 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738 if (BE (mbc_tree == NULL, 0))
3739 goto build_word_op_espace;
3740 /* Then join them by ALT node. */
3741 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3742 if (BE (mbc_tree != NULL, 1))
3743 return tree;
3744 }
3745 else
3746 {
3747 free_charset (mbcset);
3748 return tree;
3749 }
3750 #else /* not RE_ENABLE_I18N */
3751 return tree;
3752 #endif /* not RE_ENABLE_I18N */
3753
3754 build_word_op_espace:
3755 re_free (sbcset);
3756 #ifdef RE_ENABLE_I18N
3757 free_charset (mbcset);
3758 #endif /* RE_ENABLE_I18N */
3759 *err = REG_ESPACE;
3760 return NULL;
3761 }
3762
3763 /* This is intended for the expressions like "a{1,3}".
3764 Fetch a number from 'input', and return the number.
3765 Return -1 if the number field is empty like "{,1}".
3766 Return RE_DUP_MAX + 1 if the number field is too large.
3767 Return -2 if an error occurred. */
3768
3769 static Idx
3770 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3771 {
3772 Idx num = -1;
3773 unsigned char c;
3774 while (1)
3775 {
3776 fetch_token (token, input, syntax);
3777 c = token->opr.c;
3778 if (BE (token->type == END_OF_RE, 0))
3779 return -2;
3780 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3781 break;
3782 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3783 ? -2
3784 : num == -1
3785 ? c - '0'
3786 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3787 }
3788 return num;
3789 }
3790 \f
3791 #ifdef RE_ENABLE_I18N
3792 static void
3793 free_charset (re_charset_t *cset)
3794 {
3795 re_free (cset->mbchars);
3796 # ifdef _LIBC
3797 re_free (cset->coll_syms);
3798 re_free (cset->equiv_classes);
3799 re_free (cset->range_starts);
3800 re_free (cset->range_ends);
3801 # endif
3802 re_free (cset->char_classes);
3803 re_free (cset);
3804 }
3805 #endif /* RE_ENABLE_I18N */
3806 \f
3807 /* Functions for binary tree operation. */
3808
3809 /* Create a tree node. */
3810
3811 static bin_tree_t *
3812 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3813 re_token_type_t type)
3814 {
3815 re_token_t t;
3816 #if defined GCC_LINT || defined lint
3817 memset (&t, 0, sizeof t);
3818 #endif
3819 t.type = type;
3820 return create_token_tree (dfa, left, right, &t);
3821 }
3822
3823 static bin_tree_t *
3824 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3825 const re_token_t *token)
3826 {
3827 bin_tree_t *tree;
3828 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3829 {
3830 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3831
3832 if (storage == NULL)
3833 return NULL;
3834 storage->next = dfa->str_tree_storage;
3835 dfa->str_tree_storage = storage;
3836 dfa->str_tree_storage_idx = 0;
3837 }
3838 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3839
3840 tree->parent = NULL;
3841 tree->left = left;
3842 tree->right = right;
3843 tree->token = *token;
3844 tree->token.duplicated = 0;
3845 tree->token.opt_subexp = 0;
3846 tree->first = NULL;
3847 tree->next = NULL;
3848 tree->node_idx = -1;
3849
3850 if (left != NULL)
3851 left->parent = tree;
3852 if (right != NULL)
3853 right->parent = tree;
3854 return tree;
3855 }
3856
3857 /* Mark the tree SRC as an optional subexpression.
3858 To be called from preorder or postorder. */
3859
3860 static reg_errcode_t
3861 mark_opt_subexp (void *extra, bin_tree_t *node)
3862 {
3863 Idx idx = (uintptr_t) extra;
3864 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3865 node->token.opt_subexp = 1;
3866
3867 return REG_NOERROR;
3868 }
3869
3870 /* Free the allocated memory inside NODE. */
3871
3872 static void
3873 free_token (re_token_t *node)
3874 {
3875 #ifdef RE_ENABLE_I18N
3876 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3877 free_charset (node->opr.mbcset);
3878 else
3879 #endif /* RE_ENABLE_I18N */
3880 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3881 re_free (node->opr.sbcset);
3882 }
3883
3884 /* Worker function for tree walking. Free the allocated memory inside NODE
3885 and its children. */
3886
3887 static reg_errcode_t
3888 free_tree (void *extra, bin_tree_t *node)
3889 {
3890 free_token (&node->token);
3891 return REG_NOERROR;
3892 }
3893
3894
3895 /* Duplicate the node SRC, and return new node. This is a preorder
3896 visit similar to the one implemented by the generic visitor, but
3897 we need more infrastructure to maintain two parallel trees --- so,
3898 it's easier to duplicate. */
3899
3900 static bin_tree_t *
3901 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3902 {
3903 const bin_tree_t *node;
3904 bin_tree_t *dup_root;
3905 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3906
3907 for (node = root; ; )
3908 {
3909 /* Create a new tree and link it back to the current parent. */
3910 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3911 if (*p_new == NULL)
3912 return NULL;
3913 (*p_new)->parent = dup_node;
3914 (*p_new)->token.duplicated = 1;
3915 dup_node = *p_new;
3916
3917 /* Go to the left node, or up and to the right. */
3918 if (node->left)
3919 {
3920 node = node->left;
3921 p_new = &dup_node->left;
3922 }
3923 else
3924 {
3925 const bin_tree_t *prev = NULL;
3926 while (node->right == prev || node->right == NULL)
3927 {
3928 prev = node;
3929 node = node->parent;
3930 dup_node = dup_node->parent;
3931 if (!node)
3932 return dup_root;
3933 }
3934 node = node->right;
3935 p_new = &dup_node->right;
3936 }
3937 }
3938 }