1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
25 size_t length
, reg_syntax_t syntax
);
26 static void re_compile_fastmap_iter (regex_t
*bufp
,
27 const re_dfastate_t
*init_state
,
29 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
31 static void free_charset (re_charset_t
*cset
);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t
*preg
);
34 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
36 static void optimize_utf8 (re_dfa_t
*dfa
);
38 static reg_errcode_t
analyze (regex_t
*preg
);
39 static reg_errcode_t
preorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
postorder (bin_tree_t
*root
,
43 reg_errcode_t (fn (void *, bin_tree_t
*)),
45 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
46 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
47 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
49 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
52 static Idx
duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
);
53 static Idx
search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
54 unsigned int constraint
);
55 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
56 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
58 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
59 static Idx
fetch_number (re_string_t
*input
, re_token_t
*token
,
61 static int peek_token (re_token_t
*token
, re_string_t
*input
,
63 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
64 reg_syntax_t syntax
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 Idx nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 Idx nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 Idx nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
75 re_token_t
*token
, reg_syntax_t syntax
,
76 Idx nest
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
78 re_dfa_t
*dfa
, re_token_t
*token
,
79 reg_syntax_t syntax
, reg_errcode_t
*err
);
80 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
81 re_token_t
*token
, reg_syntax_t syntax
,
83 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
85 re_token_t
*token
, int token_len
,
89 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
93 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
95 Idx
*equiv_class_alloc
,
96 const unsigned char *name
);
97 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
100 Idx
*char_class_alloc
,
101 const char *class_name
,
102 reg_syntax_t syntax
);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
105 const unsigned char *name
);
106 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 const char *class_name
,
109 reg_syntax_t syntax
);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
112 RE_TRANSLATE_TYPE trans
,
113 const char *class_name
,
115 bool non_match
, reg_errcode_t
*err
);
116 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 re_token_type_t type
);
119 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
120 bin_tree_t
*left
, bin_tree_t
*right
,
121 const re_token_t
*token
);
122 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
123 static void free_token (re_token_t
*node
);
124 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
125 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid
[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx
[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
217 re_compile_pattern (const char *pattern
, size_t length
,
218 struct re_pattern_buffer
*bufp
)
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
227 /* Match anchors at newline. */
228 bufp
->newline_anchor
= 1;
230 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
234 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
237 weak_alias (__re_compile_pattern
, re_compile_pattern
)
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options
;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (reg_syntax_t syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (struct re_pattern_buffer
*bufp
)
270 re_dfa_t
*dfa
= bufp
->buffer
;
271 char *fastmap
= bufp
->fastmap
;
273 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
274 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
275 if (dfa
->init_state
!= dfa
->init_state_word
)
276 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
277 if (dfa
->init_state
!= dfa
->init_state_nl
)
278 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
279 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
280 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
281 bufp
->fastmap_accurate
= 1;
285 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
289 __attribute__ ((always_inline
))
290 re_set_fastmap (char *fastmap
, bool icase
, int ch
)
294 fastmap
[tolower (ch
)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
304 re_dfa_t
*dfa
= bufp
->buffer
;
306 bool icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
307 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
309 Idx node
= init_state
->nodes
.elems
[node_cnt
];
310 re_token_type_t type
= dfa
->nodes
[node
].type
;
312 if (type
== CHARACTER
)
314 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
318 unsigned char buf
[MB_LEN_MAX
];
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, __towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, false, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
353 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
357 /* See if we have to try all bytes which start multiple collation
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
364 && (cset
->ncoll_syms
|| cset
->nranges
))
366 const int32_t *table
= (const int32_t *)
367 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
368 for (i
= 0; i
< SBC_MAX
; ++i
)
370 re_set_fastmap (fastmap
, icase
, i
);
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa
->mb_cur_max
> 1
379 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
381 || cset
->nequiv_classes
389 memset (&mbs
, 0, sizeof (mbs
));
390 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
391 re_set_fastmap (fastmap
, false, (int) c
);
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 memset (&state
, '\0', sizeof (state
));
404 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
405 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
406 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
408 if (__wcrtomb (buf
, __towlower (cset
->mbchars
[i
]), &state
)
410 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
415 #endif /* RE_ENABLE_I18N */
416 else if (type
== OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type
== OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type
== END_OF_RE
)
422 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
423 if (type
== END_OF_RE
)
424 bufp
->can_be_null
= 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t
*_Restrict_ preg
, const char *_Restrict_ pattern
, int cflags
)
470 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC
);
477 /* Try to allocate space for the fastmap. */
478 preg
->fastmap
= re_malloc (char, SBC_MAX
);
479 if (BE (preg
->fastmap
== NULL
, 0))
482 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags
& REG_NEWLINE
)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax
&= ~RE_DOT_NEWLINE
;
488 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
489 /* It also changes the matching behavior. */
490 preg
->newline_anchor
= 1;
493 preg
->newline_anchor
= 0;
494 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
495 preg
->translate
= NULL
;
497 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret
== REG_ERPAREN
)
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret
== REG_NOERROR
, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg
);
511 /* Some error occurred while compiling the expression. */
512 re_free (preg
->fastmap
);
513 preg
->fastmap
= NULL
;
519 libc_hidden_def (__regcomp
)
520 weak_alias (__regcomp
, regcomp
)
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
527 regerror (int errcode
, const regex_t
*_Restrict_ preg
, char *_Restrict_ errbuf
,
534 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
535 / sizeof (__re_error_msgid_idx
[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
542 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
544 msg_size
= strlen (msg
) + 1; /* Includes the null. */
546 if (BE (errbuf_size
!= 0, 1))
548 size_t cpy_size
= msg_size
;
549 if (BE (msg_size
> errbuf_size
, 0))
551 cpy_size
= errbuf_size
- 1;
552 errbuf
[cpy_size
] = '\0';
554 memcpy (errbuf
, msg
, cpy_size
);
560 weak_alias (__regerror
, regerror
)
564 #ifdef RE_ENABLE_I18N
565 /* This static array is used for the map to single-byte characters when
566 UTF-8 is used. Otherwise we would allocate memory just to initialize
567 it the same all the time. UTF-8 is the preferred encoding so this is
568 a worthwhile optimization. */
569 static const bitset_t utf8_sb_map
=
571 /* Set the first 128 bits. */
572 # if defined __GNUC__ && !defined __STRICT_ANSI__
573 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
575 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
576 # error "bitset_word_t is narrower than 32 bits"
577 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX
, BITSET_WORD_MAX
, BITSET_WORD_MAX
,
579 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX
, BITSET_WORD_MAX
,
581 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
585 >> (SBC_MAX
% BITSET_WORD_BITS
== 0
587 : BITSET_WORD_BITS
- SBC_MAX
% BITSET_WORD_BITS
))
594 free_dfa_content (re_dfa_t
*dfa
)
599 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
600 free_token (dfa
->nodes
+ i
);
601 re_free (dfa
->nexts
);
602 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
604 if (dfa
->eclosures
!= NULL
)
605 re_node_set_free (dfa
->eclosures
+ i
);
606 if (dfa
->inveclosures
!= NULL
)
607 re_node_set_free (dfa
->inveclosures
+ i
);
608 if (dfa
->edests
!= NULL
)
609 re_node_set_free (dfa
->edests
+ i
);
611 re_free (dfa
->edests
);
612 re_free (dfa
->eclosures
);
613 re_free (dfa
->inveclosures
);
614 re_free (dfa
->nodes
);
616 if (dfa
->state_table
)
617 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
619 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
620 for (j
= 0; j
< entry
->num
; ++j
)
622 re_dfastate_t
*state
= entry
->array
[j
];
625 re_free (entry
->array
);
627 re_free (dfa
->state_table
);
628 #ifdef RE_ENABLE_I18N
629 if (dfa
->sb_char
!= utf8_sb_map
)
630 re_free (dfa
->sb_char
);
632 re_free (dfa
->subexp_map
);
634 re_free (dfa
->re_str
);
641 /* Free dynamically allocated space used by PREG. */
644 regfree (regex_t
*preg
)
646 re_dfa_t
*dfa
= preg
->buffer
;
647 if (BE (dfa
!= NULL
, 1))
649 lock_fini (dfa
->lock
);
650 free_dfa_content (dfa
);
655 re_free (preg
->fastmap
);
656 preg
->fastmap
= NULL
;
658 re_free (preg
->translate
);
659 preg
->translate
= NULL
;
662 libc_hidden_def (__regfree
)
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
681 re_comp (const char *s
)
688 if (!re_comp_buf
.buffer
)
689 return gettext ("No previous regular expression");
693 if (re_comp_buf
.buffer
)
695 fastmap
= re_comp_buf
.fastmap
;
696 re_comp_buf
.fastmap
= NULL
;
697 __regfree (&re_comp_buf
);
698 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
699 re_comp_buf
.fastmap
= fastmap
;
702 if (re_comp_buf
.fastmap
== NULL
)
704 re_comp_buf
.fastmap
= re_malloc (char, SBC_MAX
);
705 if (re_comp_buf
.fastmap
== NULL
)
706 return (char *) gettext (__re_error_msgid
707 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
710 /* Since 're_exec' always passes NULL for the 'regs' argument, we
711 don't need to initialize the pattern buffer fields which affect it. */
713 /* Match anchors at newlines. */
714 re_comp_buf
.newline_anchor
= 1;
716 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
721 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
722 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
726 libc_freeres_fn (free_mem
)
728 __regfree (&re_comp_buf
);
732 #endif /* _REGEX_RE_COMP */
734 /* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
739 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
742 reg_errcode_t err
= REG_NOERROR
;
746 /* Initialize the pattern buffer. */
747 preg
->fastmap_accurate
= 0;
748 preg
->syntax
= syntax
;
749 preg
->not_bol
= preg
->not_eol
= 0;
752 preg
->can_be_null
= 0;
753 preg
->regs_allocated
= REGS_UNALLOCATED
;
755 /* Initialize the dfa. */
757 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
766 preg
->allocated
= sizeof (re_dfa_t
);
769 preg
->used
= sizeof (re_dfa_t
);
771 err
= init_dfa (dfa
, length
);
772 if (BE (err
== REG_NOERROR
&& lock_init (dfa
->lock
) != 0, 0))
774 if (BE (err
!= REG_NOERROR
, 0))
776 free_dfa_content (dfa
);
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa
->re_str
= re_malloc (char, length
+ 1);
784 strncpy (dfa
->re_str
, pattern
, length
+ 1);
787 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
788 (syntax
& RE_ICASE
) != 0, dfa
);
789 if (BE (err
!= REG_NOERROR
, 0))
791 re_compile_internal_free_return
:
792 free_workarea_compile (preg
);
793 re_string_destruct (®exp
);
794 lock_fini (dfa
->lock
);
795 free_dfa_content (dfa
);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
804 if (BE (dfa
->str_tree
== NULL
, 0))
805 goto re_compile_internal_free_return
;
807 /* Analyze the tree and create the nfa. */
808 err
= analyze (preg
);
809 if (BE (err
!= REG_NOERROR
, 0))
810 goto re_compile_internal_free_return
;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
818 /* Then create the initial state of the dfa. */
819 err
= create_initial_state (dfa
);
821 /* Release work areas. */
822 free_workarea_compile (preg
);
823 re_string_destruct (®exp
);
825 if (BE (err
!= REG_NOERROR
, 0))
827 lock_fini (dfa
->lock
);
828 free_dfa_content (dfa
);
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
840 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
842 __re_size_t table_size
;
843 #if !defined(_LIBC) && !defined(_REGEX_STANDALONE)
844 const char *codeset_name
;
846 #ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size
= MAX (sizeof (wchar_t), sizeof (wctype_t));
849 size_t max_i18n_object_size
= 0;
851 size_t max_object_size
=
852 MAX (sizeof (struct re_state_table_entry
),
853 MAX (sizeof (re_token_t
),
854 MAX (sizeof (re_node_set
),
855 MAX (sizeof (regmatch_t
),
856 max_i18n_object_size
))));
858 memset (dfa
, '\0', sizeof (re_dfa_t
));
860 /* Force allocation of str_tree_storage the first time. */
861 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) / 2 <= pat_len
, 0))
870 dfa
->nodes_alloc
= pat_len
+ 1;
871 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size
= 1; ; table_size
<<= 1)
875 if (table_size
> pat_len
)
878 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
879 dfa
->state_hash_mask
= table_size
- 1;
881 dfa
->mb_cur_max
= MB_CUR_MAX
;
883 if (dfa
->mb_cur_max
== 6
884 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
886 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
889 # if !defined(_REGEX_STANDALONE)
890 codeset_name
= nl_langinfo (CODESET
);
891 if ((codeset_name
[0] == 'U' || codeset_name
[0] == 'u')
892 && (codeset_name
[1] == 'T' || codeset_name
[1] == 't')
893 && (codeset_name
[2] == 'F' || codeset_name
[2] == 'f')
894 && strcmp (codeset_name
+ 3 + (codeset_name
[3] == '-'), "8") == 0)
898 /* We check exhaustively in the loop below if this charset is a
899 superset of ASCII. */
900 dfa
->map_notascii
= 0;
903 #ifdef RE_ENABLE_I18N
904 if (dfa
->mb_cur_max
> 1)
907 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
912 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
913 if (BE (dfa
->sb_char
== NULL
, 0))
916 /* Set the bits corresponding to single byte chars. */
917 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
918 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
920 wint_t wch
= __btowc (ch
);
922 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
924 if (isascii (ch
) && wch
!= ch
)
925 dfa
->map_notascii
= 1;
932 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
937 /* Initialize WORD_CHAR table, which indicate which character is
938 "word". In this case "word" means that it is the word construction
939 character used by some operators like "\<", "\>", etc. */
942 init_word_char (re_dfa_t
*dfa
)
947 dfa
->word_ops_used
= 1;
948 if (BE (dfa
->map_notascii
== 0, 1))
950 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
951 them, an issue when this code is used in Gnulib. */
952 bitset_word_t bits0
= 0x00000000;
953 bitset_word_t bits1
= 0x03ff0000;
954 bitset_word_t bits2
= 0x87fffffe;
955 bitset_word_t bits3
= 0x07fffffe;
956 if (BITSET_WORD_BITS
== 64)
958 /* Pacify gcc -Woverflow on 32-bit platformns. */
959 dfa
->word_char
[0] = bits1
<< 31 << 1 | bits0
;
960 dfa
->word_char
[1] = bits3
<< 31 << 1 | bits2
;
963 else if (BITSET_WORD_BITS
== 32)
965 dfa
->word_char
[0] = bits0
;
966 dfa
->word_char
[1] = bits1
;
967 dfa
->word_char
[2] = bits2
;
968 dfa
->word_char
[3] = bits3
;
975 if (BE (dfa
->is_utf8
, 1))
977 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
983 for (; i
< BITSET_WORDS
; ++i
)
984 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
985 if (isalnum (ch
) || ch
== '_')
986 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
989 /* Free the work area which are only used while compiling. */
992 free_workarea_compile (regex_t
*preg
)
994 re_dfa_t
*dfa
= preg
->buffer
;
995 bin_tree_storage_t
*storage
, *next
;
996 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
998 next
= storage
->next
;
1001 dfa
->str_tree_storage
= NULL
;
1002 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
1003 dfa
->str_tree
= NULL
;
1004 re_free (dfa
->org_indices
);
1005 dfa
->org_indices
= NULL
;
1008 /* Create initial states for all contexts. */
1010 static reg_errcode_t
1011 create_initial_state (re_dfa_t
*dfa
)
1015 re_node_set init_nodes
;
1017 /* Initial states have the epsilon closure of the node which is
1018 the first node of the regular expression. */
1019 first
= dfa
->str_tree
->first
->node_idx
;
1020 dfa
->init_node
= first
;
1021 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
1022 if (BE (err
!= REG_NOERROR
, 0))
1025 /* The back-references which are in initial states can epsilon transit,
1026 since in this case all of the subexpressions can be null.
1027 Then we add epsilon closures of the nodes which are the next nodes of
1028 the back-references. */
1029 if (dfa
->nbackref
> 0)
1030 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1032 Idx node_idx
= init_nodes
.elems
[i
];
1033 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1036 if (type
!= OP_BACK_REF
)
1038 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1040 re_token_t
*clexp_node
;
1041 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1042 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1043 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1046 if (clexp_idx
== init_nodes
.nelem
)
1049 if (type
== OP_BACK_REF
)
1051 Idx dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1052 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1054 reg_errcode_t merge_err
1055 = re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1056 if (merge_err
!= REG_NOERROR
)
1063 /* It must be the first time to invoke acquire_state. */
1064 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1065 /* We don't check ERR here, since the initial state must not be NULL. */
1066 if (BE (dfa
->init_state
== NULL
, 0))
1068 if (dfa
->init_state
->has_constraint
)
1070 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1072 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1074 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1078 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1079 || dfa
->init_state_begbuf
== NULL
, 0))
1083 dfa
->init_state_word
= dfa
->init_state_nl
1084 = dfa
->init_state_begbuf
= dfa
->init_state
;
1086 re_node_set_free (&init_nodes
);
1090 #ifdef RE_ENABLE_I18N
1091 /* If it is possible to do searching in single byte encoding instead of UTF-8
1092 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1093 DFA nodes where needed. */
1096 optimize_utf8 (re_dfa_t
*dfa
)
1100 bool mb_chars
= false;
1101 bool has_period
= false;
1103 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1104 switch (dfa
->nodes
[node
].type
)
1107 if (dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1111 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1119 /* Word anchors etc. cannot be handled. It's okay to test
1120 opr.ctx_type since constraints (for all DFA nodes) are
1121 created by ORing one or more opr.ctx_type values. */
1131 case OP_DUP_ASTERISK
:
1132 case OP_OPEN_SUBEXP
:
1133 case OP_CLOSE_SUBEXP
:
1135 case COMPLEX_BRACKET
:
1137 case SIMPLE_BRACKET
:
1138 /* Just double check. */
1140 int rshift
= (ASCII_CHARS
% BITSET_WORD_BITS
== 0
1142 : BITSET_WORD_BITS
- ASCII_CHARS
% BITSET_WORD_BITS
);
1143 for (i
= ASCII_CHARS
/ BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1145 if (dfa
->nodes
[node
].opr
.sbcset
[i
] >> rshift
!= 0)
1155 if (mb_chars
|| has_period
)
1156 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1158 if (dfa
->nodes
[node
].type
== CHARACTER
1159 && dfa
->nodes
[node
].opr
.c
>= ASCII_CHARS
)
1160 dfa
->nodes
[node
].mb_partial
= 0;
1161 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1162 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1165 /* The search can be in single byte locale. */
1166 dfa
->mb_cur_max
= 1;
1168 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1172 /* Analyze the structure tree, and calculate "first", "next", "edest",
1173 "eclosure", and "inveclosure". */
1175 static reg_errcode_t
1176 analyze (regex_t
*preg
)
1178 re_dfa_t
*dfa
= preg
->buffer
;
1181 /* Allocate arrays. */
1182 dfa
->nexts
= re_malloc (Idx
, dfa
->nodes_alloc
);
1183 dfa
->org_indices
= re_malloc (Idx
, dfa
->nodes_alloc
);
1184 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1185 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1186 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1187 || dfa
->eclosures
== NULL
, 0))
1190 dfa
->subexp_map
= re_malloc (Idx
, preg
->re_nsub
);
1191 if (dfa
->subexp_map
!= NULL
)
1194 for (i
= 0; i
< preg
->re_nsub
; i
++)
1195 dfa
->subexp_map
[i
] = i
;
1196 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1197 for (i
= 0; i
< preg
->re_nsub
; i
++)
1198 if (dfa
->subexp_map
[i
] != i
)
1200 if (i
== preg
->re_nsub
)
1202 re_free (dfa
->subexp_map
);
1203 dfa
->subexp_map
= NULL
;
1207 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1208 if (BE (ret
!= REG_NOERROR
, 0))
1210 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1211 if (BE (ret
!= REG_NOERROR
, 0))
1213 preorder (dfa
->str_tree
, calc_next
, dfa
);
1214 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1215 if (BE (ret
!= REG_NOERROR
, 0))
1217 ret
= calc_eclosure (dfa
);
1218 if (BE (ret
!= REG_NOERROR
, 0))
1221 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1222 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1223 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1226 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1227 if (BE (dfa
->inveclosures
== NULL
, 0))
1229 ret
= calc_inveclosure (dfa
);
1235 /* Our parse trees are very unbalanced, so we cannot use a stack to
1236 implement parse tree visits. Instead, we use parent pointers and
1237 some hairy code in these two functions. */
1238 static reg_errcode_t
1239 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1242 bin_tree_t
*node
, *prev
;
1244 for (node
= root
; ; )
1246 /* Descend down the tree, preferably to the left (or to the right
1247 if that's the only child). */
1248 while (node
->left
|| node
->right
)
1256 reg_errcode_t err
= fn (extra
, node
);
1257 if (BE (err
!= REG_NOERROR
, 0))
1259 if (node
->parent
== NULL
)
1262 node
= node
->parent
;
1264 /* Go up while we have a node that is reached from the right. */
1265 while (node
->right
== prev
|| node
->right
== NULL
);
1270 static reg_errcode_t
1271 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1276 for (node
= root
; ; )
1278 reg_errcode_t err
= fn (extra
, node
);
1279 if (BE (err
!= REG_NOERROR
, 0))
1282 /* Go to the left node, or up and to the right. */
1287 bin_tree_t
*prev
= NULL
;
1288 while (node
->right
== prev
|| node
->right
== NULL
)
1291 node
= node
->parent
;
1300 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1301 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1302 backreferences as well. Requires a preorder visit. */
1303 static reg_errcode_t
1304 optimize_subexps (void *extra
, bin_tree_t
*node
)
1306 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1308 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1310 int idx
= node
->token
.opr
.idx
;
1311 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1312 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1315 else if (node
->token
.type
== SUBEXP
1316 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1318 Idx other_idx
= node
->left
->token
.opr
.idx
;
1320 node
->left
= node
->left
->left
;
1322 node
->left
->parent
= node
;
1324 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1325 if (other_idx
< BITSET_WORD_BITS
)
1326 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1332 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1333 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1334 static reg_errcode_t
1335 lower_subexps (void *extra
, bin_tree_t
*node
)
1337 regex_t
*preg
= (regex_t
*) extra
;
1338 reg_errcode_t err
= REG_NOERROR
;
1340 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1342 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1344 node
->left
->parent
= node
;
1346 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1348 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1350 node
->right
->parent
= node
;
1357 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1359 re_dfa_t
*dfa
= preg
->buffer
;
1360 bin_tree_t
*body
= node
->left
;
1361 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1364 /* We do not optimize empty subexpressions, because otherwise we may
1365 have bad CONCAT nodes with NULL children. This is obviously not
1366 very common, so we do not lose much. An example that triggers
1367 this case is the sed "script" /\(\)/x. */
1368 && node
->left
!= NULL
1369 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1370 || !(dfa
->used_bkref_map
1371 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1374 /* Convert the SUBEXP node to the concatenation of an
1375 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1376 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1377 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1378 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1379 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1380 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1386 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1387 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392 nodes. Requires a postorder visit. */
1393 static reg_errcode_t
1394 calc_first (void *extra
, bin_tree_t
*node
)
1396 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1397 if (node
->token
.type
== CONCAT
)
1399 node
->first
= node
->left
->first
;
1400 node
->node_idx
= node
->left
->node_idx
;
1405 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1406 if (BE (node
->node_idx
== -1, 0))
1408 if (node
->token
.type
== ANCHOR
)
1409 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1414 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1415 static reg_errcode_t
1416 calc_next (void *extra
, bin_tree_t
*node
)
1418 switch (node
->token
.type
)
1420 case OP_DUP_ASTERISK
:
1421 node
->left
->next
= node
;
1424 node
->left
->next
= node
->right
->first
;
1425 node
->right
->next
= node
->next
;
1429 node
->left
->next
= node
->next
;
1431 node
->right
->next
= node
->next
;
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1441 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1442 Idx idx
= node
->node_idx
;
1443 reg_errcode_t err
= REG_NOERROR
;
1445 switch (node
->token
.type
)
1451 assert (node
->next
== NULL
);
1454 case OP_DUP_ASTERISK
:
1458 dfa
->has_plural_match
= 1;
1459 if (node
->left
!= NULL
)
1460 left
= node
->left
->first
->node_idx
;
1462 left
= node
->next
->node_idx
;
1463 if (node
->right
!= NULL
)
1464 right
= node
->right
->first
->node_idx
;
1466 right
= node
->next
->node_idx
;
1468 assert (right
> -1);
1469 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1474 case OP_OPEN_SUBEXP
:
1475 case OP_CLOSE_SUBEXP
:
1476 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1480 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1481 if (node
->token
.type
== OP_BACK_REF
)
1482 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1486 assert (!IS_EPSILON_NODE (node
->token
.type
));
1487 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496 to their own constraint. */
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t
*dfa
, Idx top_org_node
, Idx top_clone_node
,
1500 Idx root_node
, unsigned int init_constraint
)
1502 Idx org_node
, clone_node
;
1504 unsigned int constraint
= init_constraint
;
1505 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1507 Idx org_dest
, clone_dest
;
1508 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1510 /* If the back reference epsilon-transit, its destination must
1511 also have the constraint. Then duplicate the epsilon closure
1512 of the destination of the back reference, and store it in
1513 edests of the back reference. */
1514 org_dest
= dfa
->nexts
[org_node
];
1515 re_node_set_empty (dfa
->edests
+ clone_node
);
1516 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1517 if (BE (clone_dest
== -1, 0))
1519 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1520 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1524 else if (dfa
->edests
[org_node
].nelem
== 0)
1526 /* In case of the node can't epsilon-transit, don't duplicate the
1527 destination and store the original destination as the
1528 destination of the node. */
1529 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1532 else if (dfa
->edests
[org_node
].nelem
== 1)
1534 /* In case of the node can epsilon-transit, and it has only one
1536 org_dest
= dfa
->edests
[org_node
].elems
[0];
1537 re_node_set_empty (dfa
->edests
+ clone_node
);
1538 /* If the node is root_node itself, it means the epsilon closure
1539 has a loop. Then tie it to the destination of the root_node. */
1540 if (org_node
== root_node
&& clone_node
!= org_node
)
1542 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1547 /* In case the node has another constraint, append it. */
1548 constraint
|= dfa
->nodes
[org_node
].constraint
;
1549 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1550 if (BE (clone_dest
== -1, 0))
1552 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1556 else /* dfa->edests[org_node].nelem == 2 */
1558 /* In case of the node can epsilon-transit, and it has two
1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1560 org_dest
= dfa
->edests
[org_node
].elems
[0];
1561 re_node_set_empty (dfa
->edests
+ clone_node
);
1562 /* Search for a duplicated node which satisfies the constraint. */
1563 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1564 if (clone_dest
== -1)
1566 /* There is no such duplicated node, create a new one. */
1568 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1569 if (BE (clone_dest
== -1, 0))
1571 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1574 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1575 root_node
, constraint
);
1576 if (BE (err
!= REG_NOERROR
, 0))
1581 /* There is a duplicated node which satisfies the constraint,
1582 use it to avoid infinite loop. */
1583 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1588 org_dest
= dfa
->edests
[org_node
].elems
[1];
1589 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1590 if (BE (clone_dest
== -1, 0))
1592 ok
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1596 org_node
= org_dest
;
1597 clone_node
= clone_dest
;
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603 satisfies the constraint CONSTRAINT. */
1606 search_duplicated_node (const re_dfa_t
*dfa
, Idx org_node
,
1607 unsigned int constraint
)
1610 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1612 if (org_node
== dfa
->org_indices
[idx
]
1613 && constraint
== dfa
->nodes
[idx
].constraint
)
1614 return idx
; /* Found. */
1616 return -1; /* Not found. */
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620 Return the index of the new node, or -1 if insufficient storage is
1624 duplicate_node (re_dfa_t
*dfa
, Idx org_idx
, unsigned int constraint
)
1626 Idx dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1627 if (BE (dup_idx
!= -1, 1))
1629 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1630 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1631 dfa
->nodes
[dup_idx
].duplicated
= 1;
1633 /* Store the index of the original node. */
1634 dfa
->org_indices
[dup_idx
] = org_idx
;
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t
*dfa
)
1644 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1645 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1647 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1649 Idx
*elems
= dfa
->eclosures
[src
].elems
;
1650 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1652 ok
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1661 /* Calculate "eclosure" for all the node in DFA. */
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t
*dfa
)
1669 assert (dfa
->nodes_len
> 0);
1672 /* For each nodes, calculate epsilon closure. */
1673 for (node_idx
= 0; ; ++node_idx
)
1676 re_node_set eclosure_elem
;
1677 if (node_idx
== dfa
->nodes_len
)
1686 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1689 /* If we have already calculated, skip it. */
1690 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1692 /* Calculate epsilon closure of 'node_idx'. */
1693 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, true);
1694 if (BE (err
!= REG_NOERROR
, 0))
1697 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1700 re_node_set_free (&eclosure_elem
);
1706 /* Calculate epsilon closure of NODE. */
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, Idx node
, bool root
)
1713 re_node_set eclosure
;
1715 bool incomplete
= false;
1716 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1717 if (BE (err
!= REG_NOERROR
, 0))
1720 /* This indicates that we are calculating this node now.
1721 We reference this value to avoid infinite loop. */
1722 dfa
->eclosures
[node
].nelem
= -1;
1724 /* If the current node has constraints, duplicate all nodes
1725 since they must inherit the constraints. */
1726 if (dfa
->nodes
[node
].constraint
1727 && dfa
->edests
[node
].nelem
1728 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1730 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1731 dfa
->nodes
[node
].constraint
);
1732 if (BE (err
!= REG_NOERROR
, 0))
1736 /* Expand each epsilon destination nodes. */
1737 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1738 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1740 re_node_set eclosure_elem
;
1741 Idx edest
= dfa
->edests
[node
].elems
[i
];
1742 /* If calculating the epsilon closure of 'edest' is in progress,
1743 return intermediate result. */
1744 if (dfa
->eclosures
[edest
].nelem
== -1)
1749 /* If we haven't calculated the epsilon closure of 'edest' yet,
1750 calculate now. Otherwise use calculated epsilon closure. */
1751 if (dfa
->eclosures
[edest
].nelem
== 0)
1753 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, false);
1754 if (BE (err
!= REG_NOERROR
, 0))
1758 eclosure_elem
= dfa
->eclosures
[edest
];
1759 /* Merge the epsilon closure of 'edest'. */
1760 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1761 if (BE (err
!= REG_NOERROR
, 0))
1763 /* If the epsilon closure of 'edest' is incomplete,
1764 the epsilon closure of this node is also incomplete. */
1765 if (dfa
->eclosures
[edest
].nelem
== 0)
1768 re_node_set_free (&eclosure_elem
);
1772 /* An epsilon closure includes itself. */
1773 ok
= re_node_set_insert (&eclosure
, node
);
1776 if (incomplete
&& !root
)
1777 dfa
->eclosures
[node
].nelem
= 0;
1779 dfa
->eclosures
[node
] = eclosure
;
1780 *new_set
= eclosure
;
1784 /* Functions for token which are used in the parser. */
1786 /* Fetch a token from INPUT.
1787 We must not use this function inside bracket expressions. */
1790 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1792 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1795 /* Peek a token from INPUT, and return the length of the token.
1796 We must not use this function inside bracket expressions. */
1799 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1803 if (re_string_eoi (input
))
1805 token
->type
= END_OF_RE
;
1809 c
= re_string_peek_byte (input
, 0);
1812 token
->word_char
= 0;
1813 #ifdef RE_ENABLE_I18N
1814 token
->mb_partial
= 0;
1815 if (input
->mb_cur_max
> 1 &&
1816 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1818 token
->type
= CHARACTER
;
1819 token
->mb_partial
= 1;
1826 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1828 token
->type
= BACK_SLASH
;
1832 c2
= re_string_peek_byte_case (input
, 1);
1834 token
->type
= CHARACTER
;
1835 #ifdef RE_ENABLE_I18N
1836 if (input
->mb_cur_max
> 1)
1838 wint_t wc
= re_string_wchar_at (input
,
1839 re_string_cur_idx (input
) + 1);
1840 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1844 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1849 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1850 token
->type
= OP_ALT
;
1852 case '1': case '2': case '3': case '4': case '5':
1853 case '6': case '7': case '8': case '9':
1854 if (!(syntax
& RE_NO_BK_REFS
))
1856 token
->type
= OP_BACK_REF
;
1857 token
->opr
.idx
= c2
- '1';
1861 if (!(syntax
& RE_NO_GNU_OPS
))
1863 token
->type
= ANCHOR
;
1864 token
->opr
.ctx_type
= WORD_FIRST
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1870 token
->type
= ANCHOR
;
1871 token
->opr
.ctx_type
= WORD_LAST
;
1875 if (!(syntax
& RE_NO_GNU_OPS
))
1877 token
->type
= ANCHOR
;
1878 token
->opr
.ctx_type
= WORD_DELIM
;
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1884 token
->type
= ANCHOR
;
1885 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1889 if (!(syntax
& RE_NO_GNU_OPS
))
1890 token
->type
= OP_WORD
;
1893 if (!(syntax
& RE_NO_GNU_OPS
))
1894 token
->type
= OP_NOTWORD
;
1897 if (!(syntax
& RE_NO_GNU_OPS
))
1898 token
->type
= OP_SPACE
;
1901 if (!(syntax
& RE_NO_GNU_OPS
))
1902 token
->type
= OP_NOTSPACE
;
1905 if (!(syntax
& RE_NO_GNU_OPS
))
1907 token
->type
= ANCHOR
;
1908 token
->opr
.ctx_type
= BUF_FIRST
;
1912 if (!(syntax
& RE_NO_GNU_OPS
))
1914 token
->type
= ANCHOR
;
1915 token
->opr
.ctx_type
= BUF_LAST
;
1919 if (!(syntax
& RE_NO_BK_PARENS
))
1920 token
->type
= OP_OPEN_SUBEXP
;
1923 if (!(syntax
& RE_NO_BK_PARENS
))
1924 token
->type
= OP_CLOSE_SUBEXP
;
1927 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1928 token
->type
= OP_DUP_PLUS
;
1931 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1932 token
->type
= OP_DUP_QUESTION
;
1935 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1936 token
->type
= OP_OPEN_DUP_NUM
;
1939 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1940 token
->type
= OP_CLOSE_DUP_NUM
;
1948 token
->type
= CHARACTER
;
1949 #ifdef RE_ENABLE_I18N
1950 if (input
->mb_cur_max
> 1)
1952 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1953 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1957 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1962 if (syntax
& RE_NEWLINE_ALT
)
1963 token
->type
= OP_ALT
;
1966 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1967 token
->type
= OP_ALT
;
1970 token
->type
= OP_DUP_ASTERISK
;
1973 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1974 token
->type
= OP_DUP_PLUS
;
1977 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1978 token
->type
= OP_DUP_QUESTION
;
1981 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1982 token
->type
= OP_OPEN_DUP_NUM
;
1985 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1986 token
->type
= OP_CLOSE_DUP_NUM
;
1989 if (syntax
& RE_NO_BK_PARENS
)
1990 token
->type
= OP_OPEN_SUBEXP
;
1993 if (syntax
& RE_NO_BK_PARENS
)
1994 token
->type
= OP_CLOSE_SUBEXP
;
1997 token
->type
= OP_OPEN_BRACKET
;
2000 token
->type
= OP_PERIOD
;
2003 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
2004 re_string_cur_idx (input
) != 0)
2006 char prev
= re_string_peek_byte (input
, -1);
2007 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
2010 token
->type
= ANCHOR
;
2011 token
->opr
.ctx_type
= LINE_FIRST
;
2014 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2015 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2018 re_string_skip_bytes (input
, 1);
2019 peek_token (&next
, input
, syntax
);
2020 re_string_skip_bytes (input
, -1);
2021 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2024 token
->type
= ANCHOR
;
2025 token
->opr
.ctx_type
= LINE_LAST
;
2033 /* Peek a token from INPUT, and return the length of the token.
2034 We must not use this function out of bracket expressions. */
2037 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2040 if (re_string_eoi (input
))
2042 token
->type
= END_OF_RE
;
2045 c
= re_string_peek_byte (input
, 0);
2048 #ifdef RE_ENABLE_I18N
2049 if (input
->mb_cur_max
> 1 &&
2050 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2052 token
->type
= CHARACTER
;
2055 #endif /* RE_ENABLE_I18N */
2057 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2058 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2060 /* In this case, '\' escape a character. */
2062 re_string_skip_bytes (input
, 1);
2063 c2
= re_string_peek_byte (input
, 0);
2065 token
->type
= CHARACTER
;
2068 if (c
== '[') /* '[' is a special char in a bracket exps. */
2072 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2073 c2
= re_string_peek_byte (input
, 1);
2081 token
->type
= OP_OPEN_COLL_ELEM
;
2085 token
->type
= OP_OPEN_EQUIV_CLASS
;
2089 if (syntax
& RE_CHAR_CLASSES
)
2091 token
->type
= OP_OPEN_CHAR_CLASS
;
2096 token
->type
= CHARACTER
;
2106 token
->type
= OP_CHARSET_RANGE
;
2109 token
->type
= OP_CLOSE_BRACKET
;
2112 token
->type
= OP_NON_MATCH_LIST
;
2115 token
->type
= CHARACTER
;
2120 /* Functions for parser. */
2122 /* Entry point of the parser.
2123 Parse the regular expression REGEXP and return the structure tree.
2124 If an error occurs, ERR is set by error code, and return NULL.
2125 This function build the following tree, from regular expression <reg_exp>:
2131 CAT means concatenation.
2132 EOR means end of regular expression. */
2135 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2138 re_dfa_t
*dfa
= preg
->buffer
;
2139 bin_tree_t
*tree
, *eor
, *root
;
2140 re_token_t current_token
;
2141 dfa
->syntax
= syntax
;
2142 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2143 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2144 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2146 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2148 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2151 if (BE (eor
== NULL
|| root
== NULL
, 0))
2159 /* This function build the following tree, from regular expression
2160 <branch1>|<branch2>:
2166 ALT means alternative, which represents the operator '|'. */
2169 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2170 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2172 re_dfa_t
*dfa
= preg
->buffer
;
2173 bin_tree_t
*tree
, *branch
= NULL
;
2174 bitset_word_t initial_bkref_map
= dfa
->completed_bkref_map
;
2175 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2176 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2179 while (token
->type
== OP_ALT
)
2181 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2182 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2183 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2185 bitset_word_t accumulated_bkref_map
= dfa
->completed_bkref_map
;
2186 dfa
->completed_bkref_map
= initial_bkref_map
;
2187 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2188 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2191 postorder (tree
, free_tree
, NULL
);
2194 dfa
->completed_bkref_map
|= accumulated_bkref_map
;
2198 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2199 if (BE (tree
== NULL
, 0))
2208 /* This function build the following tree, from regular expression
2215 CAT means concatenation. */
2218 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2219 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2221 bin_tree_t
*tree
, *expr
;
2222 re_dfa_t
*dfa
= preg
->buffer
;
2223 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2224 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2227 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2228 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2230 expr
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2231 if (BE (*err
!= REG_NOERROR
&& expr
== NULL
, 0))
2234 postorder (tree
, free_tree
, NULL
);
2237 if (tree
!= NULL
&& expr
!= NULL
)
2239 bin_tree_t
*newtree
= create_tree (dfa
, tree
, expr
, CONCAT
);
2240 if (newtree
== NULL
)
2242 postorder (expr
, free_tree
, NULL
);
2243 postorder (tree
, free_tree
, NULL
);
2249 else if (tree
== NULL
)
2251 /* Otherwise expr == NULL, we don't need to create new tree. */
2256 /* This function build the following tree, from regular expression a*:
2263 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2264 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2266 re_dfa_t
*dfa
= preg
->buffer
;
2268 switch (token
->type
)
2271 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2272 if (BE (tree
== NULL
, 0))
2277 #ifdef RE_ENABLE_I18N
2278 if (dfa
->mb_cur_max
> 1)
2280 while (!re_string_eoi (regexp
)
2281 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2283 bin_tree_t
*mbc_remain
;
2284 fetch_token (token
, regexp
, syntax
);
2285 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2286 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2287 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2297 case OP_OPEN_SUBEXP
:
2298 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2299 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2303 case OP_OPEN_BRACKET
:
2304 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2305 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2310 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2315 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2316 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2317 if (BE (tree
== NULL
, 0))
2323 dfa
->has_mb_node
= 1;
2326 case OP_OPEN_DUP_NUM
:
2327 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2333 case OP_DUP_ASTERISK
:
2335 case OP_DUP_QUESTION
:
2336 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2341 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2343 fetch_token (token
, regexp
, syntax
);
2344 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2347 case OP_CLOSE_SUBEXP
:
2348 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2349 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2355 case OP_CLOSE_DUP_NUM
:
2356 /* We treat it as a normal character. */
2358 /* Then we can these characters as normal characters. */
2359 token
->type
= CHARACTER
;
2360 /* mb_partial and word_char bits should be initialized already
2362 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2363 if (BE (tree
== NULL
, 0))
2371 if ((token
->opr
.ctx_type
2372 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2373 && dfa
->word_ops_used
== 0)
2374 init_word_char (dfa
);
2375 if (token
->opr
.ctx_type
== WORD_DELIM
2376 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2378 bin_tree_t
*tree_first
, *tree_last
;
2379 if (token
->opr
.ctx_type
== WORD_DELIM
)
2381 token
->opr
.ctx_type
= WORD_FIRST
;
2382 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2383 token
->opr
.ctx_type
= WORD_LAST
;
2387 token
->opr
.ctx_type
= INSIDE_WORD
;
2388 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2389 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2391 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2392 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2393 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2401 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2402 if (BE (tree
== NULL
, 0))
2408 /* We must return here, since ANCHORs can't be followed
2409 by repetition operators.
2410 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2411 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2412 fetch_token (token
, regexp
, syntax
);
2416 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2417 if (BE (tree
== NULL
, 0))
2422 if (dfa
->mb_cur_max
> 1)
2423 dfa
->has_mb_node
= 1;
2428 tree
= build_charclass_op (dfa
, regexp
->trans
,
2431 token
->type
== OP_NOTWORD
, err
);
2432 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2438 tree
= build_charclass_op (dfa
, regexp
->trans
,
2441 token
->type
== OP_NOTSPACE
, err
);
2442 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2455 /* Must not happen? */
2461 fetch_token (token
, regexp
, syntax
);
2463 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2464 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2466 bin_tree_t
*dup_tree
= parse_dup_op (tree
, regexp
, dfa
, token
,
2468 if (BE (*err
!= REG_NOERROR
&& dup_tree
== NULL
, 0))
2471 postorder (tree
, free_tree
, NULL
);
2475 /* In BRE consecutive duplications are not allowed. */
2476 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2477 && (token
->type
== OP_DUP_ASTERISK
2478 || token
->type
== OP_OPEN_DUP_NUM
))
2481 postorder (tree
, free_tree
, NULL
);
2490 /* This function build the following tree, from regular expression
2498 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2499 reg_syntax_t syntax
, Idx nest
, reg_errcode_t
*err
)
2501 re_dfa_t
*dfa
= preg
->buffer
;
2504 cur_nsub
= preg
->re_nsub
++;
2506 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2508 /* The subexpression may be a null string. */
2509 if (token
->type
== OP_CLOSE_SUBEXP
)
2513 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2514 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2517 postorder (tree
, free_tree
, NULL
);
2520 if (BE (*err
!= REG_NOERROR
, 0))
2524 if (cur_nsub
<= '9' - '1')
2525 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2527 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2528 if (BE (tree
== NULL
, 0))
2533 tree
->token
.opr
.idx
= cur_nsub
;
2537 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2540 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2541 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2543 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2544 Idx i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2545 re_token_t start_token
= *token
;
2547 if (token
->type
== OP_OPEN_DUP_NUM
)
2550 start
= fetch_number (regexp
, token
, syntax
);
2553 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2554 start
= 0; /* We treat "{,m}" as "{0,m}". */
2557 *err
= REG_BADBR
; /* <re>{} is invalid. */
2561 if (BE (start
!= -2, 1))
2563 /* We treat "{n}" as "{n,n}". */
2564 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2565 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2566 ? fetch_number (regexp
, token
, syntax
) : -2));
2568 if (BE (start
== -2 || end
== -2, 0))
2570 /* Invalid sequence. */
2571 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2573 if (token
->type
== END_OF_RE
)
2581 /* If the syntax bit is set, rollback. */
2582 re_string_set_index (regexp
, start_idx
);
2583 *token
= start_token
;
2584 token
->type
= CHARACTER
;
2585 /* mb_partial and word_char bits should be already initialized by
2590 if (BE ((end
!= -1 && start
> end
)
2591 || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2593 /* First number greater than second. */
2598 if (BE (RE_DUP_MAX
< (end
== -1 ? start
: end
), 0))
2606 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2607 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2610 fetch_token (token
, regexp
, syntax
);
2612 if (BE (elem
== NULL
, 0))
2614 if (BE (start
== 0 && end
== 0, 0))
2616 postorder (elem
, free_tree
, NULL
);
2620 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2621 if (BE (start
> 0, 0))
2624 for (i
= 2; i
<= start
; ++i
)
2626 elem
= duplicate_tree (elem
, dfa
);
2627 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2628 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2629 goto parse_dup_op_espace
;
2635 /* Duplicate ELEM before it is marked optional. */
2636 elem
= duplicate_tree (elem
, dfa
);
2637 if (BE (elem
== NULL
, 0))
2638 goto parse_dup_op_espace
;
2644 if (elem
->token
.type
== SUBEXP
)
2646 uintptr_t subidx
= elem
->token
.opr
.idx
;
2647 postorder (elem
, mark_opt_subexp
, (void *) subidx
);
2650 tree
= create_tree (dfa
, elem
, NULL
,
2651 (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2652 if (BE (tree
== NULL
, 0))
2653 goto parse_dup_op_espace
;
2655 /* This loop is actually executed only when end != -1,
2656 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2657 already created the start+1-th copy. */
2658 if (TYPE_SIGNED (Idx
) || end
!= -1)
2659 for (i
= start
+ 2; i
<= end
; ++i
)
2661 elem
= duplicate_tree (elem
, dfa
);
2662 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2663 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2664 goto parse_dup_op_espace
;
2666 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2667 if (BE (tree
== NULL
, 0))
2668 goto parse_dup_op_espace
;
2672 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2676 parse_dup_op_espace
:
2681 /* Size of the names for collating symbol/equivalence_class/character_class.
2682 I'm not sure, but maybe enough. */
2683 #define BRACKET_NAME_BUF_SIZE 32
2687 # ifdef RE_ENABLE_I18N
2688 /* Convert the byte B to the corresponding wide character. In a
2689 unibyte locale, treat B as itself if it is an encoding error.
2690 In a multibyte locale, return WEOF if B is an encoding error. */
2692 parse_byte (unsigned char b
, re_charset_t
*mbcset
)
2694 wint_t wc
= __btowc (b
);
2695 return wc
== WEOF
&& !mbcset
? b
: wc
;
2699 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700 Build the range expression which starts from START_ELEM, and ends
2701 at END_ELEM. The result are written to MBCSET and SBCSET.
2702 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703 mbcset->range_ends, is a pointer argument since we may
2706 static reg_errcode_t
2707 # ifdef RE_ENABLE_I18N
2708 build_range_exp (const reg_syntax_t syntax
,
2710 re_charset_t
*mbcset
,
2712 const bracket_elem_t
*start_elem
,
2713 const bracket_elem_t
*end_elem
)
2714 # else /* not RE_ENABLE_I18N */
2715 build_range_exp (const reg_syntax_t syntax
,
2717 const bracket_elem_t
*start_elem
,
2718 const bracket_elem_t
*end_elem
)
2719 # endif /* not RE_ENABLE_I18N */
2721 unsigned int start_ch
, end_ch
;
2722 /* Equivalence Classes and Character Classes can't be a range start/end. */
2723 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2724 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2728 /* We can handle no multi character collating elements without libc
2730 if (BE ((start_elem
->type
== COLL_SYM
2731 && strlen ((char *) start_elem
->opr
.name
) > 1)
2732 || (end_elem
->type
== COLL_SYM
2733 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2734 return REG_ECOLLATE
;
2736 # ifdef RE_ENABLE_I18N
2742 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2743 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2745 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2746 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2748 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2749 ? parse_byte (start_ch
, mbcset
) : start_elem
->opr
.wch
);
2750 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2751 ? parse_byte (end_ch
, mbcset
) : end_elem
->opr
.wch
);
2752 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2753 return REG_ECOLLATE
;
2754 else if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_wc
> end_wc
, 0))
2757 /* Got valid collation sequence values, add them as a new entry.
2758 However, for !_LIBC we have no collation elements: if the
2759 character set is single byte, the single byte character set
2760 that we build below suffices. parse_bracket_exp passes
2761 no MBCSET if dfa->mb_cur_max == 1. */
2764 /* Check the space of the arrays. */
2765 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2767 /* There is not enough space, need realloc. */
2768 wchar_t *new_array_start
, *new_array_end
;
2771 /* +1 in case of mbcset->nranges is 0. */
2772 new_nranges
= 2 * mbcset
->nranges
+ 1;
2773 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2774 are NULL if *range_alloc == 0. */
2775 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2777 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2780 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2782 re_free (new_array_start
);
2783 re_free (new_array_end
);
2787 mbcset
->range_starts
= new_array_start
;
2788 mbcset
->range_ends
= new_array_end
;
2789 *range_alloc
= new_nranges
;
2792 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2793 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2796 /* Build the table for single byte characters. */
2797 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2799 if (start_wc
<= wc
&& wc
<= end_wc
)
2800 bitset_set (sbcset
, wc
);
2803 # else /* not RE_ENABLE_I18N */
2806 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2807 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2809 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2810 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2812 if (start_ch
> end_ch
)
2814 /* Build the table for single byte characters. */
2815 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2816 if (start_ch
<= ch
&& ch
<= end_ch
)
2817 bitset_set (sbcset
, ch
);
2819 # endif /* not RE_ENABLE_I18N */
2822 #endif /* not _LIBC */
2825 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2826 Build the collating element which is represented by NAME.
2827 The result are written to MBCSET and SBCSET.
2828 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2829 pointer argument since we may update it. */
2831 static reg_errcode_t
2832 # ifdef RE_ENABLE_I18N
2833 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2834 Idx
*coll_sym_alloc
, const unsigned char *name
)
2835 # else /* not RE_ENABLE_I18N */
2836 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2837 # endif /* not RE_ENABLE_I18N */
2839 size_t name_len
= strlen ((const char *) name
);
2840 if (BE (name_len
!= 1, 0))
2841 return REG_ECOLLATE
;
2844 bitset_set (sbcset
, name
[0]);
2848 #endif /* not _LIBC */
2850 /* This function parse bracket expression like "[abc]", "[a-c]",
2854 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2855 reg_syntax_t syntax
, reg_errcode_t
*err
)
2858 const unsigned char *collseqmb
;
2859 const char *collseqwc
;
2862 const int32_t *symb_table
;
2863 const unsigned char *extra
;
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866 Seek the collating symbol entry corresponding to NAME.
2867 Return the index of the symbol in the SYMB_TABLE,
2868 or -1 if not found. */
2871 __attribute__ ((always_inline
))
2872 seek_collating_symbol_entry (const unsigned char *name
, size_t name_len
)
2876 for (elem
= 0; elem
< table_size
; elem
++)
2877 if (symb_table
[2 * elem
] != 0)
2879 int32_t idx
= symb_table
[2 * elem
+ 1];
2880 /* Skip the name of collating element name. */
2881 idx
+= 1 + extra
[idx
];
2882 if (/* Compare the length of the name. */
2883 name_len
== extra
[idx
]
2884 /* Compare the name. */
2885 && memcmp (name
, &extra
[idx
+ 1], name_len
) == 0)
2886 /* Yep, this is the entry. */
2892 /* Local function for parse_bracket_exp used in _LIBC environment.
2893 Look up the collation sequence value of BR_ELEM.
2894 Return the value if succeeded, UINT_MAX otherwise. */
2896 auto inline unsigned int
2897 __attribute__ ((always_inline
))
2898 lookup_collation_sequence_value (bracket_elem_t
*br_elem
)
2900 if (br_elem
->type
== SB_CHAR
)
2903 if (MB_CUR_MAX == 1)
2906 return collseqmb
[br_elem
->opr
.ch
];
2909 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2910 return __collseq_table_lookup (collseqwc
, wc
);
2913 else if (br_elem
->type
== MB_CHAR
)
2916 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2918 else if (br_elem
->type
== COLL_SYM
)
2920 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2924 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2928 /* We found the entry. */
2929 idx
= symb_table
[2 * elem
+ 1];
2930 /* Skip the name of collating element name. */
2931 idx
+= 1 + extra
[idx
];
2932 /* Skip the byte sequence of the collating element. */
2933 idx
+= 1 + extra
[idx
];
2934 /* Adjust for the alignment. */
2935 idx
= (idx
+ 3) & ~3;
2936 /* Skip the multibyte collation sequence value. */
2937 idx
+= sizeof (unsigned int);
2938 /* Skip the wide char sequence of the collating element. */
2939 idx
+= sizeof (unsigned int) *
2940 (1 + *(unsigned int *) (extra
+ idx
));
2941 /* Return the collation sequence value. */
2942 return *(unsigned int *) (extra
+ idx
);
2944 else if (sym_name_len
== 1)
2946 /* No valid character. Match it as a single byte
2948 return collseqmb
[br_elem
->opr
.name
[0]];
2951 else if (sym_name_len
== 1)
2952 return collseqmb
[br_elem
->opr
.name
[0]];
2957 /* Local function for parse_bracket_exp used in _LIBC environment.
2958 Build the range expression which starts from START_ELEM, and ends
2959 at END_ELEM. The result are written to MBCSET and SBCSET.
2960 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2961 mbcset->range_ends, is a pointer argument since we may
2964 auto inline reg_errcode_t
2965 __attribute__ ((always_inline
))
2966 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2967 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2970 uint32_t start_collseq
;
2971 uint32_t end_collseq
;
2973 /* Equivalence Classes and Character Classes can't be a range
2975 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2976 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2980 /* FIXME: Implement rational ranges here, too. */
2981 start_collseq
= lookup_collation_sequence_value (start_elem
);
2982 end_collseq
= lookup_collation_sequence_value (end_elem
);
2983 /* Check start/end collation sequence values. */
2984 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2985 return REG_ECOLLATE
;
2986 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2989 /* Got valid collation sequence values, add them as a new entry.
2990 However, if we have no collation elements, and the character set
2991 is single byte, the single byte character set that we
2992 build below suffices. */
2993 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2995 /* Check the space of the arrays. */
2996 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2998 /* There is not enough space, need realloc. */
2999 uint32_t *new_array_start
;
3000 uint32_t *new_array_end
;
3003 /* +1 in case of mbcset->nranges is 0. */
3004 new_nranges
= 2 * mbcset
->nranges
+ 1;
3005 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
3007 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
3010 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
3013 mbcset
->range_starts
= new_array_start
;
3014 mbcset
->range_ends
= new_array_end
;
3015 *range_alloc
= new_nranges
;
3018 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
3019 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
3022 /* Build the table for single byte characters. */
3023 for (ch
= 0; ch
< SBC_MAX
; ch
++)
3025 uint32_t ch_collseq
;
3027 if (MB_CUR_MAX == 1)
3030 ch_collseq
= collseqmb
[ch
];
3032 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3033 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3034 bitset_set (sbcset
, ch
);
3039 /* Local function for parse_bracket_exp used in _LIBC environment.
3040 Build the collating element which is represented by NAME.
3041 The result are written to MBCSET and SBCSET.
3042 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3043 pointer argument since we may update it. */
3045 auto inline reg_errcode_t
3046 __attribute__ ((always_inline
))
3047 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
3048 Idx
*coll_sym_alloc
, const unsigned char *name
)
3051 size_t name_len
= strlen ((const char *) name
);
3054 elem
= seek_collating_symbol_entry (name
, name_len
);
3057 /* We found the entry. */
3058 idx
= symb_table
[2 * elem
+ 1];
3059 /* Skip the name of collating element name. */
3060 idx
+= 1 + extra
[idx
];
3062 else if (name_len
== 1)
3064 /* No valid character, treat it as a normal
3066 bitset_set (sbcset
, name
[0]);
3070 return REG_ECOLLATE
;
3072 /* Got valid collation sequence, add it as a new entry. */
3073 /* Check the space of the arrays. */
3074 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3076 /* Not enough, realloc it. */
3077 /* +1 in case of mbcset->ncoll_syms is 0. */
3078 Idx new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3079 /* Use realloc since mbcset->coll_syms is NULL
3081 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3082 new_coll_sym_alloc
);
3083 if (BE (new_coll_syms
== NULL
, 0))
3085 mbcset
->coll_syms
= new_coll_syms
;
3086 *coll_sym_alloc
= new_coll_sym_alloc
;
3088 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3093 if (BE (name_len
!= 1, 0))
3094 return REG_ECOLLATE
;
3097 bitset_set (sbcset
, name
[0]);
3104 re_token_t br_token
;
3105 re_bitset_ptr_t sbcset
;
3106 #ifdef RE_ENABLE_I18N
3107 re_charset_t
*mbcset
;
3108 Idx coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3109 Idx equiv_class_alloc
= 0, char_class_alloc
= 0;
3110 #endif /* not RE_ENABLE_I18N */
3111 bool non_match
= false;
3112 bin_tree_t
*work_tree
;
3114 bool first_round
= true;
3116 collseqmb
= (const unsigned char *)
3117 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3118 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3124 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3125 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3126 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3127 _NL_COLLATE_SYMB_TABLEMB
);
3128 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3129 _NL_COLLATE_SYMB_EXTRAMB
);
3132 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3133 #ifdef RE_ENABLE_I18N
3134 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3135 #endif /* RE_ENABLE_I18N */
3136 #ifdef RE_ENABLE_I18N
3137 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3139 if (BE (sbcset
== NULL
, 0))
3140 #endif /* RE_ENABLE_I18N */
3143 #ifdef RE_ENABLE_I18N
3150 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3151 if (BE (token
->type
== END_OF_RE
, 0))
3154 goto parse_bracket_exp_free_return
;
3156 if (token
->type
== OP_NON_MATCH_LIST
)
3158 #ifdef RE_ENABLE_I18N
3159 mbcset
->non_match
= 1;
3160 #endif /* not RE_ENABLE_I18N */
3162 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3163 bitset_set (sbcset
, '\n');
3164 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3165 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3166 if (BE (token
->type
== END_OF_RE
, 0))
3169 goto parse_bracket_exp_free_return
;
3173 /* We treat the first ']' as a normal character. */
3174 if (token
->type
== OP_CLOSE_BRACKET
)
3175 token
->type
= CHARACTER
;
3179 bracket_elem_t start_elem
, end_elem
;
3180 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3181 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3184 bool is_range_exp
= false;
3187 start_elem
.opr
.name
= start_name_buf
;
3188 start_elem
.type
= COLL_SYM
;
3189 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3190 syntax
, first_round
);
3191 if (BE (ret
!= REG_NOERROR
, 0))
3194 goto parse_bracket_exp_free_return
;
3196 first_round
= false;
3198 /* Get information about the next token. We need it in any case. */
3199 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3201 /* Do not check for ranges if we know they are not allowed. */
3202 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3204 if (BE (token
->type
== END_OF_RE
, 0))
3207 goto parse_bracket_exp_free_return
;
3209 if (token
->type
== OP_CHARSET_RANGE
)
3211 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3212 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3213 if (BE (token2
.type
== END_OF_RE
, 0))
3216 goto parse_bracket_exp_free_return
;
3218 if (token2
.type
== OP_CLOSE_BRACKET
)
3220 /* We treat the last '-' as a normal character. */
3221 re_string_skip_bytes (regexp
, -token_len
);
3222 token
->type
= CHARACTER
;
3225 is_range_exp
= true;
3229 if (is_range_exp
== true)
3231 end_elem
.opr
.name
= end_name_buf
;
3232 end_elem
.type
= COLL_SYM
;
3233 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3235 if (BE (ret
!= REG_NOERROR
, 0))
3238 goto parse_bracket_exp_free_return
;
3241 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3244 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3245 &start_elem
, &end_elem
);
3247 # ifdef RE_ENABLE_I18N
3248 *err
= build_range_exp (syntax
, sbcset
,
3249 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3250 &range_alloc
, &start_elem
, &end_elem
);
3252 *err
= build_range_exp (syntax
, sbcset
, &start_elem
, &end_elem
);
3254 #endif /* RE_ENABLE_I18N */
3255 if (BE (*err
!= REG_NOERROR
, 0))
3256 goto parse_bracket_exp_free_return
;
3260 switch (start_elem
.type
)
3263 bitset_set (sbcset
, start_elem
.opr
.ch
);
3265 #ifdef RE_ENABLE_I18N
3267 /* Check whether the array has enough space. */
3268 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3270 wchar_t *new_mbchars
;
3271 /* Not enough, realloc it. */
3272 /* +1 in case of mbcset->nmbchars is 0. */
3273 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3274 /* Use realloc since array is NULL if *alloc == 0. */
3275 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3277 if (BE (new_mbchars
== NULL
, 0))
3278 goto parse_bracket_exp_espace
;
3279 mbcset
->mbchars
= new_mbchars
;
3281 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3283 #endif /* RE_ENABLE_I18N */
3285 *err
= build_equiv_class (sbcset
,
3286 #ifdef RE_ENABLE_I18N
3287 mbcset
, &equiv_class_alloc
,
3288 #endif /* RE_ENABLE_I18N */
3289 start_elem
.opr
.name
);
3290 if (BE (*err
!= REG_NOERROR
, 0))
3291 goto parse_bracket_exp_free_return
;
3294 *err
= build_collating_symbol (sbcset
,
3295 #ifdef RE_ENABLE_I18N
3296 mbcset
, &coll_sym_alloc
,
3297 #endif /* RE_ENABLE_I18N */
3298 start_elem
.opr
.name
);
3299 if (BE (*err
!= REG_NOERROR
, 0))
3300 goto parse_bracket_exp_free_return
;
3303 *err
= build_charclass (regexp
->trans
, sbcset
,
3304 #ifdef RE_ENABLE_I18N
3305 mbcset
, &char_class_alloc
,
3306 #endif /* RE_ENABLE_I18N */
3307 (const char *) start_elem
.opr
.name
,
3309 if (BE (*err
!= REG_NOERROR
, 0))
3310 goto parse_bracket_exp_free_return
;
3317 if (BE (token
->type
== END_OF_RE
, 0))
3320 goto parse_bracket_exp_free_return
;
3322 if (token
->type
== OP_CLOSE_BRACKET
)
3326 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3328 /* If it is non-matching list. */
3330 bitset_not (sbcset
);
3332 #ifdef RE_ENABLE_I18N
3333 /* Ensure only single byte characters are set. */
3334 if (dfa
->mb_cur_max
> 1)
3335 bitset_mask (sbcset
, dfa
->sb_char
);
3337 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3338 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3339 || mbcset
->non_match
)))
3341 bin_tree_t
*mbc_tree
;
3343 /* Build a tree for complex bracket. */
3344 dfa
->has_mb_node
= 1;
3345 br_token
.type
= COMPLEX_BRACKET
;
3346 br_token
.opr
.mbcset
= mbcset
;
3347 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3348 if (BE (mbc_tree
== NULL
, 0))
3349 goto parse_bracket_exp_espace
;
3350 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3351 if (sbcset
[sbc_idx
])
3353 /* If there are no bits set in sbcset, there is no point
3354 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3355 if (sbc_idx
< BITSET_WORDS
)
3357 /* Build a tree for simple bracket. */
3358 br_token
.type
= SIMPLE_BRACKET
;
3359 br_token
.opr
.sbcset
= sbcset
;
3360 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3361 if (BE (work_tree
== NULL
, 0))
3362 goto parse_bracket_exp_espace
;
3364 /* Then join them by ALT node. */
3365 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3366 if (BE (work_tree
== NULL
, 0))
3367 goto parse_bracket_exp_espace
;
3372 work_tree
= mbc_tree
;
3376 #endif /* not RE_ENABLE_I18N */
3378 #ifdef RE_ENABLE_I18N
3379 free_charset (mbcset
);
3381 /* Build a tree for simple bracket. */
3382 br_token
.type
= SIMPLE_BRACKET
;
3383 br_token
.opr
.sbcset
= sbcset
;
3384 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3385 if (BE (work_tree
== NULL
, 0))
3386 goto parse_bracket_exp_espace
;
3390 parse_bracket_exp_espace
:
3392 parse_bracket_exp_free_return
:
3394 #ifdef RE_ENABLE_I18N
3395 free_charset (mbcset
);
3396 #endif /* RE_ENABLE_I18N */
3400 /* Parse an element in the bracket expression. */
3402 static reg_errcode_t
3403 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3404 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3405 reg_syntax_t syntax
, bool accept_hyphen
)
3407 #ifdef RE_ENABLE_I18N
3409 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3410 if (cur_char_size
> 1)
3412 elem
->type
= MB_CHAR
;
3413 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3414 re_string_skip_bytes (regexp
, cur_char_size
);
3417 #endif /* RE_ENABLE_I18N */
3418 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3419 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3420 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3421 return parse_bracket_symbol (elem
, regexp
, token
);
3422 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3424 /* A '-' must only appear as anything but a range indicator before
3425 the closing bracket. Everything else is an error. */
3427 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3428 if (token2
.type
!= OP_CLOSE_BRACKET
)
3429 /* The actual error value is not standardized since this whole
3430 case is undefined. But ERANGE makes good sense. */
3433 elem
->type
= SB_CHAR
;
3434 elem
->opr
.ch
= token
->opr
.c
;
3438 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3439 such as [:<character_class>:], [.<collating_element>.], and
3440 [=<equivalent_class>=]. */
3442 static reg_errcode_t
3443 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3446 unsigned char ch
, delim
= token
->opr
.c
;
3448 if (re_string_eoi(regexp
))
3452 if (i
>= BRACKET_NAME_BUF_SIZE
)
3454 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3455 ch
= re_string_fetch_byte_case (regexp
);
3457 ch
= re_string_fetch_byte (regexp
);
3458 if (re_string_eoi(regexp
))
3460 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3462 elem
->opr
.name
[i
] = ch
;
3464 re_string_skip_bytes (regexp
, 1);
3465 elem
->opr
.name
[i
] = '\0';
3466 switch (token
->type
)
3468 case OP_OPEN_COLL_ELEM
:
3469 elem
->type
= COLL_SYM
;
3471 case OP_OPEN_EQUIV_CLASS
:
3472 elem
->type
= EQUIV_CLASS
;
3474 case OP_OPEN_CHAR_CLASS
:
3475 elem
->type
= CHAR_CLASS
;
3483 /* Helper function for parse_bracket_exp.
3484 Build the equivalence class which is represented by NAME.
3485 The result are written to MBCSET and SBCSET.
3486 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3487 is a pointer argument since we may update it. */
3489 static reg_errcode_t
3490 #ifdef RE_ENABLE_I18N
3491 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3492 Idx
*equiv_class_alloc
, const unsigned char *name
)
3493 #else /* not RE_ENABLE_I18N */
3494 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3495 #endif /* not RE_ENABLE_I18N */
3498 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3501 const int32_t *table
, *indirect
;
3502 const unsigned char *weights
, *extra
, *cp
;
3503 unsigned char char_buf
[2];
3507 /* Calculate the index for equivalence class. */
3509 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3510 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3511 _NL_COLLATE_WEIGHTMB
);
3512 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3513 _NL_COLLATE_EXTRAMB
);
3514 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3515 _NL_COLLATE_INDIRECTMB
);
3516 idx1
= findidx (table
, indirect
, extra
, &cp
, -1);
3517 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3518 /* This isn't a valid character. */
3519 return REG_ECOLLATE
;
3521 /* Build single byte matching table for this equivalence class. */
3522 len
= weights
[idx1
& 0xffffff];
3523 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3527 idx2
= findidx (table
, indirect
, extra
, &cp
, 1);
3532 /* This isn't a valid character. */
3534 /* Compare only if the length matches and the collation rule
3535 index is the same. */
3536 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24)
3537 && memcmp (weights
+ (idx1
& 0xffffff) + 1,
3538 weights
+ (idx2
& 0xffffff) + 1, len
) == 0)
3539 bitset_set (sbcset
, ch
);
3541 /* Check whether the array has enough space. */
3542 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3544 /* Not enough, realloc it. */
3545 /* +1 in case of mbcset->nequiv_classes is 0. */
3546 Idx new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3547 /* Use realloc since the array is NULL if *alloc == 0. */
3548 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3550 new_equiv_class_alloc
);
3551 if (BE (new_equiv_classes
== NULL
, 0))
3553 mbcset
->equiv_classes
= new_equiv_classes
;
3554 *equiv_class_alloc
= new_equiv_class_alloc
;
3556 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3561 if (BE (strlen ((const char *) name
) != 1, 0))
3562 return REG_ECOLLATE
;
3563 bitset_set (sbcset
, *name
);
3568 /* Helper function for parse_bracket_exp.
3569 Build the character class which is represented by NAME.
3570 The result are written to MBCSET and SBCSET.
3571 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3572 is a pointer argument since we may update it. */
3574 static reg_errcode_t
3575 #ifdef RE_ENABLE_I18N
3576 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3577 re_charset_t
*mbcset
, Idx
*char_class_alloc
,
3578 const char *class_name
, reg_syntax_t syntax
)
3579 #else /* not RE_ENABLE_I18N */
3580 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3581 const char *class_name
, reg_syntax_t syntax
)
3582 #endif /* not RE_ENABLE_I18N */
3585 const char *name
= class_name
;
3587 /* In case of REG_ICASE "upper" and "lower" match the both of
3588 upper and lower cases. */
3589 if ((syntax
& RE_ICASE
)
3590 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3593 #ifdef RE_ENABLE_I18N
3594 /* Check the space of the arrays. */
3595 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3597 /* Not enough, realloc it. */
3598 /* +1 in case of mbcset->nchar_classes is 0. */
3599 Idx new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3600 /* Use realloc since array is NULL if *alloc == 0. */
3601 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3602 new_char_class_alloc
);
3603 if (BE (new_char_classes
== NULL
, 0))
3605 mbcset
->char_classes
= new_char_classes
;
3606 *char_class_alloc
= new_char_class_alloc
;
3608 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3609 #endif /* RE_ENABLE_I18N */
3611 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3613 if (BE (trans != NULL, 0)) \
3615 for (i = 0; i < SBC_MAX; ++i) \
3616 if (ctype_func (i)) \
3617 bitset_set (sbcset, trans[i]); \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, i); \
3627 if (strcmp (name
, "alnum") == 0)
3628 BUILD_CHARCLASS_LOOP (isalnum
);
3629 else if (strcmp (name
, "cntrl") == 0)
3630 BUILD_CHARCLASS_LOOP (iscntrl
);
3631 else if (strcmp (name
, "lower") == 0)
3632 BUILD_CHARCLASS_LOOP (islower
);
3633 else if (strcmp (name
, "space") == 0)
3634 BUILD_CHARCLASS_LOOP (isspace
);
3635 else if (strcmp (name
, "alpha") == 0)
3636 BUILD_CHARCLASS_LOOP (isalpha
);
3637 else if (strcmp (name
, "digit") == 0)
3638 BUILD_CHARCLASS_LOOP (isdigit
);
3639 else if (strcmp (name
, "print") == 0)
3640 BUILD_CHARCLASS_LOOP (isprint
);
3641 else if (strcmp (name
, "upper") == 0)
3642 BUILD_CHARCLASS_LOOP (isupper
);
3643 else if (strcmp (name
, "blank") == 0)
3644 BUILD_CHARCLASS_LOOP (isblank
);
3645 else if (strcmp (name
, "graph") == 0)
3646 BUILD_CHARCLASS_LOOP (isgraph
);
3647 else if (strcmp (name
, "punct") == 0)
3648 BUILD_CHARCLASS_LOOP (ispunct
);
3649 else if (strcmp (name
, "xdigit") == 0)
3650 BUILD_CHARCLASS_LOOP (isxdigit
);
3658 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3659 const char *class_name
,
3660 const char *extra
, bool non_match
,
3663 re_bitset_ptr_t sbcset
;
3664 #ifdef RE_ENABLE_I18N
3665 re_charset_t
*mbcset
;
3667 #endif /* not RE_ENABLE_I18N */
3669 re_token_t br_token
;
3672 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3673 if (BE (sbcset
== NULL
, 0))
3678 #ifdef RE_ENABLE_I18N
3679 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3680 if (BE (mbcset
== NULL
, 0))
3686 mbcset
->non_match
= non_match
;
3687 #endif /* RE_ENABLE_I18N */
3689 /* We don't care the syntax in this case. */
3690 ret
= build_charclass (trans
, sbcset
,
3691 #ifdef RE_ENABLE_I18N
3693 #endif /* RE_ENABLE_I18N */
3696 if (BE (ret
!= REG_NOERROR
, 0))
3699 #ifdef RE_ENABLE_I18N
3700 free_charset (mbcset
);
3701 #endif /* RE_ENABLE_I18N */
3705 /* \w match '_' also. */
3706 for (; *extra
; extra
++)
3707 bitset_set (sbcset
, *extra
);
3709 /* If it is non-matching list. */
3711 bitset_not (sbcset
);
3713 #ifdef RE_ENABLE_I18N
3714 /* Ensure only single byte characters are set. */
3715 if (dfa
->mb_cur_max
> 1)
3716 bitset_mask (sbcset
, dfa
->sb_char
);
3719 /* Build a tree for simple bracket. */
3720 #if defined GCC_LINT || defined lint
3721 memset (&br_token
, 0, sizeof br_token
);
3723 br_token
.type
= SIMPLE_BRACKET
;
3724 br_token
.opr
.sbcset
= sbcset
;
3725 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3726 if (BE (tree
== NULL
, 0))
3727 goto build_word_op_espace
;
3729 #ifdef RE_ENABLE_I18N
3730 if (dfa
->mb_cur_max
> 1)
3732 bin_tree_t
*mbc_tree
;
3733 /* Build a tree for complex bracket. */
3734 br_token
.type
= COMPLEX_BRACKET
;
3735 br_token
.opr
.mbcset
= mbcset
;
3736 dfa
->has_mb_node
= 1;
3737 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3738 if (BE (mbc_tree
== NULL
, 0))
3739 goto build_word_op_espace
;
3740 /* Then join them by ALT node. */
3741 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3742 if (BE (mbc_tree
!= NULL
, 1))
3747 free_charset (mbcset
);
3750 #else /* not RE_ENABLE_I18N */
3752 #endif /* not RE_ENABLE_I18N */
3754 build_word_op_espace
:
3756 #ifdef RE_ENABLE_I18N
3757 free_charset (mbcset
);
3758 #endif /* RE_ENABLE_I18N */
3763 /* This is intended for the expressions like "a{1,3}".
3764 Fetch a number from 'input', and return the number.
3765 Return -1 if the number field is empty like "{,1}".
3766 Return RE_DUP_MAX + 1 if the number field is too large.
3767 Return -2 if an error occurred. */
3770 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3776 fetch_token (token
, input
, syntax
);
3778 if (BE (token
->type
== END_OF_RE
, 0))
3780 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3782 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3786 : MIN (RE_DUP_MAX
+ 1, num
* 10 + c
- '0'));
3791 #ifdef RE_ENABLE_I18N
3793 free_charset (re_charset_t
*cset
)
3795 re_free (cset
->mbchars
);
3797 re_free (cset
->coll_syms
);
3798 re_free (cset
->equiv_classes
);
3799 re_free (cset
->range_starts
);
3800 re_free (cset
->range_ends
);
3802 re_free (cset
->char_classes
);
3805 #endif /* RE_ENABLE_I18N */
3807 /* Functions for binary tree operation. */
3809 /* Create a tree node. */
3812 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3813 re_token_type_t type
)
3816 #if defined GCC_LINT || defined lint
3817 memset (&t
, 0, sizeof t
);
3820 return create_token_tree (dfa
, left
, right
, &t
);
3824 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3825 const re_token_t
*token
)
3828 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3830 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3832 if (storage
== NULL
)
3834 storage
->next
= dfa
->str_tree_storage
;
3835 dfa
->str_tree_storage
= storage
;
3836 dfa
->str_tree_storage_idx
= 0;
3838 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3840 tree
->parent
= NULL
;
3842 tree
->right
= right
;
3843 tree
->token
= *token
;
3844 tree
->token
.duplicated
= 0;
3845 tree
->token
.opt_subexp
= 0;
3848 tree
->node_idx
= -1;
3851 left
->parent
= tree
;
3853 right
->parent
= tree
;
3857 /* Mark the tree SRC as an optional subexpression.
3858 To be called from preorder or postorder. */
3860 static reg_errcode_t
3861 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3863 Idx idx
= (uintptr_t) extra
;
3864 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3865 node
->token
.opt_subexp
= 1;
3870 /* Free the allocated memory inside NODE. */
3873 free_token (re_token_t
*node
)
3875 #ifdef RE_ENABLE_I18N
3876 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3877 free_charset (node
->opr
.mbcset
);
3879 #endif /* RE_ENABLE_I18N */
3880 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3881 re_free (node
->opr
.sbcset
);
3884 /* Worker function for tree walking. Free the allocated memory inside NODE
3885 and its children. */
3887 static reg_errcode_t
3888 free_tree (void *extra
, bin_tree_t
*node
)
3890 free_token (&node
->token
);
3895 /* Duplicate the node SRC, and return new node. This is a preorder
3896 visit similar to the one implemented by the generic visitor, but
3897 we need more infrastructure to maintain two parallel trees --- so,
3898 it's easier to duplicate. */
3901 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3903 const bin_tree_t
*node
;
3904 bin_tree_t
*dup_root
;
3905 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3907 for (node
= root
; ; )
3909 /* Create a new tree and link it back to the current parent. */
3910 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3913 (*p_new
)->parent
= dup_node
;
3914 (*p_new
)->token
.duplicated
= 1;
3917 /* Go to the left node, or up and to the right. */
3921 p_new
= &dup_node
->left
;
3925 const bin_tree_t
*prev
= NULL
;
3926 while (node
->right
== prev
|| node
->right
== NULL
)
3929 node
= node
->parent
;
3930 dup_node
= dup_node
->parent
;
3935 p_new
= &dup_node
->right
;