]> git.proxmox.com Git - mirror_smartmontools-debian.git/blob - regex/regcomp.c
Imported Upstream version 6.1+svn3812
[mirror_smartmontools-debian.git] / regex / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 MA 02110-1301 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static reg_errcode_t analyze (re_dfa_t *dfa);
34 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
35 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
36 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
37 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
38 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
39 int top_clone_node, int root_node,
40 unsigned int constraint);
41 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
42 unsigned int constraint);
43 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
44 unsigned int constraint);
45 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
46 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
47 int node, int root);
48 static void calc_inveclosure (re_dfa_t *dfa);
49 static int fetch_number (re_string_t *input, re_token_t *token,
50 reg_syntax_t syntax);
51 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
52 static int peek_token (re_token_t *token, re_string_t *input,
53 reg_syntax_t syntax);
54 static int peek_token_bracket (re_token_t *token, re_string_t *input,
55 reg_syntax_t syntax);
56 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
57 reg_syntax_t syntax, reg_errcode_t *err);
58 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
59 re_token_t *token, reg_syntax_t syntax,
60 int nest, reg_errcode_t *err);
61 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
71 re_dfa_t *dfa, re_token_t *token,
72 reg_syntax_t syntax, reg_errcode_t *err);
73 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
74 re_token_t *token, reg_syntax_t syntax,
75 reg_errcode_t *err);
76 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
77 re_string_t *regexp,
78 re_token_t *token, int token_len,
79 re_dfa_t *dfa,
80 reg_syntax_t syntax);
81 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
82 re_string_t *regexp,
83 re_token_t *token);
84 #ifndef _LIBC
85 # ifdef RE_ENABLE_I18N
86 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
87 re_charset_t *mbcset, int *range_alloc,
88 bracket_elem_t *start_elem,
89 bracket_elem_t *end_elem);
90 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
91 re_charset_t *mbcset,
92 int *coll_sym_alloc,
93 const unsigned char *name);
94 # else /* not RE_ENABLE_I18N */
95 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
96 bracket_elem_t *start_elem,
97 bracket_elem_t *end_elem);
98 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
99 const unsigned char *name);
100 # endif /* not RE_ENABLE_I18N */
101 #endif /* not _LIBC */
102 #ifdef RE_ENABLE_I18N
103 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *equiv_class_alloc,
106 const unsigned char *name);
107 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
108 re_charset_t *mbcset,
109 int *char_class_alloc,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #else /* not RE_ENABLE_I18N */
113 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #endif /* not RE_ENABLE_I18N */
119 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
120 static void free_bin_tree (bin_tree_t *tree);
121 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
122 re_token_type_t type, int index);
123 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
124 \f
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
129
130 const char __re_error_msgid[] attribute_hidden =
131 {
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 };
183
184 const size_t __re_error_msgid_idx[] attribute_hidden =
185 {
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
203 };
204 \f
205 /* Entry points for GNU code. */
206
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
210
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
213
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
219 {
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub. */
225 bufp->no_sub = 0;
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
246
247
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255 reg_syntax_t
256 re_set_syntax (syntax)
257 reg_syntax_t syntax;
258 {
259 reg_syntax_t ret = re_syntax_options;
260
261 re_syntax_options = syntax;
262 return ret;
263 }
264 #ifdef _LIBC
265 weak_alias (__re_set_syntax, re_set_syntax)
266 #endif
267
268 int
269 re_compile_fastmap (bufp)
270 struct re_pattern_buffer *bufp;
271 {
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
274
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
285 }
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
289
290 static inline void
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
296 }
297
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
300
301 static void
302 re_compile_fastmap_iter (bufp, init_state, fastmap)
303 regex_t *bufp;
304 const re_dfastate_t *init_state;
305 char *fastmap;
306 {
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
308 int node_cnt;
309 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 {
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
314
315 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 else if (type == SIMPLE_BRACKET)
318 {
319 int i, j, ch;
320 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
321 for (j = 0; j < UINT_BITS; ++j, ++ch)
322 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
323 re_set_fastmap (fastmap, icase, ch);
324 }
325 #ifdef RE_ENABLE_I18N
326 else if (type == COMPLEX_BRACKET)
327 {
328 int i;
329 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
330 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
331 || cset->nranges || cset->nchar_classes)
332 {
333 # ifdef _LIBC
334 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
335 {
336 /* In this case we want to catch the bytes which are
337 the first byte of any collation elements.
338 e.g. In da_DK, we want to catch 'a' since "aa"
339 is a valid collation element, and don't catch
340 'b' since 'b' is the only collation element
341 which starts from 'b'. */
342 int j, ch;
343 const int32_t *table = (const int32_t *)
344 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
346 for (j = 0; j < UINT_BITS; ++j, ++ch)
347 if (table[ch] < 0)
348 re_set_fastmap (fastmap, icase, ch);
349 }
350 # else
351 if (MB_CUR_MAX > 1)
352 for (i = 0; i < SBC_MAX; ++i)
353 if (__btowc (i) == WEOF)
354 re_set_fastmap (fastmap, icase, i);
355 # endif /* not _LIBC */
356 }
357 for (i = 0; i < cset->nmbchars; ++i)
358 {
359 char buf[256];
360 mbstate_t state;
361 memset (&state, '\0', sizeof (state));
362 __wcrtomb (buf, cset->mbchars[i], &state);
363 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
364 }
365 }
366 #endif /* RE_ENABLE_I18N */
367 else if (type == END_OF_RE || type == OP_PERIOD)
368 {
369 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
370 if (type == END_OF_RE)
371 bufp->can_be_null = 1;
372 return;
373 }
374 }
375 }
376 \f
377 /* Entry point for POSIX code. */
378 /* regcomp takes a regular expression as a string and compiles it.
379
380 PREG is a regex_t *. We do not expect any fields to be initialized,
381 since POSIX says we shouldn't. Thus, we set
382
383 `buffer' to the compiled pattern;
384 `used' to the length of the compiled pattern;
385 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
386 REG_EXTENDED bit in CFLAGS is set; otherwise, to
387 RE_SYNTAX_POSIX_BASIC;
388 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
389 `fastmap' to an allocated space for the fastmap;
390 `fastmap_accurate' to zero;
391 `re_nsub' to the number of subexpressions in PATTERN.
392
393 PATTERN is the address of the pattern string.
394
395 CFLAGS is a series of bits which affect compilation.
396
397 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
398 use POSIX basic syntax.
399
400 If REG_NEWLINE is set, then . and [^...] don't match newline.
401 Also, regexec will try a match beginning after every newline.
402
403 If REG_ICASE is set, then we considers upper- and lowercase
404 versions of letters to be equivalent when matching.
405
406 If REG_NOSUB is set, then when PREG is passed to regexec, that
407 routine will report only success or failure, and nothing about the
408 registers.
409
410 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
411 the return codes and their meanings.) */
412
413 int
414 regcomp (preg, pattern, cflags)
415 regex_t *__restrict preg;
416 const char *__restrict pattern;
417 int cflags;
418 {
419 reg_errcode_t ret;
420 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
421 : RE_SYNTAX_POSIX_BASIC);
422
423 preg->buffer = NULL;
424 preg->allocated = 0;
425 preg->used = 0;
426
427 /* Try to allocate space for the fastmap. */
428 preg->fastmap = re_malloc (char, SBC_MAX);
429 if (BE (preg->fastmap == NULL, 0))
430 return REG_ESPACE;
431
432 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
433
434 /* If REG_NEWLINE is set, newlines are treated differently. */
435 if (cflags & REG_NEWLINE)
436 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
437 syntax &= ~RE_DOT_NEWLINE;
438 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
439 /* It also changes the matching behavior. */
440 preg->newline_anchor = 1;
441 }
442 else
443 preg->newline_anchor = 0;
444 preg->no_sub = !!(cflags & REG_NOSUB);
445 preg->translate = NULL;
446
447 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
448
449 /* POSIX doesn't distinguish between an unmatched open-group and an
450 unmatched close-group: both are REG_EPAREN. */
451 if (ret == REG_ERPAREN)
452 ret = REG_EPAREN;
453
454 /* We have already checked preg->fastmap != NULL. */
455 if (BE (ret == REG_NOERROR, 1))
456 /* Compute the fastmap now, since regexec cannot modify the pattern
457 buffer. This function nevers fails in this implementation. */
458 (void) re_compile_fastmap (preg);
459 else
460 {
461 /* Some error occurred while compiling the expression. */
462 re_free (preg->fastmap);
463 preg->fastmap = NULL;
464 }
465
466 return (int) ret;
467 }
468 #ifdef _LIBC
469 weak_alias (__regcomp, regcomp)
470 #endif
471
472 /* Returns a message corresponding to an error code, ERRCODE, returned
473 from either regcomp or regexec. We don't use PREG here. */
474
475 size_t
476 regerror (errcode, preg, errbuf, errbuf_size)
477 int errcode;
478 const regex_t *preg;
479 char *errbuf;
480 size_t errbuf_size;
481 {
482 const char *msg;
483 size_t msg_size;
484
485 if (BE (errcode < 0
486 || errcode >= (int) (sizeof (__re_error_msgid_idx)
487 / sizeof (__re_error_msgid_idx[0])), 0))
488 /* Only error codes returned by the rest of the code should be passed
489 to this routine. If we are given anything else, or if other regex
490 code generates an invalid error code, then the program has a bug.
491 Dump core so we can fix it. */
492 abort ();
493
494 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
495
496 msg_size = strlen (msg) + 1; /* Includes the null. */
497
498 if (BE (errbuf_size != 0, 1))
499 {
500 if (BE (msg_size > errbuf_size, 0))
501 {
502 #if defined HAVE_MEMPCPY || defined _LIBC
503 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
504 #else
505 memcpy (errbuf, msg, errbuf_size - 1);
506 errbuf[errbuf_size - 1] = 0;
507 #endif
508 }
509 else
510 memcpy (errbuf, msg, msg_size);
511 }
512
513 return msg_size;
514 }
515 #ifdef _LIBC
516 weak_alias (__regerror, regerror)
517 #endif
518
519
520 static void
521 free_dfa_content (re_dfa_t *dfa)
522 {
523 int i, j;
524
525 re_free (dfa->subexps);
526
527 for (i = 0; i < dfa->nodes_len; ++i)
528 {
529 re_token_t *node = dfa->nodes + i;
530 #ifdef RE_ENABLE_I18N
531 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
532 free_charset (node->opr.mbcset);
533 else
534 #endif /* RE_ENABLE_I18N */
535 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
536 re_free (node->opr.sbcset);
537 }
538 re_free (dfa->nexts);
539 for (i = 0; i < dfa->nodes_len; ++i)
540 {
541 if (dfa->eclosures != NULL)
542 re_node_set_free (dfa->eclosures + i);
543 if (dfa->inveclosures != NULL)
544 re_node_set_free (dfa->inveclosures + i);
545 if (dfa->edests != NULL)
546 re_node_set_free (dfa->edests + i);
547 }
548 re_free (dfa->edests);
549 re_free (dfa->eclosures);
550 re_free (dfa->inveclosures);
551 re_free (dfa->nodes);
552
553 for (i = 0; i <= dfa->state_hash_mask; ++i)
554 {
555 struct re_state_table_entry *entry = dfa->state_table + i;
556 for (j = 0; j < entry->num; ++j)
557 {
558 re_dfastate_t *state = entry->array[j];
559 free_state (state);
560 }
561 re_free (entry->array);
562 }
563 re_free (dfa->state_table);
564
565 if (dfa->word_char != NULL)
566 re_free (dfa->word_char);
567 #ifdef DEBUG
568 re_free (dfa->re_str);
569 #endif
570
571 re_free (dfa);
572 }
573
574
575 /* Free dynamically allocated space used by PREG. */
576
577 void
578 regfree (preg)
579 regex_t *preg;
580 {
581 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
582 if (BE (dfa != NULL, 1))
583 free_dfa_content (dfa);
584
585 re_free (preg->fastmap);
586 }
587 #ifdef _LIBC
588 weak_alias (__regfree, regfree)
589 #endif
590 \f
591 /* Entry points compatible with 4.2 BSD regex library. We don't define
592 them unless specifically requested. */
593
594 #if defined _REGEX_RE_COMP || defined _LIBC
595
596 /* BSD has one and only one pattern buffer. */
597 static struct re_pattern_buffer re_comp_buf;
598
599 char *
600 # ifdef _LIBC
601 /* Make these definitions weak in libc, so POSIX programs can redefine
602 these names if they don't use our functions, and still use
603 regcomp/regexec above without link errors. */
604 weak_function
605 # endif
606 re_comp (s)
607 const char *s;
608 {
609 reg_errcode_t ret;
610 char *fastmap;
611
612 if (!s)
613 {
614 if (!re_comp_buf.buffer)
615 return gettext ("No previous regular expression");
616 return 0;
617 }
618
619 if (re_comp_buf.buffer)
620 {
621 fastmap = re_comp_buf.fastmap;
622 re_comp_buf.fastmap = NULL;
623 __regfree (&re_comp_buf);
624 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
625 re_comp_buf.fastmap = fastmap;
626 }
627
628 if (re_comp_buf.fastmap == NULL)
629 {
630 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
631 if (re_comp_buf.fastmap == NULL)
632 return (char *) gettext (__re_error_msgid
633 + __re_error_msgid_idx[(int) REG_ESPACE]);
634 }
635
636 /* Since `re_exec' always passes NULL for the `regs' argument, we
637 don't need to initialize the pattern buffer fields which affect it. */
638
639 /* Match anchors at newlines. */
640 re_comp_buf.newline_anchor = 1;
641
642 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
643
644 if (!ret)
645 return NULL;
646
647 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
648 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
649 }
650
651 #ifdef _LIBC
652 libc_freeres_fn (free_mem)
653 {
654 __regfree (&re_comp_buf);
655 }
656 #endif
657
658 #endif /* _REGEX_RE_COMP */
659 \f
660 /* Internal entry point.
661 Compile the regular expression PATTERN, whose length is LENGTH.
662 SYNTAX indicate regular expression's syntax. */
663
664 static reg_errcode_t
665 re_compile_internal (preg, pattern, length, syntax)
666 regex_t *preg;
667 const char * pattern;
668 int length;
669 reg_syntax_t syntax;
670 {
671 reg_errcode_t err = REG_NOERROR;
672 re_dfa_t *dfa;
673 re_string_t regexp;
674
675 /* Initialize the pattern buffer. */
676 preg->fastmap_accurate = 0;
677 preg->syntax = syntax;
678 preg->not_bol = preg->not_eol = 0;
679 preg->used = 0;
680 preg->re_nsub = 0;
681 preg->can_be_null = 0;
682 preg->regs_allocated = REGS_UNALLOCATED;
683
684 /* Initialize the dfa. */
685 dfa = (re_dfa_t *) preg->buffer;
686 if (preg->allocated < sizeof (re_dfa_t))
687 {
688 /* If zero allocated, but buffer is non-null, try to realloc
689 enough space. This loses if buffer's address is bogus, but
690 that is the user's responsibility. If ->buffer is NULL this
691 is a simple allocation. */
692 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
693 if (dfa == NULL)
694 return REG_ESPACE;
695 preg->allocated = sizeof (re_dfa_t);
696 }
697 preg->buffer = (unsigned char *) dfa;
698 preg->used = sizeof (re_dfa_t);
699
700 err = init_dfa (dfa, length);
701 if (BE (err != REG_NOERROR, 0))
702 {
703 re_free (dfa);
704 preg->buffer = NULL;
705 preg->allocated = 0;
706 return err;
707 }
708 #ifdef DEBUG
709 dfa->re_str = re_malloc (char, length + 1);
710 strncpy (dfa->re_str, pattern, length + 1);
711 #endif
712
713 err = re_string_construct (&regexp, pattern, length, preg->translate,
714 syntax & RE_ICASE);
715 if (BE (err != REG_NOERROR, 0))
716 {
717 re_free (dfa);
718 preg->buffer = NULL;
719 preg->allocated = 0;
720 return err;
721 }
722
723 /* Parse the regular expression, and build a structure tree. */
724 preg->re_nsub = 0;
725 dfa->str_tree = parse (&regexp, preg, syntax, &err);
726 if (BE (dfa->str_tree == NULL, 0))
727 goto re_compile_internal_free_return;
728
729 /* Analyze the tree and collect information which is necessary to
730 create the dfa. */
731 err = analyze (dfa);
732 if (BE (err != REG_NOERROR, 0))
733 goto re_compile_internal_free_return;
734
735 /* Then create the initial state of the dfa. */
736 err = create_initial_state (dfa);
737
738 /* Release work areas. */
739 free_workarea_compile (preg);
740 re_string_destruct (&regexp);
741
742 if (BE (err != REG_NOERROR, 0))
743 {
744 re_compile_internal_free_return:
745 free_dfa_content (dfa);
746 preg->buffer = NULL;
747 preg->allocated = 0;
748 }
749
750 return err;
751 }
752
753 /* Initialize DFA. We use the length of the regular expression PAT_LEN
754 as the initial length of some arrays. */
755
756 static reg_errcode_t
757 init_dfa (dfa, pat_len)
758 re_dfa_t *dfa;
759 int pat_len;
760 {
761 int table_size;
762
763 memset (dfa, '\0', sizeof (re_dfa_t));
764
765 dfa->nodes_alloc = pat_len + 1;
766 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
767
768 dfa->states_alloc = pat_len + 1;
769
770 /* table_size = 2 ^ ceil(log pat_len) */
771 for (table_size = 1; table_size > 0; table_size <<= 1)
772 if (table_size > pat_len)
773 break;
774
775 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
776 dfa->state_hash_mask = table_size - 1;
777
778 dfa->subexps_alloc = 1;
779 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
780 dfa->word_char = NULL;
781
782 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
783 || dfa->subexps == NULL, 0))
784 {
785 /* We don't bother to free anything which was allocated. Very
786 soon the process will go down anyway. */
787 dfa->subexps = NULL;
788 dfa->state_table = NULL;
789 dfa->nodes = NULL;
790 return REG_ESPACE;
791 }
792 return REG_NOERROR;
793 }
794
795 /* Initialize WORD_CHAR table, which indicate which character is
796 "word". In this case "word" means that it is the word construction
797 character used by some operators like "\<", "\>", etc. */
798
799 static reg_errcode_t
800 init_word_char (dfa)
801 re_dfa_t *dfa;
802 {
803 int i, j, ch;
804 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
805 if (BE (dfa->word_char == NULL, 0))
806 return REG_ESPACE;
807 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
808 for (j = 0; j < UINT_BITS; ++j, ++ch)
809 if (isalnum (ch) || ch == '_')
810 dfa->word_char[i] |= 1 << j;
811 return REG_NOERROR;
812 }
813
814 /* Free the work area which are only used while compiling. */
815
816 static void
817 free_workarea_compile (preg)
818 regex_t *preg;
819 {
820 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
821 free_bin_tree (dfa->str_tree);
822 dfa->str_tree = NULL;
823 re_free (dfa->org_indices);
824 dfa->org_indices = NULL;
825 }
826
827 /* Create initial states for all contexts. */
828
829 static reg_errcode_t
830 create_initial_state (dfa)
831 re_dfa_t *dfa;
832 {
833 int first, i;
834 reg_errcode_t err;
835 re_node_set init_nodes;
836
837 /* Initial states have the epsilon closure of the node which is
838 the first node of the regular expression. */
839 first = dfa->str_tree->first;
840 dfa->init_node = first;
841 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
842 if (BE (err != REG_NOERROR, 0))
843 return err;
844
845 /* The back-references which are in initial states can epsilon transit,
846 since in this case all of the subexpressions can be null.
847 Then we add epsilon closures of the nodes which are the next nodes of
848 the back-references. */
849 if (dfa->nbackref > 0)
850 for (i = 0; i < init_nodes.nelem; ++i)
851 {
852 int node_idx = init_nodes.elems[i];
853 re_token_type_t type = dfa->nodes[node_idx].type;
854
855 int clexp_idx;
856 if (type != OP_BACK_REF)
857 continue;
858 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
859 {
860 re_token_t *clexp_node;
861 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
862 if (clexp_node->type == OP_CLOSE_SUBEXP
863 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
864 break;
865 }
866 if (clexp_idx == init_nodes.nelem)
867 continue;
868
869 if (type == OP_BACK_REF)
870 {
871 int dest_idx = dfa->edests[node_idx].elems[0];
872 if (!re_node_set_contains (&init_nodes, dest_idx))
873 {
874 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
875 i = 0;
876 }
877 }
878 }
879
880 /* It must be the first time to invoke acquire_state. */
881 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
882 /* We don't check ERR here, since the initial state must not be NULL. */
883 if (BE (dfa->init_state == NULL, 0))
884 return err;
885 if (dfa->init_state->has_constraint)
886 {
887 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
888 CONTEXT_WORD);
889 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
890 CONTEXT_NEWLINE);
891 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
892 &init_nodes,
893 CONTEXT_NEWLINE
894 | CONTEXT_BEGBUF);
895 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
896 || dfa->init_state_begbuf == NULL, 0))
897 return err;
898 }
899 else
900 dfa->init_state_word = dfa->init_state_nl
901 = dfa->init_state_begbuf = dfa->init_state;
902
903 re_node_set_free (&init_nodes);
904 return REG_NOERROR;
905 }
906 \f
907 /* Analyze the structure tree, and calculate "first", "next", "edest",
908 "eclosure", and "inveclosure". */
909
910 static reg_errcode_t
911 analyze (dfa)
912 re_dfa_t *dfa;
913 {
914 int i;
915 reg_errcode_t ret;
916
917 /* Allocate arrays. */
918 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
919 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
920 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
921 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
922 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
923 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
924 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
925 return REG_ESPACE;
926 /* Initialize them. */
927 for (i = 0; i < dfa->nodes_len; ++i)
928 {
929 dfa->nexts[i] = -1;
930 re_node_set_init_empty (dfa->edests + i);
931 re_node_set_init_empty (dfa->eclosures + i);
932 re_node_set_init_empty (dfa->inveclosures + i);
933 }
934
935 ret = analyze_tree (dfa, dfa->str_tree);
936 if (BE (ret == REG_NOERROR, 1))
937 {
938 ret = calc_eclosure (dfa);
939 if (ret == REG_NOERROR)
940 calc_inveclosure (dfa);
941 }
942 return ret;
943 }
944
945 /* Helper functions for analyze.
946 This function calculate "first", "next", and "edest" for the subtree
947 whose root is NODE. */
948
949 static reg_errcode_t
950 analyze_tree (dfa, node)
951 re_dfa_t *dfa;
952 bin_tree_t *node;
953 {
954 reg_errcode_t ret;
955 if (node->first == -1)
956 calc_first (dfa, node);
957 if (node->next == -1)
958 calc_next (dfa, node);
959 if (node->eclosure.nelem == 0)
960 calc_epsdest (dfa, node);
961 /* Calculate "first" etc. for the left child. */
962 if (node->left != NULL)
963 {
964 ret = analyze_tree (dfa, node->left);
965 if (BE (ret != REG_NOERROR, 0))
966 return ret;
967 }
968 /* Calculate "first" etc. for the right child. */
969 if (node->right != NULL)
970 {
971 ret = analyze_tree (dfa, node->right);
972 if (BE (ret != REG_NOERROR, 0))
973 return ret;
974 }
975 return REG_NOERROR;
976 }
977
978 /* Calculate "first" for the node NODE. */
979 static void
980 calc_first (dfa, node)
981 re_dfa_t *dfa;
982 bin_tree_t *node;
983 {
984 int idx, type;
985 idx = node->node_idx;
986 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
987
988 switch (type)
989 {
990 #ifdef DEBUG
991 case OP_OPEN_BRACKET:
992 case OP_CLOSE_BRACKET:
993 case OP_OPEN_DUP_NUM:
994 case OP_CLOSE_DUP_NUM:
995 case OP_NON_MATCH_LIST:
996 case OP_OPEN_COLL_ELEM:
997 case OP_CLOSE_COLL_ELEM:
998 case OP_OPEN_EQUIV_CLASS:
999 case OP_CLOSE_EQUIV_CLASS:
1000 case OP_OPEN_CHAR_CLASS:
1001 case OP_CLOSE_CHAR_CLASS:
1002 /* These must not be appeared here. */
1003 assert (0);
1004 #endif
1005 case END_OF_RE:
1006 case CHARACTER:
1007 case OP_PERIOD:
1008 case OP_DUP_ASTERISK:
1009 case OP_DUP_QUESTION:
1010 #ifdef RE_ENABLE_I18N
1011 case COMPLEX_BRACKET:
1012 #endif /* RE_ENABLE_I18N */
1013 case SIMPLE_BRACKET:
1014 case OP_BACK_REF:
1015 case ANCHOR:
1016 case OP_OPEN_SUBEXP:
1017 case OP_CLOSE_SUBEXP:
1018 node->first = idx;
1019 break;
1020 case OP_DUP_PLUS:
1021 #ifdef DEBUG
1022 assert (node->left != NULL);
1023 #endif
1024 if (node->left->first == -1)
1025 calc_first (dfa, node->left);
1026 node->first = node->left->first;
1027 break;
1028 case OP_ALT:
1029 node->first = idx;
1030 break;
1031 /* else fall through */
1032 default:
1033 #ifdef DEBUG
1034 assert (node->left != NULL);
1035 #endif
1036 if (node->left->first == -1)
1037 calc_first (dfa, node->left);
1038 node->first = node->left->first;
1039 break;
1040 }
1041 }
1042
1043 /* Calculate "next" for the node NODE. */
1044
1045 static void
1046 calc_next (dfa, node)
1047 re_dfa_t *dfa;
1048 bin_tree_t *node;
1049 {
1050 int idx, type;
1051 bin_tree_t *parent = node->parent;
1052 if (parent == NULL)
1053 {
1054 node->next = -1;
1055 idx = node->node_idx;
1056 if (node->type == 0)
1057 dfa->nexts[idx] = node->next;
1058 return;
1059 }
1060
1061 idx = parent->node_idx;
1062 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1063
1064 switch (type)
1065 {
1066 case OP_DUP_ASTERISK:
1067 case OP_DUP_PLUS:
1068 node->next = idx;
1069 break;
1070 case CONCAT:
1071 if (parent->left == node)
1072 {
1073 if (parent->right->first == -1)
1074 calc_first (dfa, parent->right);
1075 node->next = parent->right->first;
1076 break;
1077 }
1078 /* else fall through */
1079 default:
1080 if (parent->next == -1)
1081 calc_next (dfa, parent);
1082 node->next = parent->next;
1083 break;
1084 }
1085 idx = node->node_idx;
1086 if (node->type == 0)
1087 dfa->nexts[idx] = node->next;
1088 }
1089
1090 /* Calculate "edest" for the node NODE. */
1091
1092 static void
1093 calc_epsdest (dfa, node)
1094 re_dfa_t *dfa;
1095 bin_tree_t *node;
1096 {
1097 int idx;
1098 idx = node->node_idx;
1099 if (node->type == 0)
1100 {
1101 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1102 || dfa->nodes[idx].type == OP_DUP_PLUS
1103 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1104 {
1105 if (node->left->first == -1)
1106 calc_first (dfa, node->left);
1107 if (node->next == -1)
1108 calc_next (dfa, node);
1109 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1110 node->next);
1111 }
1112 else if (dfa->nodes[idx].type == OP_ALT)
1113 {
1114 int left, right;
1115 if (node->left != NULL)
1116 {
1117 if (node->left->first == -1)
1118 calc_first (dfa, node->left);
1119 left = node->left->first;
1120 }
1121 else
1122 {
1123 if (node->next == -1)
1124 calc_next (dfa, node);
1125 left = node->next;
1126 }
1127 if (node->right != NULL)
1128 {
1129 if (node->right->first == -1)
1130 calc_first (dfa, node->right);
1131 right = node->right->first;
1132 }
1133 else
1134 {
1135 if (node->next == -1)
1136 calc_next (dfa, node);
1137 right = node->next;
1138 }
1139 re_node_set_init_2 (dfa->edests + idx, left, right);
1140 }
1141 else if (dfa->nodes[idx].type == ANCHOR
1142 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1143 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1144 || dfa->nodes[idx].type == OP_BACK_REF)
1145 re_node_set_init_1 (dfa->edests + idx, node->next);
1146 }
1147 }
1148
1149 /* Duplicate the epsilon closure of the node ROOT_NODE.
1150 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1151 to their own constraint. */
1152
1153 static reg_errcode_t
1154 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1155 init_constraint)
1156 re_dfa_t *dfa;
1157 int top_org_node, top_clone_node, root_node;
1158 unsigned int init_constraint;
1159 {
1160 reg_errcode_t err;
1161 int org_node, clone_node, ret;
1162 unsigned int constraint = init_constraint;
1163 for (org_node = top_org_node, clone_node = top_clone_node;;)
1164 {
1165 int org_dest, clone_dest;
1166 if (dfa->nodes[org_node].type == OP_BACK_REF)
1167 {
1168 /* If the back reference epsilon-transit, its destination must
1169 also have the constraint. Then duplicate the epsilon closure
1170 of the destination of the back reference, and store it in
1171 edests of the back reference. */
1172 org_dest = dfa->nexts[org_node];
1173 re_node_set_empty (dfa->edests + clone_node);
1174 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1175 if (BE (err != REG_NOERROR, 0))
1176 return err;
1177 dfa->nexts[clone_node] = dfa->nexts[org_node];
1178 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1179 if (BE (ret < 0, 0))
1180 return REG_ESPACE;
1181 }
1182 else if (dfa->edests[org_node].nelem == 0)
1183 {
1184 /* In case of the node can't epsilon-transit, don't duplicate the
1185 destination and store the original destination as the
1186 destination of the node. */
1187 dfa->nexts[clone_node] = dfa->nexts[org_node];
1188 break;
1189 }
1190 else if (dfa->edests[org_node].nelem == 1)
1191 {
1192 /* In case of the node can epsilon-transit, and it has only one
1193 destination. */
1194 org_dest = dfa->edests[org_node].elems[0];
1195 re_node_set_empty (dfa->edests + clone_node);
1196 if (dfa->nodes[org_node].type == ANCHOR)
1197 {
1198 /* In case of the node has another constraint, append it. */
1199 if (org_node == root_node && clone_node != org_node)
1200 {
1201 /* ...but if the node is root_node itself, it means the
1202 epsilon closure have a loop, then tie it to the
1203 destination of the root_node. */
1204 ret = re_node_set_insert (dfa->edests + clone_node,
1205 org_dest);
1206 if (BE (ret < 0, 0))
1207 return REG_ESPACE;
1208 break;
1209 }
1210 constraint |= dfa->nodes[org_node].opr.ctx_type;
1211 }
1212 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1213 if (BE (err != REG_NOERROR, 0))
1214 return err;
1215 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216 if (BE (ret < 0, 0))
1217 return REG_ESPACE;
1218 }
1219 else /* dfa->edests[org_node].nelem == 2 */
1220 {
1221 /* In case of the node can epsilon-transit, and it has two
1222 destinations. E.g. '|', '*', '+', '?'. */
1223 org_dest = dfa->edests[org_node].elems[0];
1224 re_node_set_empty (dfa->edests + clone_node);
1225 /* Search for a duplicated node which satisfies the constraint. */
1226 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1227 if (clone_dest == -1)
1228 {
1229 /* There are no such a duplicated node, create a new one. */
1230 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1231 if (BE (err != REG_NOERROR, 0))
1232 return err;
1233 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1234 if (BE (ret < 0, 0))
1235 return REG_ESPACE;
1236 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1237 root_node, constraint);
1238 if (BE (err != REG_NOERROR, 0))
1239 return err;
1240 }
1241 else
1242 {
1243 /* There are a duplicated node which satisfy the constraint,
1244 use it to avoid infinite loop. */
1245 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1246 if (BE (ret < 0, 0))
1247 return REG_ESPACE;
1248 }
1249
1250 org_dest = dfa->edests[org_node].elems[1];
1251 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252 if (BE (err != REG_NOERROR, 0))
1253 return err;
1254 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255 if (BE (ret < 0, 0))
1256 return REG_ESPACE;
1257 }
1258 org_node = org_dest;
1259 clone_node = clone_dest;
1260 }
1261 return REG_NOERROR;
1262 }
1263
1264 /* Search for a node which is duplicated from the node ORG_NODE, and
1265 satisfies the constraint CONSTRAINT. */
1266
1267 static int
1268 search_duplicated_node (dfa, org_node, constraint)
1269 re_dfa_t *dfa;
1270 int org_node;
1271 unsigned int constraint;
1272 {
1273 int idx;
1274 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1275 {
1276 if (org_node == dfa->org_indices[idx]
1277 && constraint == dfa->nodes[idx].constraint)
1278 return idx; /* Found. */
1279 }
1280 return -1; /* Not found. */
1281 }
1282
1283 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1284 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1285 otherwise return the error code. */
1286
1287 static reg_errcode_t
1288 duplicate_node (new_idx, dfa, org_idx, constraint)
1289 re_dfa_t *dfa;
1290 int *new_idx, org_idx;
1291 unsigned int constraint;
1292 {
1293 re_token_t dup;
1294 int dup_idx;
1295
1296 dup = dfa->nodes[org_idx];
1297 dup_idx = re_dfa_add_node (dfa, dup, 1);
1298 if (BE (dup_idx == -1, 0))
1299 return REG_ESPACE;
1300 dfa->nodes[dup_idx].constraint = constraint;
1301 if (dfa->nodes[org_idx].type == ANCHOR)
1302 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1303 dfa->nodes[dup_idx].duplicated = 1;
1304 re_node_set_init_empty (dfa->edests + dup_idx);
1305 re_node_set_init_empty (dfa->eclosures + dup_idx);
1306 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1307
1308 /* Store the index of the original node. */
1309 dfa->org_indices[dup_idx] = org_idx;
1310 *new_idx = dup_idx;
1311 return REG_NOERROR;
1312 }
1313
1314 static void
1315 calc_inveclosure (dfa)
1316 re_dfa_t *dfa;
1317 {
1318 int src, idx, dest;
1319 for (src = 0; src < dfa->nodes_len; ++src)
1320 {
1321 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1322 {
1323 dest = dfa->eclosures[src].elems[idx];
1324 re_node_set_insert (dfa->inveclosures + dest, src);
1325 }
1326 }
1327 }
1328
1329 /* Calculate "eclosure" for all the node in DFA. */
1330
1331 static reg_errcode_t
1332 calc_eclosure (dfa)
1333 re_dfa_t *dfa;
1334 {
1335 int node_idx, incomplete;
1336 #ifdef DEBUG
1337 assert (dfa->nodes_len > 0);
1338 #endif
1339 incomplete = 0;
1340 /* For each nodes, calculate epsilon closure. */
1341 for (node_idx = 0; ; ++node_idx)
1342 {
1343 reg_errcode_t err;
1344 re_node_set eclosure_elem;
1345 if (node_idx == dfa->nodes_len)
1346 {
1347 if (!incomplete)
1348 break;
1349 incomplete = 0;
1350 node_idx = 0;
1351 }
1352
1353 #ifdef DEBUG
1354 assert (dfa->eclosures[node_idx].nelem != -1);
1355 #endif
1356 /* If we have already calculated, skip it. */
1357 if (dfa->eclosures[node_idx].nelem != 0)
1358 continue;
1359 /* Calculate epsilon closure of `node_idx'. */
1360 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1361 if (BE (err != REG_NOERROR, 0))
1362 return err;
1363
1364 if (dfa->eclosures[node_idx].nelem == 0)
1365 {
1366 incomplete = 1;
1367 re_node_set_free (&eclosure_elem);
1368 }
1369 }
1370 return REG_NOERROR;
1371 }
1372
1373 /* Calculate epsilon closure of NODE. */
1374
1375 static reg_errcode_t
1376 calc_eclosure_iter (new_set, dfa, node, root)
1377 re_node_set *new_set;
1378 re_dfa_t *dfa;
1379 int node, root;
1380 {
1381 reg_errcode_t err;
1382 unsigned int constraint;
1383 int i, incomplete;
1384 re_node_set eclosure;
1385 incomplete = 0;
1386 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1387 if (BE (err != REG_NOERROR, 0))
1388 return err;
1389
1390 /* This indicates that we are calculating this node now.
1391 We reference this value to avoid infinite loop. */
1392 dfa->eclosures[node].nelem = -1;
1393
1394 constraint = ((dfa->nodes[node].type == ANCHOR)
1395 ? dfa->nodes[node].opr.ctx_type : 0);
1396 /* If the current node has constraints, duplicate all nodes.
1397 Since they must inherit the constraints. */
1398 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1399 {
1400 int org_node, cur_node;
1401 org_node = cur_node = node;
1402 err = duplicate_node_closure (dfa, node, node, node, constraint);
1403 if (BE (err != REG_NOERROR, 0))
1404 return err;
1405 }
1406
1407 /* Expand each epsilon destination nodes. */
1408 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1409 for (i = 0; i < dfa->edests[node].nelem; ++i)
1410 {
1411 re_node_set eclosure_elem;
1412 int edest = dfa->edests[node].elems[i];
1413 /* If calculating the epsilon closure of `edest' is in progress,
1414 return intermediate result. */
1415 if (dfa->eclosures[edest].nelem == -1)
1416 {
1417 incomplete = 1;
1418 continue;
1419 }
1420 /* If we haven't calculated the epsilon closure of `edest' yet,
1421 calculate now. Otherwise use calculated epsilon closure. */
1422 if (dfa->eclosures[edest].nelem == 0)
1423 {
1424 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1425 if (BE (err != REG_NOERROR, 0))
1426 return err;
1427 }
1428 else
1429 eclosure_elem = dfa->eclosures[edest];
1430 /* Merge the epsilon closure of `edest'. */
1431 re_node_set_merge (&eclosure, &eclosure_elem);
1432 /* If the epsilon closure of `edest' is incomplete,
1433 the epsilon closure of this node is also incomplete. */
1434 if (dfa->eclosures[edest].nelem == 0)
1435 {
1436 incomplete = 1;
1437 re_node_set_free (&eclosure_elem);
1438 }
1439 }
1440
1441 /* Epsilon closures include itself. */
1442 re_node_set_insert (&eclosure, node);
1443 if (incomplete && !root)
1444 dfa->eclosures[node].nelem = 0;
1445 else
1446 dfa->eclosures[node] = eclosure;
1447 *new_set = eclosure;
1448 return REG_NOERROR;
1449 }
1450 \f
1451 /* Functions for token which are used in the parser. */
1452
1453 /* Fetch a token from INPUT.
1454 We must not use this function inside bracket expressions. */
1455
1456 static re_token_t
1457 fetch_token (input, syntax)
1458 re_string_t *input;
1459 reg_syntax_t syntax;
1460 {
1461 re_token_t token;
1462 int consumed_byte;
1463 consumed_byte = peek_token (&token, input, syntax);
1464 re_string_skip_bytes (input, consumed_byte);
1465 return token;
1466 }
1467
1468 /* Peek a token from INPUT, and return the length of the token.
1469 We must not use this function inside bracket expressions. */
1470
1471 static int
1472 peek_token (token, input, syntax)
1473 re_token_t *token;
1474 re_string_t *input;
1475 reg_syntax_t syntax;
1476 {
1477 unsigned char c;
1478
1479 if (re_string_eoi (input))
1480 {
1481 token->type = END_OF_RE;
1482 return 0;
1483 }
1484
1485 c = re_string_peek_byte (input, 0);
1486 token->opr.c = c;
1487
1488 #ifdef RE_ENABLE_I18N
1489 token->mb_partial = 0;
1490 if (MB_CUR_MAX > 1 &&
1491 !re_string_first_byte (input, re_string_cur_idx (input)))
1492 {
1493 token->type = CHARACTER;
1494 token->mb_partial = 1;
1495 return 1;
1496 }
1497 #endif
1498 if (c == '\\')
1499 {
1500 unsigned char c2;
1501 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1502 {
1503 token->type = BACK_SLASH;
1504 return 1;
1505 }
1506
1507 c2 = re_string_peek_byte_case (input, 1);
1508 token->opr.c = c2;
1509 token->type = CHARACTER;
1510 switch (c2)
1511 {
1512 case '|':
1513 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514 token->type = OP_ALT;
1515 break;
1516 case '1': case '2': case '3': case '4': case '5':
1517 case '6': case '7': case '8': case '9':
1518 if (!(syntax & RE_NO_BK_REFS))
1519 {
1520 token->type = OP_BACK_REF;
1521 token->opr.idx = c2 - '0';
1522 }
1523 break;
1524 case '<':
1525 if (!(syntax & RE_NO_GNU_OPS))
1526 {
1527 token->type = ANCHOR;
1528 token->opr.idx = WORD_FIRST;
1529 }
1530 break;
1531 case '>':
1532 if (!(syntax & RE_NO_GNU_OPS))
1533 {
1534 token->type = ANCHOR;
1535 token->opr.idx = WORD_LAST;
1536 }
1537 break;
1538 case 'b':
1539 if (!(syntax & RE_NO_GNU_OPS))
1540 {
1541 token->type = ANCHOR;
1542 token->opr.idx = WORD_DELIM;
1543 }
1544 break;
1545 case 'B':
1546 if (!(syntax & RE_NO_GNU_OPS))
1547 {
1548 token->type = ANCHOR;
1549 token->opr.idx = INSIDE_WORD;
1550 }
1551 break;
1552 case 'w':
1553 if (!(syntax & RE_NO_GNU_OPS))
1554 token->type = OP_WORD;
1555 break;
1556 case 'W':
1557 if (!(syntax & RE_NO_GNU_OPS))
1558 token->type = OP_NOTWORD;
1559 break;
1560 case '`':
1561 if (!(syntax & RE_NO_GNU_OPS))
1562 {
1563 token->type = ANCHOR;
1564 token->opr.idx = BUF_FIRST;
1565 }
1566 break;
1567 case '\'':
1568 if (!(syntax & RE_NO_GNU_OPS))
1569 {
1570 token->type = ANCHOR;
1571 token->opr.idx = BUF_LAST;
1572 }
1573 break;
1574 case '(':
1575 if (!(syntax & RE_NO_BK_PARENS))
1576 token->type = OP_OPEN_SUBEXP;
1577 break;
1578 case ')':
1579 if (!(syntax & RE_NO_BK_PARENS))
1580 token->type = OP_CLOSE_SUBEXP;
1581 break;
1582 case '+':
1583 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584 token->type = OP_DUP_PLUS;
1585 break;
1586 case '?':
1587 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588 token->type = OP_DUP_QUESTION;
1589 break;
1590 case '{':
1591 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592 token->type = OP_OPEN_DUP_NUM;
1593 break;
1594 case '}':
1595 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596 token->type = OP_CLOSE_DUP_NUM;
1597 break;
1598 default:
1599 break;
1600 }
1601 return 2;
1602 }
1603
1604 token->type = CHARACTER;
1605 switch (c)
1606 {
1607 case '\n':
1608 if (syntax & RE_NEWLINE_ALT)
1609 token->type = OP_ALT;
1610 break;
1611 case '|':
1612 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613 token->type = OP_ALT;
1614 break;
1615 case '*':
1616 token->type = OP_DUP_ASTERISK;
1617 break;
1618 case '+':
1619 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620 token->type = OP_DUP_PLUS;
1621 break;
1622 case '?':
1623 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624 token->type = OP_DUP_QUESTION;
1625 break;
1626 case '{':
1627 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628 token->type = OP_OPEN_DUP_NUM;
1629 break;
1630 case '}':
1631 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632 token->type = OP_CLOSE_DUP_NUM;
1633 break;
1634 case '(':
1635 if (syntax & RE_NO_BK_PARENS)
1636 token->type = OP_OPEN_SUBEXP;
1637 break;
1638 case ')':
1639 if (syntax & RE_NO_BK_PARENS)
1640 token->type = OP_CLOSE_SUBEXP;
1641 break;
1642 case '[':
1643 token->type = OP_OPEN_BRACKET;
1644 break;
1645 case '.':
1646 token->type = OP_PERIOD;
1647 break;
1648 case '^':
1649 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650 re_string_cur_idx (input) != 0)
1651 {
1652 char prev = re_string_peek_byte (input, -1);
1653 if (prev != '|' && prev != '(' &&
1654 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1655 break;
1656 }
1657 token->type = ANCHOR;
1658 token->opr.idx = LINE_FIRST;
1659 break;
1660 case '$':
1661 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662 re_string_cur_idx (input) + 1 != re_string_length (input))
1663 {
1664 re_token_t next;
1665 re_string_skip_bytes (input, 1);
1666 peek_token (&next, input, syntax);
1667 re_string_skip_bytes (input, -1);
1668 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1669 break;
1670 }
1671 token->type = ANCHOR;
1672 token->opr.idx = LINE_LAST;
1673 break;
1674 default:
1675 break;
1676 }
1677 return 1;
1678 }
1679
1680 /* Peek a token from INPUT, and return the length of the token.
1681 We must not use this function out of bracket expressions. */
1682
1683 static int
1684 peek_token_bracket (token, input, syntax)
1685 re_token_t *token;
1686 re_string_t *input;
1687 reg_syntax_t syntax;
1688 {
1689 unsigned char c;
1690 if (re_string_eoi (input))
1691 {
1692 token->type = END_OF_RE;
1693 return 0;
1694 }
1695 c = re_string_peek_byte (input, 0);
1696 token->opr.c = c;
1697
1698 #ifdef RE_ENABLE_I18N
1699 if (MB_CUR_MAX > 1 &&
1700 !re_string_first_byte (input, re_string_cur_idx (input)))
1701 {
1702 token->type = CHARACTER;
1703 return 1;
1704 }
1705 #endif /* RE_ENABLE_I18N */
1706
1707 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1708 {
1709 /* In this case, '\' escape a character. */
1710 unsigned char c2;
1711 re_string_skip_bytes (input, 1);
1712 c2 = re_string_peek_byte (input, 0);
1713 token->opr.c = c2;
1714 token->type = CHARACTER;
1715 return 1;
1716 }
1717 if (c == '[') /* '[' is a special char in a bracket exps. */
1718 {
1719 unsigned char c2;
1720 int token_len;
1721 c2 = re_string_peek_byte (input, 1);
1722 token->opr.c = c2;
1723 token_len = 2;
1724 switch (c2)
1725 {
1726 case '.':
1727 token->type = OP_OPEN_COLL_ELEM;
1728 break;
1729 case '=':
1730 token->type = OP_OPEN_EQUIV_CLASS;
1731 break;
1732 case ':':
1733 if (syntax & RE_CHAR_CLASSES)
1734 {
1735 token->type = OP_OPEN_CHAR_CLASS;
1736 break;
1737 }
1738 /* else fall through. */
1739 default:
1740 token->type = CHARACTER;
1741 token->opr.c = c;
1742 token_len = 1;
1743 break;
1744 }
1745 return token_len;
1746 }
1747 switch (c)
1748 {
1749 case '-':
1750 token->type = OP_CHARSET_RANGE;
1751 break;
1752 case ']':
1753 token->type = OP_CLOSE_BRACKET;
1754 break;
1755 case '^':
1756 token->type = OP_NON_MATCH_LIST;
1757 break;
1758 default:
1759 token->type = CHARACTER;
1760 }
1761 return 1;
1762 }
1763 \f
1764 /* Functions for parser. */
1765
1766 /* Entry point of the parser.
1767 Parse the regular expression REGEXP and return the structure tree.
1768 If an error is occured, ERR is set by error code, and return NULL.
1769 This function build the following tree, from regular expression <reg_exp>:
1770 CAT
1771 / \
1772 / \
1773 <reg_exp> EOR
1774
1775 CAT means concatenation.
1776 EOR means end of regular expression. */
1777
1778 static bin_tree_t *
1779 parse (regexp, preg, syntax, err)
1780 re_string_t *regexp;
1781 regex_t *preg;
1782 reg_syntax_t syntax;
1783 reg_errcode_t *err;
1784 {
1785 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1786 bin_tree_t *tree, *eor, *root;
1787 re_token_t current_token;
1788 int new_idx;
1789 current_token = fetch_token (regexp, syntax);
1790 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1791 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1792 return NULL;
1793 new_idx = re_dfa_add_node (dfa, current_token, 0);
1794 eor = create_tree (NULL, NULL, 0, new_idx);
1795 if (tree != NULL)
1796 root = create_tree (tree, eor, CONCAT, 0);
1797 else
1798 root = eor;
1799 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1800 {
1801 *err = REG_ESPACE;
1802 return NULL;
1803 }
1804 return root;
1805 }
1806
1807 /* This function build the following tree, from regular expression
1808 <branch1>|<branch2>:
1809 ALT
1810 / \
1811 / \
1812 <branch1> <branch2>
1813
1814 ALT means alternative, which represents the operator `|'. */
1815
1816 static bin_tree_t *
1817 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1818 re_string_t *regexp;
1819 regex_t *preg;
1820 re_token_t *token;
1821 reg_syntax_t syntax;
1822 int nest;
1823 reg_errcode_t *err;
1824 {
1825 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1826 bin_tree_t *tree, *branch = NULL;
1827 int new_idx;
1828 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1829 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1830 return NULL;
1831
1832 while (token->type == OP_ALT)
1833 {
1834 re_token_t alt_token = *token;
1835 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1836 *token = fetch_token (regexp, syntax);
1837 if (token->type != OP_ALT && token->type != END_OF_RE
1838 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1839 {
1840 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1841 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1842 {
1843 free_bin_tree (tree);
1844 return NULL;
1845 }
1846 }
1847 else
1848 branch = NULL;
1849 tree = create_tree (tree, branch, 0, new_idx);
1850 if (BE (new_idx == -1 || tree == NULL, 0))
1851 {
1852 *err = REG_ESPACE;
1853 return NULL;
1854 }
1855 dfa->has_plural_match = 1;
1856 }
1857 return tree;
1858 }
1859
1860 /* This function build the following tree, from regular expression
1861 <exp1><exp2>:
1862 CAT
1863 / \
1864 / \
1865 <exp1> <exp2>
1866
1867 CAT means concatenation. */
1868
1869 static bin_tree_t *
1870 parse_branch (regexp, preg, token, syntax, nest, err)
1871 re_string_t *regexp;
1872 regex_t *preg;
1873 re_token_t *token;
1874 reg_syntax_t syntax;
1875 int nest;
1876 reg_errcode_t *err;
1877 {
1878 bin_tree_t *tree, *exp;
1879 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1880 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1881 return NULL;
1882
1883 while (token->type != OP_ALT && token->type != END_OF_RE
1884 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1885 {
1886 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1887 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1888 {
1889 free_bin_tree (tree);
1890 return NULL;
1891 }
1892 if (tree != NULL && exp != NULL)
1893 {
1894 tree = create_tree (tree, exp, CONCAT, 0);
1895 if (tree == NULL)
1896 {
1897 *err = REG_ESPACE;
1898 return NULL;
1899 }
1900 }
1901 else if (tree == NULL)
1902 tree = exp;
1903 /* Otherwise exp == NULL, we don't need to create new tree. */
1904 }
1905 return tree;
1906 }
1907
1908 /* This function build the following tree, from regular expression a*:
1909 *
1910 |
1911 a
1912 */
1913
1914 static bin_tree_t *
1915 parse_expression (regexp, preg, token, syntax, nest, err)
1916 re_string_t *regexp;
1917 regex_t *preg;
1918 re_token_t *token;
1919 reg_syntax_t syntax;
1920 int nest;
1921 reg_errcode_t *err;
1922 {
1923 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1924 bin_tree_t *tree;
1925 int new_idx;
1926 switch (token->type)
1927 {
1928 case CHARACTER:
1929 new_idx = re_dfa_add_node (dfa, *token, 0);
1930 tree = create_tree (NULL, NULL, 0, new_idx);
1931 if (BE (new_idx == -1 || tree == NULL, 0))
1932 {
1933 *err = REG_ESPACE;
1934 return NULL;
1935 }
1936 #ifdef RE_ENABLE_I18N
1937 if (MB_CUR_MAX > 1)
1938 {
1939 while (!re_string_eoi (regexp)
1940 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1941 {
1942 bin_tree_t *mbc_remain;
1943 *token = fetch_token (regexp, syntax);
1944 new_idx = re_dfa_add_node (dfa, *token, 0);
1945 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1946 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1947 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1948 {
1949 *err = REG_ESPACE;
1950 return NULL;
1951 }
1952 }
1953 }
1954 #endif
1955 break;
1956 case OP_OPEN_SUBEXP:
1957 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1959 return NULL;
1960 break;
1961 case OP_OPEN_BRACKET:
1962 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964 return NULL;
1965 break;
1966 case OP_BACK_REF:
1967 if (BE (preg->re_nsub < token->opr.idx
1968 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1969 {
1970 *err = REG_ESUBREG;
1971 return NULL;
1972 }
1973 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1974 new_idx = re_dfa_add_node (dfa, *token, 0);
1975 tree = create_tree (NULL, NULL, 0, new_idx);
1976 if (BE (new_idx == -1 || tree == NULL, 0))
1977 {
1978 *err = REG_ESPACE;
1979 return NULL;
1980 }
1981 ++dfa->nbackref;
1982 dfa->has_mb_node = 1;
1983 break;
1984 case OP_DUP_ASTERISK:
1985 case OP_DUP_PLUS:
1986 case OP_DUP_QUESTION:
1987 case OP_OPEN_DUP_NUM:
1988 if (syntax & RE_CONTEXT_INVALID_OPS)
1989 {
1990 *err = REG_BADRPT;
1991 return NULL;
1992 }
1993 else if (syntax & RE_CONTEXT_INDEP_OPS)
1994 {
1995 *token = fetch_token (regexp, syntax);
1996 return parse_expression (regexp, preg, token, syntax, nest, err);
1997 }
1998 /* else fall through */
1999 case OP_CLOSE_SUBEXP:
2000 if ((token->type == OP_CLOSE_SUBEXP) &&
2001 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2002 {
2003 *err = REG_ERPAREN;
2004 return NULL;
2005 }
2006 /* else fall through */
2007 case OP_CLOSE_DUP_NUM:
2008 /* We treat it as a normal character. */
2009
2010 /* Then we can these characters as normal characters. */
2011 token->type = CHARACTER;
2012 new_idx = re_dfa_add_node (dfa, *token, 0);
2013 tree = create_tree (NULL, NULL, 0, new_idx);
2014 if (BE (new_idx == -1 || tree == NULL, 0))
2015 {
2016 *err = REG_ESPACE;
2017 return NULL;
2018 }
2019 break;
2020 case ANCHOR:
2021 if (dfa->word_char == NULL)
2022 {
2023 *err = init_word_char (dfa);
2024 if (BE (*err != REG_NOERROR, 0))
2025 return NULL;
2026 }
2027 if (token->opr.ctx_type == WORD_DELIM)
2028 {
2029 bin_tree_t *tree_first, *tree_last;
2030 int idx_first, idx_last;
2031 token->opr.ctx_type = WORD_FIRST;
2032 idx_first = re_dfa_add_node (dfa, *token, 0);
2033 tree_first = create_tree (NULL, NULL, 0, idx_first);
2034 token->opr.ctx_type = WORD_LAST;
2035 idx_last = re_dfa_add_node (dfa, *token, 0);
2036 tree_last = create_tree (NULL, NULL, 0, idx_last);
2037 token->type = OP_ALT;
2038 new_idx = re_dfa_add_node (dfa, *token, 0);
2039 tree = create_tree (tree_first, tree_last, 0, new_idx);
2040 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2041 || tree_first == NULL || tree_last == NULL
2042 || tree == NULL, 0))
2043 {
2044 *err = REG_ESPACE;
2045 return NULL;
2046 }
2047 }
2048 else
2049 {
2050 new_idx = re_dfa_add_node (dfa, *token, 0);
2051 tree = create_tree (NULL, NULL, 0, new_idx);
2052 if (BE (new_idx == -1 || tree == NULL, 0))
2053 {
2054 *err = REG_ESPACE;
2055 return NULL;
2056 }
2057 }
2058 /* We must return here, since ANCHORs can't be followed
2059 by repetition operators.
2060 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2061 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2062 *token = fetch_token (regexp, syntax);
2063 return tree;
2064 case OP_PERIOD:
2065 new_idx = re_dfa_add_node (dfa, *token, 0);
2066 tree = create_tree (NULL, NULL, 0, new_idx);
2067 if (BE (new_idx == -1 || tree == NULL, 0))
2068 {
2069 *err = REG_ESPACE;
2070 return NULL;
2071 }
2072 if (MB_CUR_MAX > 1)
2073 dfa->has_mb_node = 1;
2074 break;
2075 case OP_WORD:
2076 tree = build_word_op (dfa, 0, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078 return NULL;
2079 break;
2080 case OP_NOTWORD:
2081 tree = build_word_op (dfa, 1, err);
2082 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083 return NULL;
2084 break;
2085 case OP_ALT:
2086 case END_OF_RE:
2087 return NULL;
2088 case BACK_SLASH:
2089 *err = REG_EESCAPE;
2090 return NULL;
2091 default:
2092 /* Must not happen? */
2093 #ifdef DEBUG
2094 assert (0);
2095 #endif
2096 return NULL;
2097 }
2098 *token = fetch_token (regexp, syntax);
2099
2100 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2101 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2102 {
2103 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2104 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105 return NULL;
2106 dfa->has_plural_match = 1;
2107 }
2108
2109 return tree;
2110 }
2111
2112 /* This function build the following tree, from regular expression
2113 (<reg_exp>):
2114 SUBEXP
2115 |
2116 <reg_exp>
2117 */
2118
2119 static bin_tree_t *
2120 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2121 re_string_t *regexp;
2122 regex_t *preg;
2123 re_token_t *token;
2124 reg_syntax_t syntax;
2125 int nest;
2126 reg_errcode_t *err;
2127 {
2128 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129 bin_tree_t *tree, *left_par, *right_par;
2130 size_t cur_nsub;
2131 int new_idx;
2132 cur_nsub = preg->re_nsub++;
2133 if (dfa->subexps_alloc < preg->re_nsub)
2134 {
2135 re_subexp_t *new_array;
2136 dfa->subexps_alloc *= 2;
2137 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2138 if (BE (new_array == NULL, 0))
2139 {
2140 dfa->subexps_alloc /= 2;
2141 *err = REG_ESPACE;
2142 return NULL;
2143 }
2144 dfa->subexps = new_array;
2145 }
2146 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2147 dfa->subexps[cur_nsub].end = -1;
2148
2149 new_idx = re_dfa_add_node (dfa, *token, 0);
2150 left_par = create_tree (NULL, NULL, 0, new_idx);
2151 if (BE (new_idx == -1 || left_par == NULL, 0))
2152 {
2153 *err = REG_ESPACE;
2154 return NULL;
2155 }
2156 dfa->nodes[new_idx].opr.idx = cur_nsub;
2157 *token = fetch_token (regexp, syntax);
2158
2159 /* The subexpression may be a null string. */
2160 if (token->type == OP_CLOSE_SUBEXP)
2161 tree = NULL;
2162 else
2163 {
2164 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2165 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2166 return NULL;
2167 }
2168 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2169 {
2170 free_bin_tree (tree);
2171 *err = REG_BADPAT;
2172 return NULL;
2173 }
2174 new_idx = re_dfa_add_node (dfa, *token, 0);
2175 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2176 right_par = create_tree (NULL, NULL, 0, new_idx);
2177 tree = ((tree == NULL) ? right_par
2178 : create_tree (tree, right_par, CONCAT, 0));
2179 tree = create_tree (left_par, tree, CONCAT, 0);
2180 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2181 {
2182 *err = REG_ESPACE;
2183 return NULL;
2184 }
2185 dfa->nodes[new_idx].opr.idx = cur_nsub;
2186
2187 return tree;
2188 }
2189
2190 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2191
2192 static bin_tree_t *
2193 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2194 bin_tree_t *dup_elem;
2195 re_string_t *regexp;
2196 re_dfa_t *dfa;
2197 re_token_t *token;
2198 reg_syntax_t syntax;
2199 reg_errcode_t *err;
2200 {
2201 re_token_t dup_token;
2202 bin_tree_t *tree = dup_elem, *work_tree;
2203 int new_idx, start_idx = re_string_cur_idx (regexp);
2204 re_token_t start_token = *token;
2205 if (token->type == OP_OPEN_DUP_NUM)
2206 {
2207 int i;
2208 int end = 0;
2209 int start = fetch_number (regexp, token, syntax);
2210 bin_tree_t *elem;
2211 if (start == -1)
2212 {
2213 if (token->type == CHARACTER && token->opr.c == ',')
2214 start = 0; /* We treat "{,m}" as "{0,m}". */
2215 else
2216 {
2217 *err = REG_BADBR; /* <re>{} is invalid. */
2218 return NULL;
2219 }
2220 }
2221 if (BE (start != -2, 1))
2222 {
2223 /* We treat "{n}" as "{n,n}". */
2224 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2225 : ((token->type == CHARACTER && token->opr.c == ',')
2226 ? fetch_number (regexp, token, syntax) : -2));
2227 }
2228 if (BE (start == -2 || end == -2, 0))
2229 {
2230 /* Invalid sequence. */
2231 if (token->type == OP_CLOSE_DUP_NUM)
2232 goto parse_dup_op_invalid_interval;
2233 else
2234 goto parse_dup_op_ebrace;
2235 }
2236 if (BE (start == 0 && end == 0, 0))
2237 {
2238 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2239 *token = fetch_token (regexp, syntax);
2240 free_bin_tree (dup_elem);
2241 return NULL;
2242 }
2243
2244 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2245 elem = tree;
2246 for (i = 0; i < start; ++i)
2247 if (i != 0)
2248 {
2249 work_tree = duplicate_tree (elem, dfa);
2250 tree = create_tree (tree, work_tree, CONCAT, 0);
2251 if (BE (work_tree == NULL || tree == NULL, 0))
2252 goto parse_dup_op_espace;
2253 }
2254
2255 if (end == -1)
2256 {
2257 /* We treat "<re>{0,}" as "<re>*". */
2258 dup_token.type = OP_DUP_ASTERISK;
2259 if (start > 0)
2260 {
2261 elem = duplicate_tree (elem, dfa);
2262 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2263 work_tree = create_tree (elem, NULL, 0, new_idx);
2264 tree = create_tree (tree, work_tree, CONCAT, 0);
2265 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2266 || tree == NULL, 0))
2267 goto parse_dup_op_espace;
2268 }
2269 else
2270 {
2271 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2272 tree = create_tree (elem, NULL, 0, new_idx);
2273 if (BE (new_idx == -1 || tree == NULL, 0))
2274 goto parse_dup_op_espace;
2275 }
2276 }
2277 else if (end - start > 0)
2278 {
2279 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2280 dup_token.type = OP_DUP_QUESTION;
2281 if (start > 0)
2282 {
2283 elem = duplicate_tree (elem, dfa);
2284 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2285 elem = create_tree (elem, NULL, 0, new_idx);
2286 tree = create_tree (tree, elem, CONCAT, 0);
2287 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2288 goto parse_dup_op_espace;
2289 }
2290 else
2291 {
2292 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2293 tree = elem = create_tree (elem, NULL, 0, new_idx);
2294 if (BE (new_idx == -1 || tree == NULL, 0))
2295 goto parse_dup_op_espace;
2296 }
2297 for (i = 1; i < end - start; ++i)
2298 {
2299 work_tree = duplicate_tree (elem, dfa);
2300 tree = create_tree (tree, work_tree, CONCAT, 0);
2301 if (BE (work_tree == NULL || tree == NULL, 0))
2302 {
2303 *err = REG_ESPACE;
2304 return NULL;
2305 }
2306 }
2307 }
2308 }
2309 else
2310 {
2311 new_idx = re_dfa_add_node (dfa, *token, 0);
2312 tree = create_tree (tree, NULL, 0, new_idx);
2313 if (BE (new_idx == -1 || tree == NULL, 0))
2314 {
2315 *err = REG_ESPACE;
2316 return NULL;
2317 }
2318 }
2319 *token = fetch_token (regexp, syntax);
2320 return tree;
2321
2322 parse_dup_op_espace:
2323 free_bin_tree (tree);
2324 *err = REG_ESPACE;
2325 return NULL;
2326
2327 parse_dup_op_ebrace:
2328 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2329 {
2330 *err = REG_EBRACE;
2331 return NULL;
2332 }
2333 goto parse_dup_op_rollback;
2334 parse_dup_op_invalid_interval:
2335 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2336 {
2337 *err = REG_BADBR;
2338 return NULL;
2339 }
2340 parse_dup_op_rollback:
2341 re_string_set_index (regexp, start_idx);
2342 *token = start_token;
2343 token->type = CHARACTER;
2344 return dup_elem;
2345 }
2346
2347 /* Size of the names for collating symbol/equivalence_class/character_class.
2348 I'm not sure, but maybe enough. */
2349 #define BRACKET_NAME_BUF_SIZE 32
2350
2351 #ifndef _LIBC
2352 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2353 Build the range expression which starts from START_ELEM, and ends
2354 at END_ELEM. The result are written to MBCSET and SBCSET.
2355 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2356 mbcset->range_ends, is a pointer argument sinse we may
2357 update it. */
2358
2359 static reg_errcode_t
2360 # ifdef RE_ENABLE_I18N
2361 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2362 re_charset_t *mbcset;
2363 int *range_alloc;
2364 # else /* not RE_ENABLE_I18N */
2365 build_range_exp (sbcset, start_elem, end_elem)
2366 # endif /* not RE_ENABLE_I18N */
2367 re_bitset_ptr_t sbcset;
2368 bracket_elem_t *start_elem, *end_elem;
2369 {
2370 unsigned int start_ch, end_ch;
2371 /* Equivalence Classes and Character Classes can't be a range start/end. */
2372 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2373 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2374 0))
2375 return REG_ERANGE;
2376
2377 /* We can handle no multi character collating elements without libc
2378 support. */
2379 if (BE ((start_elem->type == COLL_SYM
2380 && strlen ((char *) start_elem->opr.name) > 1)
2381 || (end_elem->type == COLL_SYM
2382 && strlen ((char *) end_elem->opr.name) > 1), 0))
2383 return REG_ECOLLATE;
2384
2385 # ifdef RE_ENABLE_I18N
2386 {
2387 wchar_t wc, start_wc, end_wc;
2388 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2389
2390 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2391 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2392 : 0));
2393 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2394 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2395 : 0));
2396 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2397 ? __btowc (start_ch) : start_elem->opr.wch);
2398 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2399 ? __btowc (end_ch) : end_elem->opr.wch);
2400 cmp_buf[0] = start_wc;
2401 cmp_buf[4] = end_wc;
2402 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2403 return REG_ERANGE;
2404
2405 /* Check the space of the arrays. */
2406 if (*range_alloc == mbcset->nranges)
2407 {
2408 /* There are not enough space, need realloc. */
2409 wchar_t *new_array_start, *new_array_end;
2410 int new_nranges;
2411
2412 /* +1 in case of mbcset->nranges is 0. */
2413 new_nranges = 2 * mbcset->nranges + 1;
2414 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2415 are NULL if *range_alloc == 0. */
2416 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2417 new_nranges);
2418 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2419 new_nranges);
2420
2421 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2422 return REG_ESPACE;
2423
2424 mbcset->range_starts = new_array_start;
2425 mbcset->range_ends = new_array_end;
2426 *range_alloc = new_nranges;
2427 }
2428
2429 mbcset->range_starts[mbcset->nranges] = start_wc;
2430 mbcset->range_ends[mbcset->nranges++] = end_wc;
2431
2432 /* Build the table for single byte characters. */
2433 for (wc = 0; wc <= SBC_MAX; ++wc)
2434 {
2435 cmp_buf[2] = wc;
2436 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2437 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2438 bitset_set (sbcset, wc);
2439 }
2440 }
2441 # else /* not RE_ENABLE_I18N */
2442 {
2443 unsigned int ch;
2444 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2445 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2446 : 0));
2447 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2448 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2449 : 0));
2450 if (start_ch > end_ch)
2451 return REG_ERANGE;
2452 /* Build the table for single byte characters. */
2453 for (ch = 0; ch <= SBC_MAX; ++ch)
2454 if (start_ch <= ch && ch <= end_ch)
2455 bitset_set (sbcset, ch);
2456 }
2457 # endif /* not RE_ENABLE_I18N */
2458 return REG_NOERROR;
2459 }
2460 #endif /* not _LIBC */
2461
2462 #ifndef _LIBC
2463 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2464 Build the collating element which is represented by NAME.
2465 The result are written to MBCSET and SBCSET.
2466 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2467 pointer argument since we may update it. */
2468
2469 static reg_errcode_t
2470 # ifdef RE_ENABLE_I18N
2471 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2472 re_charset_t *mbcset;
2473 int *coll_sym_alloc;
2474 # else /* not RE_ENABLE_I18N */
2475 build_collating_symbol (sbcset, name)
2476 # endif /* not RE_ENABLE_I18N */
2477 re_bitset_ptr_t sbcset;
2478 const unsigned char *name;
2479 {
2480 size_t name_len = strlen ((const char *) name);
2481 if (BE (name_len != 1, 0))
2482 return REG_ECOLLATE;
2483 else
2484 {
2485 bitset_set (sbcset, name[0]);
2486 return REG_NOERROR;
2487 }
2488 }
2489 #endif /* not _LIBC */
2490
2491 /* This function parse bracket expression like "[abc]", "[a-c]",
2492 "[[.a-a.]]" etc. */
2493
2494 static bin_tree_t *
2495 parse_bracket_exp (regexp, dfa, token, syntax, err)
2496 re_string_t *regexp;
2497 re_dfa_t *dfa;
2498 re_token_t *token;
2499 reg_syntax_t syntax;
2500 reg_errcode_t *err;
2501 {
2502 #ifdef _LIBC
2503 const unsigned char *collseqmb;
2504 const char *collseqwc;
2505 uint32_t nrules;
2506 int32_t table_size;
2507 const int32_t *symb_table;
2508 const unsigned char *extra;
2509
2510 /* Local function for parse_bracket_exp used in _LIBC environement.
2511 Seek the collating symbol entry correspondings to NAME.
2512 Return the index of the symbol in the SYMB_TABLE. */
2513
2514 static inline int32_t
2515 seek_collating_symbol_entry (name, name_len)
2516 const unsigned char *name;
2517 size_t name_len;
2518 {
2519 int32_t hash = elem_hash ((const char *) name, name_len);
2520 int32_t elem = hash % table_size;
2521 int32_t second = hash % (table_size - 2);
2522 while (symb_table[2 * elem] != 0)
2523 {
2524 /* First compare the hashing value. */
2525 if (symb_table[2 * elem] == hash
2526 /* Compare the length of the name. */
2527 && name_len == extra[symb_table[2 * elem + 1]]
2528 /* Compare the name. */
2529 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2530 name_len) == 0)
2531 {
2532 /* Yep, this is the entry. */
2533 break;
2534 }
2535
2536 /* Next entry. */
2537 elem += second;
2538 }
2539 return elem;
2540 }
2541
2542 /* Local function for parse_bracket_exp used in _LIBC environement.
2543 Look up the collation sequence value of BR_ELEM.
2544 Return the value if succeeded, UINT_MAX otherwise. */
2545
2546 static inline unsigned int
2547 lookup_collation_sequence_value (br_elem)
2548 bracket_elem_t *br_elem;
2549 {
2550 if (br_elem->type == SB_CHAR)
2551 {
2552 /*
2553 if (MB_CUR_MAX == 1)
2554 */
2555 if (nrules == 0)
2556 return collseqmb[br_elem->opr.ch];
2557 else
2558 {
2559 wint_t wc = __btowc (br_elem->opr.ch);
2560 return collseq_table_lookup (collseqwc, wc);
2561 }
2562 }
2563 else if (br_elem->type == MB_CHAR)
2564 {
2565 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2566 }
2567 else if (br_elem->type == COLL_SYM)
2568 {
2569 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2570 if (nrules != 0)
2571 {
2572 int32_t elem, idx;
2573 elem = seek_collating_symbol_entry (br_elem->opr.name,
2574 sym_name_len);
2575 if (symb_table[2 * elem] != 0)
2576 {
2577 /* We found the entry. */
2578 idx = symb_table[2 * elem + 1];
2579 /* Skip the name of collating element name. */
2580 idx += 1 + extra[idx];
2581 /* Skip the byte sequence of the collating element. */
2582 idx += 1 + extra[idx];
2583 /* Adjust for the alignment. */
2584 idx = (idx + 3) & ~3;
2585 /* Skip the multibyte collation sequence value. */
2586 idx += sizeof (unsigned int);
2587 /* Skip the wide char sequence of the collating element. */
2588 idx += sizeof (unsigned int) *
2589 (1 + *(unsigned int *) (extra + idx));
2590 /* Return the collation sequence value. */
2591 return *(unsigned int *) (extra + idx);
2592 }
2593 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2594 {
2595 /* No valid character. Match it as a single byte
2596 character. */
2597 return collseqmb[br_elem->opr.name[0]];
2598 }
2599 }
2600 else if (sym_name_len == 1)
2601 return collseqmb[br_elem->opr.name[0]];
2602 }
2603 return UINT_MAX;
2604 }
2605
2606 /* Local function for parse_bracket_exp used in _LIBC environement.
2607 Build the range expression which starts from START_ELEM, and ends
2608 at END_ELEM. The result are written to MBCSET and SBCSET.
2609 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2610 mbcset->range_ends, is a pointer argument sinse we may
2611 update it. */
2612
2613 static inline reg_errcode_t
2614 # ifdef RE_ENABLE_I18N
2615 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2616 re_charset_t *mbcset;
2617 int *range_alloc;
2618 # else /* not RE_ENABLE_I18N */
2619 build_range_exp (sbcset, start_elem, end_elem)
2620 # endif /* not RE_ENABLE_I18N */
2621 re_bitset_ptr_t sbcset;
2622 bracket_elem_t *start_elem, *end_elem;
2623 {
2624 unsigned int ch;
2625 uint32_t start_collseq;
2626 uint32_t end_collseq;
2627
2628 # ifdef RE_ENABLE_I18N
2629 /* Check the space of the arrays. */
2630 if (*range_alloc == mbcset->nranges)
2631 {
2632 /* There are not enough space, need realloc. */
2633 uint32_t *new_array_start;
2634 uint32_t *new_array_end;
2635 int new_nranges;
2636
2637 /* +1 in case of mbcset->nranges is 0. */
2638 new_nranges = 2 * mbcset->nranges + 1;
2639 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2640 are NULL if *range_alloc == 0. */
2641 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2642 new_nranges);
2643 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2644 new_nranges);
2645
2646 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2647 return REG_ESPACE;
2648
2649 mbcset->range_starts = new_array_start;
2650 mbcset->range_ends = new_array_end;
2651 *range_alloc = new_nranges;
2652 }
2653 # endif /* RE_ENABLE_I18N */
2654
2655 /* Equivalence Classes and Character Classes can't be a range
2656 start/end. */
2657 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2658 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2659 0))
2660 return REG_ERANGE;
2661
2662 start_collseq = lookup_collation_sequence_value (start_elem);
2663 end_collseq = lookup_collation_sequence_value (end_elem);
2664 /* Check start/end collation sequence values. */
2665 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2666 return REG_ECOLLATE;
2667 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2668 return REG_ERANGE;
2669
2670 # ifdef RE_ENABLE_I18N
2671 /* Got valid collation sequence values, add them as a new entry. */
2672 mbcset->range_starts[mbcset->nranges] = start_collseq;
2673 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2674 # endif /* RE_ENABLE_I18N */
2675
2676 /* Build the table for single byte characters. */
2677 for (ch = 0; ch <= SBC_MAX; ch++)
2678 {
2679 uint32_t ch_collseq;
2680 /*
2681 if (MB_CUR_MAX == 1)
2682 */
2683 if (nrules == 0)
2684 ch_collseq = collseqmb[ch];
2685 else
2686 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2687 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2688 bitset_set (sbcset, ch);
2689 }
2690 return REG_NOERROR;
2691 }
2692
2693 /* Local function for parse_bracket_exp used in _LIBC environement.
2694 Build the collating element which is represented by NAME.
2695 The result are written to MBCSET and SBCSET.
2696 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2697 pointer argument sinse we may update it. */
2698
2699 static inline reg_errcode_t
2700 # ifdef RE_ENABLE_I18N
2701 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2702 re_charset_t *mbcset;
2703 int *coll_sym_alloc;
2704 # else /* not RE_ENABLE_I18N */
2705 build_collating_symbol (sbcset, name)
2706 # endif /* not RE_ENABLE_I18N */
2707 re_bitset_ptr_t sbcset;
2708 const unsigned char *name;
2709 {
2710 int32_t elem, idx;
2711 size_t name_len = strlen ((const char *) name);
2712 if (nrules != 0)
2713 {
2714 elem = seek_collating_symbol_entry (name, name_len);
2715 if (symb_table[2 * elem] != 0)
2716 {
2717 /* We found the entry. */
2718 idx = symb_table[2 * elem + 1];
2719 /* Skip the name of collating element name. */
2720 idx += 1 + extra[idx];
2721 }
2722 else if (symb_table[2 * elem] == 0 && name_len == 1)
2723 {
2724 /* No valid character, treat it as a normal
2725 character. */
2726 bitset_set (sbcset, name[0]);
2727 return REG_NOERROR;
2728 }
2729 else
2730 return REG_ECOLLATE;
2731
2732 # ifdef RE_ENABLE_I18N
2733 /* Got valid collation sequence, add it as a new entry. */
2734 /* Check the space of the arrays. */
2735 if (*coll_sym_alloc == mbcset->ncoll_syms)
2736 {
2737 /* Not enough, realloc it. */
2738 /* +1 in case of mbcset->ncoll_syms is 0. */
2739 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2740 /* Use realloc since mbcset->coll_syms is NULL
2741 if *alloc == 0. */
2742 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2743 *coll_sym_alloc);
2744 if (BE (mbcset->coll_syms == NULL, 0))
2745 return REG_ESPACE;
2746 }
2747 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2748 # endif /* RE_ENABLE_I18N */
2749 return REG_NOERROR;
2750 }
2751 else
2752 {
2753 if (BE (name_len != 1, 0))
2754 return REG_ECOLLATE;
2755 else
2756 {
2757 bitset_set (sbcset, name[0]);
2758 return REG_NOERROR;
2759 }
2760 }
2761 }
2762 #endif
2763
2764 re_token_t br_token;
2765 re_bitset_ptr_t sbcset;
2766 #ifdef RE_ENABLE_I18N
2767 re_charset_t *mbcset;
2768 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2769 int equiv_class_alloc = 0, char_class_alloc = 0;
2770 #else /* not RE_ENABLE_I18N */
2771 int non_match = 0;
2772 #endif /* not RE_ENABLE_I18N */
2773 bin_tree_t *work_tree;
2774 int token_len, new_idx;
2775 #ifdef _LIBC
2776 collseqmb = (const unsigned char *)
2777 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2778 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2779 if (nrules)
2780 {
2781 /*
2782 if (MB_CUR_MAX > 1)
2783 */
2784 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2785 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2786 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2787 _NL_COLLATE_SYMB_TABLEMB);
2788 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2789 _NL_COLLATE_SYMB_EXTRAMB);
2790 }
2791 #endif
2792 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2793 #ifdef RE_ENABLE_I18N
2794 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2795 #endif /* RE_ENABLE_I18N */
2796 #ifdef RE_ENABLE_I18N
2797 if (BE (sbcset == NULL || mbcset == NULL, 0))
2798 #else
2799 if (BE (sbcset == NULL, 0))
2800 #endif /* RE_ENABLE_I18N */
2801 {
2802 *err = REG_ESPACE;
2803 return NULL;
2804 }
2805
2806 token_len = peek_token_bracket (token, regexp, syntax);
2807 if (BE (token->type == END_OF_RE, 0))
2808 {
2809 *err = REG_BADPAT;
2810 goto parse_bracket_exp_free_return;
2811 }
2812 if (token->type == OP_NON_MATCH_LIST)
2813 {
2814 #ifdef RE_ENABLE_I18N
2815 int i;
2816 mbcset->non_match = 1;
2817 #else /* not RE_ENABLE_I18N */
2818 non_match = 1;
2819 #endif /* not RE_ENABLE_I18N */
2820 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2821 bitset_set (sbcset, '\0');
2822 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2823 token_len = peek_token_bracket (token, regexp, syntax);
2824 if (BE (token->type == END_OF_RE, 0))
2825 {
2826 *err = REG_BADPAT;
2827 goto parse_bracket_exp_free_return;
2828 }
2829 #ifdef RE_ENABLE_I18N
2830 if (MB_CUR_MAX > 1)
2831 for (i = 0; i < SBC_MAX; ++i)
2832 if (__btowc (i) == WEOF)
2833 bitset_set (sbcset, i);
2834 #endif /* RE_ENABLE_I18N */
2835 }
2836
2837 /* We treat the first ']' as a normal character. */
2838 if (token->type == OP_CLOSE_BRACKET)
2839 token->type = CHARACTER;
2840
2841 while (1)
2842 {
2843 bracket_elem_t start_elem, end_elem;
2844 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2845 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2846 reg_errcode_t ret;
2847 int token_len2 = 0, is_range_exp = 0;
2848 re_token_t token2;
2849
2850 start_elem.opr.name = start_name_buf;
2851 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2852 syntax);
2853 if (BE (ret != REG_NOERROR, 0))
2854 {
2855 *err = ret;
2856 goto parse_bracket_exp_free_return;
2857 }
2858
2859 token_len = peek_token_bracket (token, regexp, syntax);
2860 if (BE (token->type == END_OF_RE, 0))
2861 {
2862 *err = REG_BADPAT;
2863 goto parse_bracket_exp_free_return;
2864 }
2865 if (token->type == OP_CHARSET_RANGE)
2866 {
2867 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2868 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2869 if (BE (token->type == END_OF_RE, 0))
2870 {
2871 *err = REG_BADPAT;
2872 goto parse_bracket_exp_free_return;
2873 }
2874 if (token2.type == OP_CLOSE_BRACKET)
2875 {
2876 /* We treat the last '-' as a normal character. */
2877 re_string_skip_bytes (regexp, -token_len);
2878 token->type = CHARACTER;
2879 }
2880 else
2881 is_range_exp = 1;
2882 }
2883
2884 if (is_range_exp == 1)
2885 {
2886 end_elem.opr.name = end_name_buf;
2887 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2888 dfa, syntax);
2889 if (BE (ret != REG_NOERROR, 0))
2890 {
2891 *err = ret;
2892 goto parse_bracket_exp_free_return;
2893 }
2894
2895 token_len = peek_token_bracket (token, regexp, syntax);
2896 if (BE (token->type == END_OF_RE, 0))
2897 {
2898 *err = REG_BADPAT;
2899 goto parse_bracket_exp_free_return;
2900 }
2901 *err = build_range_exp (sbcset,
2902 #ifdef RE_ENABLE_I18N
2903 mbcset, &range_alloc,
2904 #endif /* RE_ENABLE_I18N */
2905 &start_elem, &end_elem);
2906 if (BE (*err != REG_NOERROR, 0))
2907 goto parse_bracket_exp_free_return;
2908 }
2909 else
2910 {
2911 switch (start_elem.type)
2912 {
2913 case SB_CHAR:
2914 bitset_set (sbcset, start_elem.opr.ch);
2915 break;
2916 #ifdef RE_ENABLE_I18N
2917 case MB_CHAR:
2918 /* Check whether the array has enough space. */
2919 if (mbchar_alloc == mbcset->nmbchars)
2920 {
2921 /* Not enough, realloc it. */
2922 /* +1 in case of mbcset->nmbchars is 0. */
2923 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2924 /* Use realloc since array is NULL if *alloc == 0. */
2925 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2926 mbchar_alloc);
2927 if (BE (mbcset->mbchars == NULL, 0))
2928 goto parse_bracket_exp_espace;
2929 }
2930 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2931 break;
2932 #endif /* RE_ENABLE_I18N */
2933 case EQUIV_CLASS:
2934 *err = build_equiv_class (sbcset,
2935 #ifdef RE_ENABLE_I18N
2936 mbcset, &equiv_class_alloc,
2937 #endif /* RE_ENABLE_I18N */
2938 start_elem.opr.name);
2939 if (BE (*err != REG_NOERROR, 0))
2940 goto parse_bracket_exp_free_return;
2941 break;
2942 case COLL_SYM:
2943 *err = build_collating_symbol (sbcset,
2944 #ifdef RE_ENABLE_I18N
2945 mbcset, &coll_sym_alloc,
2946 #endif /* RE_ENABLE_I18N */
2947 start_elem.opr.name);
2948 if (BE (*err != REG_NOERROR, 0))
2949 goto parse_bracket_exp_free_return;
2950 break;
2951 case CHAR_CLASS:
2952 *err = build_charclass (sbcset,
2953 #ifdef RE_ENABLE_I18N
2954 mbcset, &char_class_alloc,
2955 #endif /* RE_ENABLE_I18N */
2956 start_elem.opr.name, syntax);
2957 if (BE (*err != REG_NOERROR, 0))
2958 goto parse_bracket_exp_free_return;
2959 break;
2960 default:
2961 assert (0);
2962 break;
2963 }
2964 }
2965 if (token->type == OP_CLOSE_BRACKET)
2966 break;
2967 }
2968
2969 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2970
2971 /* If it is non-matching list. */
2972 #ifdef RE_ENABLE_I18N
2973 if (mbcset->non_match)
2974 #else /* not RE_ENABLE_I18N */
2975 if (non_match)
2976 #endif /* not RE_ENABLE_I18N */
2977 bitset_not (sbcset);
2978
2979 /* Build a tree for simple bracket. */
2980 br_token.type = SIMPLE_BRACKET;
2981 br_token.opr.sbcset = sbcset;
2982 new_idx = re_dfa_add_node (dfa, br_token, 0);
2983 work_tree = create_tree (NULL, NULL, 0, new_idx);
2984 if (BE (new_idx == -1 || work_tree == NULL, 0))
2985 goto parse_bracket_exp_espace;
2986
2987 #ifdef RE_ENABLE_I18N
2988 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2989 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2990 || mbcset->non_match)))
2991 {
2992 re_token_t alt_token;
2993 bin_tree_t *mbc_tree;
2994 /* Build a tree for complex bracket. */
2995 br_token.type = COMPLEX_BRACKET;
2996 br_token.opr.mbcset = mbcset;
2997 dfa->has_mb_node = 1;
2998 new_idx = re_dfa_add_node (dfa, br_token, 0);
2999 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3000 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3001 goto parse_bracket_exp_espace;
3002 /* Then join them by ALT node. */
3003 dfa->has_plural_match = 1;
3004 alt_token.type = OP_ALT;
3005 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3006 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3007 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3008 return work_tree;
3009 }
3010 else
3011 {
3012 free_charset (mbcset);
3013 return work_tree;
3014 }
3015 #else /* not RE_ENABLE_I18N */
3016 return work_tree;
3017 #endif /* not RE_ENABLE_I18N */
3018
3019 parse_bracket_exp_espace:
3020 *err = REG_ESPACE;
3021 parse_bracket_exp_free_return:
3022 re_free (sbcset);
3023 #ifdef RE_ENABLE_I18N
3024 free_charset (mbcset);
3025 #endif /* RE_ENABLE_I18N */
3026 return NULL;
3027 }
3028
3029 /* Parse an element in the bracket expression. */
3030
3031 static reg_errcode_t
3032 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3033 bracket_elem_t *elem;
3034 re_string_t *regexp;
3035 re_token_t *token;
3036 int token_len;
3037 re_dfa_t *dfa;
3038 reg_syntax_t syntax;
3039 {
3040 #ifdef RE_ENABLE_I18N
3041 int cur_char_size;
3042 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3043 if (cur_char_size > 1)
3044 {
3045 elem->type = MB_CHAR;
3046 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3047 re_string_skip_bytes (regexp, cur_char_size);
3048 return REG_NOERROR;
3049 }
3050 #endif /* RE_ENABLE_I18N */
3051 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3052 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3053 || token->type == OP_OPEN_EQUIV_CLASS)
3054 return parse_bracket_symbol (elem, regexp, token);
3055 elem->type = SB_CHAR;
3056 elem->opr.ch = token->opr.c;
3057 return REG_NOERROR;
3058 }
3059
3060 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3061 such as [:<character_class>:], [.<collating_element>.], and
3062 [=<equivalent_class>=]. */
3063
3064 static reg_errcode_t
3065 parse_bracket_symbol (elem, regexp, token)
3066 bracket_elem_t *elem;
3067 re_string_t *regexp;
3068 re_token_t *token;
3069 {
3070 unsigned char ch, delim = token->opr.c;
3071 int i = 0;
3072 for (;; ++i)
3073 {
3074 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3075 return REG_EBRACK;
3076 if (token->type == OP_OPEN_CHAR_CLASS)
3077 ch = re_string_fetch_byte_case (regexp);
3078 else
3079 ch = re_string_fetch_byte (regexp);
3080 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3081 break;
3082 elem->opr.name[i] = ch;
3083 }
3084 re_string_skip_bytes (regexp, 1);
3085 elem->opr.name[i] = '\0';
3086 switch (token->type)
3087 {
3088 case OP_OPEN_COLL_ELEM:
3089 elem->type = COLL_SYM;
3090 break;
3091 case OP_OPEN_EQUIV_CLASS:
3092 elem->type = EQUIV_CLASS;
3093 break;
3094 case OP_OPEN_CHAR_CLASS:
3095 elem->type = CHAR_CLASS;
3096 break;
3097 default:
3098 break;
3099 }
3100 return REG_NOERROR;
3101 }
3102
3103 /* Helper function for parse_bracket_exp.
3104 Build the equivalence class which is represented by NAME.
3105 The result are written to MBCSET and SBCSET.
3106 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3107 is a pointer argument sinse we may update it. */
3108
3109 static reg_errcode_t
3110 #ifdef RE_ENABLE_I18N
3111 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3112 re_charset_t *mbcset;
3113 int *equiv_class_alloc;
3114 #else /* not RE_ENABLE_I18N */
3115 build_equiv_class (sbcset, name)
3116 #endif /* not RE_ENABLE_I18N */
3117 re_bitset_ptr_t sbcset;
3118 const unsigned char *name;
3119 {
3120 #if defined _LIBC && defined RE_ENABLE_I18N
3121 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122 if (nrules != 0)
3123 {
3124 const int32_t *table, *indirect;
3125 const unsigned char *weights, *extra, *cp;
3126 unsigned char char_buf[2];
3127 int32_t idx1, idx2;
3128 unsigned int ch;
3129 size_t len;
3130 /* This #include defines a local function! */
3131 # include <locale/weight.h>
3132 /* Calculate the index for equivalence class. */
3133 cp = name;
3134 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3135 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136 _NL_COLLATE_WEIGHTMB);
3137 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3138 _NL_COLLATE_EXTRAMB);
3139 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3140 _NL_COLLATE_INDIRECTMB);
3141 idx1 = findidx (&cp);
3142 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3143 /* This isn't a valid character. */
3144 return REG_ECOLLATE;
3145
3146 /* Build single byte matcing table for this equivalence class. */
3147 char_buf[1] = (unsigned char) '\0';
3148 len = weights[idx1];
3149 for (ch = 0; ch < SBC_MAX; ++ch)
3150 {
3151 char_buf[0] = ch;
3152 cp = char_buf;
3153 idx2 = findidx (&cp);
3154 /*
3155 idx2 = table[ch];
3156 */
3157 if (idx2 == 0)
3158 /* This isn't a valid character. */
3159 continue;
3160 if (len == weights[idx2])
3161 {
3162 int cnt = 0;
3163 while (cnt <= len &&
3164 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3165 ++cnt;
3166
3167 if (cnt > len)
3168 bitset_set (sbcset, ch);
3169 }
3170 }
3171 /* Check whether the array has enough space. */
3172 if (*equiv_class_alloc == mbcset->nequiv_classes)
3173 {
3174 /* Not enough, realloc it. */
3175 /* +1 in case of mbcset->nequiv_classes is 0. */
3176 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3177 /* Use realloc since the array is NULL if *alloc == 0. */
3178 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3179 *equiv_class_alloc);
3180 if (BE (mbcset->equiv_classes == NULL, 0))
3181 return REG_ESPACE;
3182 }
3183 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3184 }
3185 else
3186 #endif /* _LIBC && RE_ENABLE_I18N */
3187 {
3188 if (BE (strlen ((const char *) name) != 1, 0))
3189 return REG_ECOLLATE;
3190 bitset_set (sbcset, *name);
3191 }
3192 return REG_NOERROR;
3193 }
3194
3195 /* Helper function for parse_bracket_exp.
3196 Build the character class which is represented by NAME.
3197 The result are written to MBCSET and SBCSET.
3198 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3199 is a pointer argument sinse we may update it. */
3200
3201 static reg_errcode_t
3202 #ifdef RE_ENABLE_I18N
3203 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3204 re_charset_t *mbcset;
3205 int *char_class_alloc;
3206 #else /* not RE_ENABLE_I18N */
3207 build_charclass (sbcset, class_name, syntax)
3208 #endif /* not RE_ENABLE_I18N */
3209 re_bitset_ptr_t sbcset;
3210 const unsigned char *class_name;
3211 reg_syntax_t syntax;
3212 {
3213 int i;
3214 const char *name = (const char *) class_name;
3215
3216 /* In case of REG_ICASE "upper" and "lower" match the both of
3217 upper and lower cases. */
3218 if ((syntax & RE_ICASE)
3219 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3220 name = "alpha";
3221
3222 #ifdef RE_ENABLE_I18N
3223 /* Check the space of the arrays. */
3224 if (*char_class_alloc == mbcset->nchar_classes)
3225 {
3226 /* Not enough, realloc it. */
3227 /* +1 in case of mbcset->nchar_classes is 0. */
3228 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3229 /* Use realloc since array is NULL if *alloc == 0. */
3230 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3231 *char_class_alloc);
3232 if (BE (mbcset->char_classes == NULL, 0))
3233 return REG_ESPACE;
3234 }
3235 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3236 #endif /* RE_ENABLE_I18N */
3237
3238 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3239 for (i = 0; i < SBC_MAX; ++i) \
3240 { \
3241 if (ctype_func (i)) \
3242 bitset_set (sbcset, i); \
3243 }
3244
3245 if (strcmp (name, "alnum") == 0)
3246 BUILD_CHARCLASS_LOOP (isalnum)
3247 else if (strcmp (name, "cntrl") == 0)
3248 BUILD_CHARCLASS_LOOP (iscntrl)
3249 else if (strcmp (name, "lower") == 0)
3250 BUILD_CHARCLASS_LOOP (islower)
3251 else if (strcmp (name, "space") == 0)
3252 BUILD_CHARCLASS_LOOP (isspace)
3253 else if (strcmp (name, "alpha") == 0)
3254 BUILD_CHARCLASS_LOOP (isalpha)
3255 else if (strcmp (name, "digit") == 0)
3256 BUILD_CHARCLASS_LOOP (isdigit)
3257 else if (strcmp (name, "print") == 0)
3258 BUILD_CHARCLASS_LOOP (isprint)
3259 else if (strcmp (name, "upper") == 0)
3260 BUILD_CHARCLASS_LOOP (isupper)
3261 else if (strcmp (name, "blank") == 0)
3262 BUILD_CHARCLASS_LOOP (isblank)
3263 else if (strcmp (name, "graph") == 0)
3264 BUILD_CHARCLASS_LOOP (isgraph)
3265 else if (strcmp (name, "punct") == 0)
3266 BUILD_CHARCLASS_LOOP (ispunct)
3267 else if (strcmp (name, "xdigit") == 0)
3268 BUILD_CHARCLASS_LOOP (isxdigit)
3269 else
3270 return REG_ECTYPE;
3271
3272 return REG_NOERROR;
3273 }
3274
3275 static bin_tree_t *
3276 build_word_op (dfa, not, err)
3277 re_dfa_t *dfa;
3278 int not;
3279 reg_errcode_t *err;
3280 {
3281 re_bitset_ptr_t sbcset;
3282 #ifdef RE_ENABLE_I18N
3283 re_charset_t *mbcset;
3284 int alloc = 0;
3285 #else /* not RE_ENABLE_I18N */
3286 int non_match = 0;
3287 #endif /* not RE_ENABLE_I18N */
3288 reg_errcode_t ret;
3289 re_token_t br_token;
3290 bin_tree_t *tree;
3291 int new_idx;
3292
3293 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3294 #ifdef RE_ENABLE_I18N
3295 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3296 #endif /* RE_ENABLE_I18N */
3297
3298 #ifdef RE_ENABLE_I18N
3299 if (BE (sbcset == NULL || mbcset == NULL, 0))
3300 #else /* not RE_ENABLE_I18N */
3301 if (BE (sbcset == NULL, 0))
3302 #endif /* not RE_ENABLE_I18N */
3303 {
3304 *err = REG_ESPACE;
3305 return NULL;
3306 }
3307
3308 if (not)
3309 {
3310 #ifdef RE_ENABLE_I18N
3311 int i;
3312 /*
3313 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3314 bitset_set(cset->sbcset, '\0');
3315 */
3316 mbcset->non_match = 1;
3317 if (MB_CUR_MAX > 1)
3318 for (i = 0; i < SBC_MAX; ++i)
3319 if (__btowc (i) == WEOF)
3320 bitset_set (sbcset, i);
3321 #else /* not RE_ENABLE_I18N */
3322 non_match = 1;
3323 #endif /* not RE_ENABLE_I18N */
3324 }
3325
3326 /* We don't care the syntax in this case. */
3327 ret = build_charclass (sbcset,
3328 #ifdef RE_ENABLE_I18N
3329 mbcset, &alloc,
3330 #endif /* RE_ENABLE_I18N */
3331 (const unsigned char *) "alpha", 0);
3332
3333 if (BE (ret != REG_NOERROR, 0))
3334 {
3335 re_free (sbcset);
3336 #ifdef RE_ENABLE_I18N
3337 free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3339 *err = ret;
3340 return NULL;
3341 }
3342 /* \w match '_' also. */
3343 bitset_set (sbcset, '_');
3344
3345 /* If it is non-matching list. */
3346 #ifdef RE_ENABLE_I18N
3347 if (mbcset->non_match)
3348 #else /* not RE_ENABLE_I18N */
3349 if (non_match)
3350 #endif /* not RE_ENABLE_I18N */
3351 bitset_not (sbcset);
3352
3353 /* Build a tree for simple bracket. */
3354 br_token.type = SIMPLE_BRACKET;
3355 br_token.opr.sbcset = sbcset;
3356 new_idx = re_dfa_add_node (dfa, br_token, 0);
3357 tree = create_tree (NULL, NULL, 0, new_idx);
3358 if (BE (new_idx == -1 || tree == NULL, 0))
3359 goto build_word_op_espace;
3360
3361 #ifdef RE_ENABLE_I18N
3362 if (MB_CUR_MAX > 1)
3363 {
3364 re_token_t alt_token;
3365 bin_tree_t *mbc_tree;
3366 /* Build a tree for complex bracket. */
3367 br_token.type = COMPLEX_BRACKET;
3368 br_token.opr.mbcset = mbcset;
3369 dfa->has_mb_node = 1;
3370 new_idx = re_dfa_add_node (dfa, br_token, 0);
3371 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3372 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3373 goto build_word_op_espace;
3374 /* Then join them by ALT node. */
3375 alt_token.type = OP_ALT;
3376 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3377 tree = create_tree (tree, mbc_tree, 0, new_idx);
3378 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3379 return tree;
3380 }
3381 else
3382 {
3383 free_charset (mbcset);
3384 return tree;
3385 }
3386 #else /* not RE_ENABLE_I18N */
3387 return tree;
3388 #endif /* not RE_ENABLE_I18N */
3389
3390 build_word_op_espace:
3391 re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393 free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395 *err = REG_ESPACE;
3396 return NULL;
3397 }
3398
3399 /* This is intended for the expressions like "a{1,3}".
3400 Fetch a number from `input', and return the number.
3401 Return -1, if the number field is empty like "{,1}".
3402 Return -2, If an error is occured. */
3403
3404 static int
3405 fetch_number (input, token, syntax)
3406 re_string_t *input;
3407 re_token_t *token;
3408 reg_syntax_t syntax;
3409 {
3410 int num = -1;
3411 unsigned char c;
3412 while (1)
3413 {
3414 *token = fetch_token (input, syntax);
3415 c = token->opr.c;
3416 if (BE (token->type == END_OF_RE, 0))
3417 return -2;
3418 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3419 break;
3420 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3421 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3422 num = (num > RE_DUP_MAX) ? -2 : num;
3423 }
3424 return num;
3425 }
3426 \f
3427 #ifdef RE_ENABLE_I18N
3428 static void
3429 free_charset (re_charset_t *cset)
3430 {
3431 re_free (cset->mbchars);
3432 # ifdef _LIBC
3433 re_free (cset->coll_syms);
3434 re_free (cset->equiv_classes);
3435 re_free (cset->range_starts);
3436 re_free (cset->range_ends);
3437 # endif
3438 re_free (cset->char_classes);
3439 re_free (cset);
3440 }
3441 #endif /* RE_ENABLE_I18N */
3442 \f
3443 /* Functions for binary tree operation. */
3444
3445 /* Create a node of tree.
3446 Note: This function automatically free left and right if malloc fails. */
3447
3448 static bin_tree_t *
3449 create_tree (left, right, type, index)
3450 bin_tree_t *left;
3451 bin_tree_t *right;
3452 re_token_type_t type;
3453 int index;
3454 {
3455 bin_tree_t *tree;
3456 tree = re_malloc (bin_tree_t, 1);
3457 if (BE (tree == NULL, 0))
3458 {
3459 free_bin_tree (left);
3460 free_bin_tree (right);
3461 return NULL;
3462 }
3463 tree->parent = NULL;
3464 tree->left = left;
3465 tree->right = right;
3466 tree->type = type;
3467 tree->node_idx = index;
3468 tree->first = -1;
3469 tree->next = -1;
3470 re_node_set_init_empty (&tree->eclosure);
3471
3472 if (left != NULL)
3473 left->parent = tree;
3474 if (right != NULL)
3475 right->parent = tree;
3476 return tree;
3477 }
3478
3479 /* Free the sub tree pointed by TREE. */
3480
3481 static void
3482 free_bin_tree (tree)
3483 bin_tree_t *tree;
3484 {
3485 if (tree == NULL)
3486 return;
3487 /*re_node_set_free (&tree->eclosure);*/
3488 free_bin_tree (tree->left);
3489 free_bin_tree (tree->right);
3490 re_free (tree);
3491 }
3492
3493 /* Duplicate the node SRC, and return new node. */
3494
3495 static bin_tree_t *
3496 duplicate_tree (src, dfa)
3497 const bin_tree_t *src;
3498 re_dfa_t *dfa;
3499 {
3500 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3501 int new_node_idx;
3502 /* Since node indies must be according to Post-order of the tree,
3503 we must duplicate the left at first. */
3504 if (src->left != NULL)
3505 {
3506 left = duplicate_tree (src->left, dfa);
3507 if (left == NULL)
3508 return NULL;
3509 }
3510
3511 /* Secondaly, duplicate the right. */
3512 if (src->right != NULL)
3513 {
3514 right = duplicate_tree (src->right, dfa);
3515 if (right == NULL)
3516 {
3517 free_bin_tree (left);
3518 return NULL;
3519 }
3520 }
3521
3522 /* At last, duplicate itself. */
3523 if (src->type == NON_TYPE)
3524 {
3525 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3526 dfa->nodes[new_node_idx].duplicated = 1;
3527 if (BE (new_node_idx == -1, 0))
3528 {
3529 free_bin_tree (left);
3530 free_bin_tree (right);
3531 return NULL;
3532 }
3533 }
3534 else
3535 new_node_idx = src->type;
3536
3537 new_tree = create_tree (left, right, src->type, new_node_idx);
3538 if (BE (new_tree == NULL, 0))
3539 {
3540 free_bin_tree (left);
3541 free_bin_tree (right);
3542 }
3543 return new_tree;
3544 }