]> git.proxmox.com Git - mirror_smartmontools-debian.git/blob - regex/regexec.c
import smartmontools 7.0
[mirror_smartmontools-debian.git] / regex / regexec.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174 \f
175 /* Entry point for POSIX code. */
176
177 /* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
179
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
184
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
188
189 We return 0 if we find a match and REG_NOMATCH if not. */
190
191 int
192 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
194 {
195 reg_errcode_t err;
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
198
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
201
202 if (eflags & REG_STARTEND)
203 {
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
206 }
207 else
208 {
209 start = 0;
210 length = strlen (string);
211 }
212
213 lock_lock (dfa->lock);
214 if (preg->no_sub)
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
217 else
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
221 return err != REG_NOERROR;
222 }
223
224 #ifdef _LIBC
225 libc_hidden_def (__regexec)
226
227 # include <shlib-compat.h>
228 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
229
230 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231 __typeof__ (__regexec) __compat_regexec;
232
233 int
234 attribute_compat_text_section
235 __compat_regexec (const regex_t *_Restrict_ preg,
236 const char *_Restrict_ string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
238 {
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
241 }
242 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243 # endif
244 #endif
245
246 /* Entry points for GNU code. */
247
248 /* re_match, re_search, re_match_2, re_search_2
249
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
253
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
256
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
261
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
265
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267 and all groups is stored in REGS. (For the "_2" variants, the offsets are
268 computed relative to the concatenation, not relative to the individual
269 strings.)
270
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
274
275 regoff_t
276 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
278 {
279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
280 }
281 #ifdef _LIBC
282 weak_alias (__re_match, re_match)
283 #endif
284
285 regoff_t
286 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
288 {
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
291 }
292 #ifdef _LIBC
293 weak_alias (__re_search, re_search)
294 #endif
295
296 regoff_t
297 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
300 {
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
302 start, 0, regs, stop, true);
303 }
304 #ifdef _LIBC
305 weak_alias (__re_match_2, re_match_2)
306 #endif
307
308 regoff_t
309 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
312 {
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, range, regs, stop, false);
315 }
316 #ifdef _LIBC
317 weak_alias (__re_search_2, re_search_2)
318 #endif
319
320 static regoff_t
321 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
325 {
326 const char *str;
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
330
331 if (BE ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len)),
333 0))
334 return -2;
335
336 /* Concatenate the strings. */
337 if (length2 > 0)
338 if (length1 > 0)
339 {
340 s = re_malloc (char, len);
341
342 if (BE (s == NULL, 0))
343 return -2;
344 #ifdef _LIBC
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
349 #endif
350 str = s;
351 }
352 else
353 str = string2;
354 else
355 str = string1;
356
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 ret_len);
359 re_free (s);
360 return rval;
361 }
362
363 /* The parameters have the same meaning as those of re_search.
364 Additional parameters:
365 If RET_LEN is true the length of the match is returned (re_match style);
366 otherwise the position of the match is returned. */
367
368 static regoff_t
369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 bool ret_len)
372 {
373 reg_errcode_t result;
374 regmatch_t *pmatch;
375 Idx nregs;
376 regoff_t rval;
377 int eflags = 0;
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
380
381 /* Check for out-of-range. */
382 if (BE (start < 0 || start > length, 0))
383 return -1;
384 if (BE (length < last_start || (0 <= range && last_start < start), 0))
385 last_start = length;
386 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
387 last_start = 0;
388
389 lock_lock (dfa->lock);
390
391 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
392 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
393
394 /* Compile fastmap if we haven't yet. */
395 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
396 re_compile_fastmap (bufp);
397
398 if (BE (bufp->no_sub, 0))
399 regs = NULL;
400
401 /* We need at least 1 register. */
402 if (regs == NULL)
403 nregs = 1;
404 else if (BE (bufp->regs_allocated == REGS_FIXED
405 && regs->num_regs <= bufp->re_nsub, 0))
406 {
407 nregs = regs->num_regs;
408 if (BE (nregs < 1, 0))
409 {
410 /* Nothing can be copied to regs. */
411 regs = NULL;
412 nregs = 1;
413 }
414 }
415 else
416 nregs = bufp->re_nsub + 1;
417 pmatch = re_malloc (regmatch_t, nregs);
418 if (BE (pmatch == NULL, 0))
419 {
420 rval = -2;
421 goto out;
422 }
423
424 result = re_search_internal (bufp, string, length, start, last_start, stop,
425 nregs, pmatch, eflags);
426
427 rval = 0;
428
429 /* I hope we needn't fill their regs with -1's when no match was found. */
430 if (result != REG_NOERROR)
431 rval = result == REG_NOMATCH ? -1 : -2;
432 else if (regs != NULL)
433 {
434 /* If caller wants register contents data back, copy them. */
435 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
436 bufp->regs_allocated);
437 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
438 rval = -2;
439 }
440
441 if (BE (rval == 0, 1))
442 {
443 if (ret_len)
444 {
445 assert (pmatch[0].rm_so == start);
446 rval = pmatch[0].rm_eo - start;
447 }
448 else
449 rval = pmatch[0].rm_so;
450 }
451 re_free (pmatch);
452 out:
453 lock_unlock (dfa->lock);
454 return rval;
455 }
456
457 static unsigned
458 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
459 int regs_allocated)
460 {
461 int rval = REGS_REALLOCATE;
462 Idx i;
463 Idx need_regs = nregs + 1;
464 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
465 uses. */
466
467 /* Have the register data arrays been allocated? */
468 if (regs_allocated == REGS_UNALLOCATED)
469 { /* No. So allocate them with malloc. */
470 regs->start = re_malloc (regoff_t, need_regs);
471 if (BE (regs->start == NULL, 0))
472 return REGS_UNALLOCATED;
473 regs->end = re_malloc (regoff_t, need_regs);
474 if (BE (regs->end == NULL, 0))
475 {
476 re_free (regs->start);
477 return REGS_UNALLOCATED;
478 }
479 regs->num_regs = need_regs;
480 }
481 else if (regs_allocated == REGS_REALLOCATE)
482 { /* Yes. If we need more elements than were already
483 allocated, reallocate them. If we need fewer, just
484 leave it alone. */
485 if (BE (need_regs > regs->num_regs, 0))
486 {
487 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
488 regoff_t *new_end;
489 if (BE (new_start == NULL, 0))
490 return REGS_UNALLOCATED;
491 new_end = re_realloc (regs->end, regoff_t, need_regs);
492 if (BE (new_end == NULL, 0))
493 {
494 re_free (new_start);
495 return REGS_UNALLOCATED;
496 }
497 regs->start = new_start;
498 regs->end = new_end;
499 regs->num_regs = need_regs;
500 }
501 }
502 else
503 {
504 assert (regs_allocated == REGS_FIXED);
505 /* This function may not be called with REGS_FIXED and nregs too big. */
506 assert (regs->num_regs >= nregs);
507 rval = REGS_FIXED;
508 }
509
510 /* Copy the regs. */
511 for (i = 0; i < nregs; ++i)
512 {
513 regs->start[i] = pmatch[i].rm_so;
514 regs->end[i] = pmatch[i].rm_eo;
515 }
516 for ( ; i < regs->num_regs; ++i)
517 regs->start[i] = regs->end[i] = -1;
518
519 return rval;
520 }
521
522 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
523 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
524 this memory for recording register information. STARTS and ENDS
525 must be allocated using the malloc library routine, and must each
526 be at least NUM_REGS * sizeof (regoff_t) bytes long.
527
528 If NUM_REGS == 0, then subsequent matches should allocate their own
529 register data.
530
531 Unless this function is called, the first search or match using
532 PATTERN_BUFFER will allocate its own register data, without
533 freeing the old data. */
534
535 void
536 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
537 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
538 {
539 if (num_regs)
540 {
541 bufp->regs_allocated = REGS_REALLOCATE;
542 regs->num_regs = num_regs;
543 regs->start = starts;
544 regs->end = ends;
545 }
546 else
547 {
548 bufp->regs_allocated = REGS_UNALLOCATED;
549 regs->num_regs = 0;
550 regs->start = regs->end = NULL;
551 }
552 }
553 #ifdef _LIBC
554 weak_alias (__re_set_registers, re_set_registers)
555 #endif
556 \f
557 /* Entry points compatible with 4.2 BSD regex library. We don't define
558 them unless specifically requested. */
559
560 #if defined _REGEX_RE_COMP || defined _LIBC
561 int
562 # ifdef _LIBC
563 weak_function
564 # endif
565 re_exec (const char *s)
566 {
567 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
568 }
569 #endif /* _REGEX_RE_COMP */
570 \f
571 /* Internal entry point. */
572
573 /* Searches for a compiled pattern PREG in the string STRING, whose
574 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
575 meaning as with regexec. LAST_START is START + RANGE, where
576 START and RANGE have the same meaning as with re_search.
577 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
578 otherwise return the error code.
579 Note: We assume front end functions already check ranges.
580 (0 <= LAST_START && LAST_START <= LENGTH) */
581
582 static reg_errcode_t
583 __attribute_warn_unused_result__
584 re_search_internal (const regex_t *preg, const char *string, Idx length,
585 Idx start, Idx last_start, Idx stop, size_t nmatch,
586 regmatch_t pmatch[], int eflags)
587 {
588 reg_errcode_t err;
589 const re_dfa_t *dfa = preg->buffer;
590 Idx left_lim, right_lim;
591 int incr;
592 bool fl_longest_match;
593 int match_kind;
594 Idx match_first;
595 Idx match_last = -1;
596 Idx extra_nmatch;
597 bool sb;
598 int ch;
599 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
600 re_match_context_t mctx = { .dfa = dfa };
601 #else
602 re_match_context_t mctx;
603 #endif
604 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
605 && start != last_start && !preg->can_be_null)
606 ? preg->fastmap : NULL);
607 RE_TRANSLATE_TYPE t = preg->translate;
608
609 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
610 memset (&mctx, '\0', sizeof (re_match_context_t));
611 mctx.dfa = dfa;
612 #endif
613
614 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
615 nmatch -= extra_nmatch;
616
617 /* Check if the DFA haven't been compiled. */
618 if (BE (preg->used == 0 || dfa->init_state == NULL
619 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
620 || dfa->init_state_begbuf == NULL, 0))
621 return REG_NOMATCH;
622
623 #ifdef DEBUG
624 /* We assume front-end functions already check them. */
625 assert (0 <= last_start && last_start <= length);
626 #endif
627
628 /* If initial states with non-begbuf contexts have no elements,
629 the regex must be anchored. If preg->newline_anchor is set,
630 we'll never use init_state_nl, so do not check it. */
631 if (dfa->init_state->nodes.nelem == 0
632 && dfa->init_state_word->nodes.nelem == 0
633 && (dfa->init_state_nl->nodes.nelem == 0
634 || !preg->newline_anchor))
635 {
636 if (start != 0 && last_start != 0)
637 return REG_NOMATCH;
638 start = last_start = 0;
639 }
640
641 /* We must check the longest matching, if nmatch > 0. */
642 fl_longest_match = (nmatch != 0 || dfa->nbackref);
643
644 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
645 preg->translate, (preg->syntax & RE_ICASE) != 0,
646 dfa);
647 if (BE (err != REG_NOERROR, 0))
648 goto free_return;
649 mctx.input.stop = stop;
650 mctx.input.raw_stop = stop;
651 mctx.input.newline_anchor = preg->newline_anchor;
652
653 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
654 if (BE (err != REG_NOERROR, 0))
655 goto free_return;
656
657 /* We will log all the DFA states through which the dfa pass,
658 if nmatch > 1, or this dfa has "multibyte node", which is a
659 back-reference or a node which can accept multibyte character or
660 multi character collating element. */
661 if (nmatch > 1 || dfa->has_mb_node)
662 {
663 /* Avoid overflow. */
664 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
665 <= mctx.input.bufs_len), 0))
666 {
667 err = REG_ESPACE;
668 goto free_return;
669 }
670
671 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
672 if (BE (mctx.state_log == NULL, 0))
673 {
674 err = REG_ESPACE;
675 goto free_return;
676 }
677 }
678 else
679 mctx.state_log = NULL;
680
681 match_first = start;
682 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
683 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684
685 /* Check incrementally whether the input string matches. */
686 incr = (last_start < start) ? -1 : 1;
687 left_lim = (last_start < start) ? last_start : start;
688 right_lim = (last_start < start) ? start : last_start;
689 sb = dfa->mb_cur_max == 1;
690 match_kind =
691 (fastmap
692 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
693 | (start <= last_start ? 2 : 0)
694 | (t != NULL ? 1 : 0))
695 : 8);
696
697 for (;; match_first += incr)
698 {
699 err = REG_NOMATCH;
700 if (match_first < left_lim || right_lim < match_first)
701 goto free_return;
702
703 /* Advance as rapidly as possible through the string, until we
704 find a plausible place to start matching. This may be done
705 with varying efficiency, so there are various possibilities:
706 only the most common of them are specialized, in order to
707 save on code size. We use a switch statement for speed. */
708 switch (match_kind)
709 {
710 case 8:
711 /* No fastmap. */
712 break;
713
714 case 7:
715 /* Fastmap with single-byte translation, match forward. */
716 while (BE (match_first < right_lim, 1)
717 && !fastmap[t[(unsigned char) string[match_first]]])
718 ++match_first;
719 goto forward_match_found_start_or_reached_end;
720
721 case 6:
722 /* Fastmap without translation, match forward. */
723 while (BE (match_first < right_lim, 1)
724 && !fastmap[(unsigned char) string[match_first]])
725 ++match_first;
726
727 forward_match_found_start_or_reached_end:
728 if (BE (match_first == right_lim, 0))
729 {
730 ch = match_first >= length
731 ? 0 : (unsigned char) string[match_first];
732 if (!fastmap[t ? t[ch] : ch])
733 goto free_return;
734 }
735 break;
736
737 case 4:
738 case 5:
739 /* Fastmap without multi-byte translation, match backwards. */
740 while (match_first >= left_lim)
741 {
742 ch = match_first >= length
743 ? 0 : (unsigned char) string[match_first];
744 if (fastmap[t ? t[ch] : ch])
745 break;
746 --match_first;
747 }
748 if (match_first < left_lim)
749 goto free_return;
750 break;
751
752 default:
753 /* In this case, we can't determine easily the current byte,
754 since it might be a component byte of a multibyte
755 character. Then we use the constructed buffer instead. */
756 for (;;)
757 {
758 /* If MATCH_FIRST is out of the valid range, reconstruct the
759 buffers. */
760 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
761 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
762 {
763 err = re_string_reconstruct (&mctx.input, match_first,
764 eflags);
765 if (BE (err != REG_NOERROR, 0))
766 goto free_return;
767
768 offset = match_first - mctx.input.raw_mbs_idx;
769 }
770 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
771 Note that MATCH_FIRST must not be smaller than 0. */
772 ch = (match_first >= length
773 ? 0 : re_string_byte_at (&mctx.input, offset));
774 if (fastmap[ch])
775 break;
776 match_first += incr;
777 if (match_first < left_lim || match_first > right_lim)
778 {
779 err = REG_NOMATCH;
780 goto free_return;
781 }
782 }
783 break;
784 }
785
786 /* Reconstruct the buffers so that the matcher can assume that
787 the matching starts from the beginning of the buffer. */
788 err = re_string_reconstruct (&mctx.input, match_first, eflags);
789 if (BE (err != REG_NOERROR, 0))
790 goto free_return;
791
792 #ifdef RE_ENABLE_I18N
793 /* Don't consider this char as a possible match start if it part,
794 yet isn't the head, of a multibyte character. */
795 if (!sb && !re_string_first_byte (&mctx.input, 0))
796 continue;
797 #endif
798
799 /* It seems to be appropriate one, then use the matcher. */
800 /* We assume that the matching starts from 0. */
801 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
802 match_last = check_matching (&mctx, fl_longest_match,
803 start <= last_start ? &match_first : NULL);
804 if (match_last != -1)
805 {
806 if (BE (match_last == -2, 0))
807 {
808 err = REG_ESPACE;
809 goto free_return;
810 }
811 else
812 {
813 mctx.match_last = match_last;
814 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
815 {
816 re_dfastate_t *pstate = mctx.state_log[match_last];
817 mctx.last_node = check_halt_state_context (&mctx, pstate,
818 match_last);
819 }
820 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
821 || dfa->nbackref)
822 {
823 err = prune_impossible_nodes (&mctx);
824 if (err == REG_NOERROR)
825 break;
826 if (BE (err != REG_NOMATCH, 0))
827 goto free_return;
828 match_last = -1;
829 }
830 else
831 break; /* We found a match. */
832 }
833 }
834
835 match_ctx_clean (&mctx);
836 }
837
838 #ifdef DEBUG
839 assert (match_last != -1);
840 assert (err == REG_NOERROR);
841 #endif
842
843 /* Set pmatch[] if we need. */
844 if (nmatch > 0)
845 {
846 Idx reg_idx;
847
848 /* Initialize registers. */
849 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
850 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851
852 /* Set the points where matching start/end. */
853 pmatch[0].rm_so = 0;
854 pmatch[0].rm_eo = mctx.match_last;
855 /* FIXME: This function should fail if mctx.match_last exceeds
856 the maximum possible regoff_t value. We need a new error
857 code REG_OVERFLOW. */
858
859 if (!preg->no_sub && nmatch > 1)
860 {
861 err = set_regs (preg, &mctx, nmatch, pmatch,
862 dfa->has_plural_match && dfa->nbackref > 0);
863 if (BE (err != REG_NOERROR, 0))
864 goto free_return;
865 }
866
867 /* At last, add the offset to each register, since we slid
868 the buffers so that we could assume that the matching starts
869 from 0. */
870 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
871 if (pmatch[reg_idx].rm_so != -1)
872 {
873 #ifdef RE_ENABLE_I18N
874 if (BE (mctx.input.offsets_needed != 0, 0))
875 {
876 pmatch[reg_idx].rm_so =
877 (pmatch[reg_idx].rm_so == mctx.input.valid_len
878 ? mctx.input.valid_raw_len
879 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
880 pmatch[reg_idx].rm_eo =
881 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
882 ? mctx.input.valid_raw_len
883 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
884 }
885 #else
886 assert (mctx.input.offsets_needed == 0);
887 #endif
888 pmatch[reg_idx].rm_so += match_first;
889 pmatch[reg_idx].rm_eo += match_first;
890 }
891 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
892 {
893 pmatch[nmatch + reg_idx].rm_so = -1;
894 pmatch[nmatch + reg_idx].rm_eo = -1;
895 }
896
897 if (dfa->subexp_map)
898 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
899 if (dfa->subexp_map[reg_idx] != reg_idx)
900 {
901 pmatch[reg_idx + 1].rm_so
902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
903 pmatch[reg_idx + 1].rm_eo
904 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 }
906 }
907
908 free_return:
909 re_free (mctx.state_log);
910 if (dfa->nbackref)
911 match_ctx_free (&mctx);
912 re_string_destruct (&mctx.input);
913 return err;
914 }
915
916 static reg_errcode_t
917 __attribute_warn_unused_result__
918 prune_impossible_nodes (re_match_context_t *mctx)
919 {
920 const re_dfa_t *const dfa = mctx->dfa;
921 Idx halt_node, match_last;
922 reg_errcode_t ret;
923 re_dfastate_t **sifted_states;
924 re_dfastate_t **lim_states = NULL;
925 re_sift_context_t sctx;
926 #ifdef DEBUG
927 assert (mctx->state_log != NULL);
928 #endif
929 match_last = mctx->match_last;
930 halt_node = mctx->last_node;
931
932 /* Avoid overflow. */
933 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
934 return REG_ESPACE;
935
936 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
937 if (BE (sifted_states == NULL, 0))
938 {
939 ret = REG_ESPACE;
940 goto free_return;
941 }
942 if (dfa->nbackref)
943 {
944 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
945 if (BE (lim_states == NULL, 0))
946 {
947 ret = REG_ESPACE;
948 goto free_return;
949 }
950 while (1)
951 {
952 memset (lim_states, '\0',
953 sizeof (re_dfastate_t *) * (match_last + 1));
954 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
955 match_last);
956 ret = sift_states_backward (mctx, &sctx);
957 re_node_set_free (&sctx.limits);
958 if (BE (ret != REG_NOERROR, 0))
959 goto free_return;
960 if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 break;
962 do
963 {
964 --match_last;
965 if (match_last < 0)
966 {
967 ret = REG_NOMATCH;
968 goto free_return;
969 }
970 } while (mctx->state_log[match_last] == NULL
971 || !mctx->state_log[match_last]->halt);
972 halt_node = check_halt_state_context (mctx,
973 mctx->state_log[match_last],
974 match_last);
975 }
976 ret = merge_state_array (dfa, sifted_states, lim_states,
977 match_last + 1);
978 re_free (lim_states);
979 lim_states = NULL;
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 }
983 else
984 {
985 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
986 ret = sift_states_backward (mctx, &sctx);
987 re_node_set_free (&sctx.limits);
988 if (BE (ret != REG_NOERROR, 0))
989 goto free_return;
990 if (sifted_states[0] == NULL)
991 {
992 ret = REG_NOMATCH;
993 goto free_return;
994 }
995 }
996 re_free (mctx->state_log);
997 mctx->state_log = sifted_states;
998 sifted_states = NULL;
999 mctx->last_node = halt_node;
1000 mctx->match_last = match_last;
1001 ret = REG_NOERROR;
1002 free_return:
1003 re_free (sifted_states);
1004 re_free (lim_states);
1005 return ret;
1006 }
1007
1008 /* Acquire an initial state and return it.
1009 We must select appropriate initial state depending on the context,
1010 since initial states may have constraints like "\<", "^", etc.. */
1011
1012 static inline re_dfastate_t *
1013 __attribute__ ((always_inline))
1014 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015 Idx idx)
1016 {
1017 const re_dfa_t *const dfa = mctx->dfa;
1018 if (dfa->init_state->has_constraint)
1019 {
1020 unsigned int context;
1021 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022 if (IS_WORD_CONTEXT (context))
1023 return dfa->init_state_word;
1024 else if (IS_ORDINARY_CONTEXT (context))
1025 return dfa->init_state;
1026 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027 return dfa->init_state_begbuf;
1028 else if (IS_NEWLINE_CONTEXT (context))
1029 return dfa->init_state_nl;
1030 else if (IS_BEGBUF_CONTEXT (context))
1031 {
1032 /* It is relatively rare case, then calculate on demand. */
1033 return re_acquire_state_context (err, dfa,
1034 dfa->init_state->entrance_nodes,
1035 context);
1036 }
1037 else
1038 /* Must not happen? */
1039 return dfa->init_state;
1040 }
1041 else
1042 return dfa->init_state;
1043 }
1044
1045 /* Check whether the regular expression match input string INPUT or not,
1046 and return the index where the matching end. Return -1 if
1047 there is no match, and return -2 in case of an error.
1048 FL_LONGEST_MATCH means we want the POSIX longest matching.
1049 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050 next place where we may want to try matching.
1051 Note that the matcher assumes that the matching starts from the current
1052 index of the buffer. */
1053
1054 static Idx
1055 __attribute_warn_unused_result__
1056 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057 Idx *p_match_first)
1058 {
1059 const re_dfa_t *const dfa = mctx->dfa;
1060 reg_errcode_t err;
1061 Idx match = 0;
1062 Idx match_last = -1;
1063 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1064 re_dfastate_t *cur_state;
1065 bool at_init_state = p_match_first != NULL;
1066 Idx next_start_idx = cur_str_idx;
1067
1068 err = REG_NOERROR;
1069 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070 /* An initial state must not be NULL (invalid). */
1071 if (BE (cur_state == NULL, 0))
1072 {
1073 assert (err == REG_ESPACE);
1074 return -2;
1075 }
1076
1077 if (mctx->state_log != NULL)
1078 {
1079 mctx->state_log[cur_str_idx] = cur_state;
1080
1081 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082 later. E.g. Processing back references. */
1083 if (BE (dfa->nbackref, 0))
1084 {
1085 at_init_state = false;
1086 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087 if (BE (err != REG_NOERROR, 0))
1088 return err;
1089
1090 if (cur_state->has_backref)
1091 {
1092 err = transit_state_bkref (mctx, &cur_state->nodes);
1093 if (BE (err != REG_NOERROR, 0))
1094 return err;
1095 }
1096 }
1097 }
1098
1099 /* If the RE accepts NULL string. */
1100 if (BE (cur_state->halt, 0))
1101 {
1102 if (!cur_state->has_constraint
1103 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1104 {
1105 if (!fl_longest_match)
1106 return cur_str_idx;
1107 else
1108 {
1109 match_last = cur_str_idx;
1110 match = 1;
1111 }
1112 }
1113 }
1114
1115 while (!re_string_eoi (&mctx->input))
1116 {
1117 re_dfastate_t *old_state = cur_state;
1118 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119
1120 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121 && mctx->input.bufs_len < mctx->input.len)
1122 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123 && mctx->input.valid_len < mctx->input.len))
1124 {
1125 err = extend_buffers (mctx, next_char_idx + 1);
1126 if (BE (err != REG_NOERROR, 0))
1127 {
1128 assert (err == REG_ESPACE);
1129 return -2;
1130 }
1131 }
1132
1133 cur_state = transit_state (&err, mctx, cur_state);
1134 if (mctx->state_log != NULL)
1135 cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137 if (cur_state == NULL)
1138 {
1139 /* Reached the invalid state or an error. Try to recover a valid
1140 state using the state log, if available and if we have not
1141 already found a valid (even if not the longest) match. */
1142 if (BE (err != REG_NOERROR, 0))
1143 return -2;
1144
1145 if (mctx->state_log == NULL
1146 || (match && !fl_longest_match)
1147 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148 break;
1149 }
1150
1151 if (BE (at_init_state, 0))
1152 {
1153 if (old_state == cur_state)
1154 next_start_idx = next_char_idx;
1155 else
1156 at_init_state = false;
1157 }
1158
1159 if (cur_state->halt)
1160 {
1161 /* Reached a halt state.
1162 Check the halt state can satisfy the current context. */
1163 if (!cur_state->has_constraint
1164 || check_halt_state_context (mctx, cur_state,
1165 re_string_cur_idx (&mctx->input)))
1166 {
1167 /* We found an appropriate halt state. */
1168 match_last = re_string_cur_idx (&mctx->input);
1169 match = 1;
1170
1171 /* We found a match, do not modify match_first below. */
1172 p_match_first = NULL;
1173 if (!fl_longest_match)
1174 break;
1175 }
1176 }
1177 }
1178
1179 if (p_match_first)
1180 *p_match_first += next_start_idx;
1181
1182 return match_last;
1183 }
1184
1185 /* Check NODE match the current context. */
1186
1187 static bool
1188 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1189 {
1190 re_token_type_t type = dfa->nodes[node].type;
1191 unsigned int constraint = dfa->nodes[node].constraint;
1192 if (type != END_OF_RE)
1193 return false;
1194 if (!constraint)
1195 return true;
1196 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1197 return false;
1198 return true;
1199 }
1200
1201 /* Check the halt state STATE match the current context.
1202 Return 0 if not match, if the node, STATE has, is a halt node and
1203 match the context, return the node. */
1204
1205 static Idx
1206 check_halt_state_context (const re_match_context_t *mctx,
1207 const re_dfastate_t *state, Idx idx)
1208 {
1209 Idx i;
1210 unsigned int context;
1211 #ifdef DEBUG
1212 assert (state->halt);
1213 #endif
1214 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1215 for (i = 0; i < state->nodes.nelem; ++i)
1216 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1217 return state->nodes.elems[i];
1218 return 0;
1219 }
1220
1221 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222 corresponding to the DFA).
1223 Return the destination node, and update EPS_VIA_NODES;
1224 return -1 in case of errors. */
1225
1226 static Idx
1227 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229 struct re_fail_stack_t *fs)
1230 {
1231 const re_dfa_t *const dfa = mctx->dfa;
1232 Idx i;
1233 bool ok;
1234 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235 {
1236 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1237 re_node_set *edests = &dfa->edests[node];
1238 Idx dest_node;
1239 ok = re_node_set_insert (eps_via_nodes, node);
1240 if (BE (! ok, 0))
1241 return -2;
1242 /* Pick up a valid destination, or return -1 if none
1243 is found. */
1244 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245 {
1246 Idx candidate = edests->elems[i];
1247 if (!re_node_set_contains (cur_nodes, candidate))
1248 continue;
1249 if (dest_node == -1)
1250 dest_node = candidate;
1251
1252 else
1253 {
1254 /* In order to avoid infinite loop like "(a*)*", return the second
1255 epsilon-transition if the first was already considered. */
1256 if (re_node_set_contains (eps_via_nodes, dest_node))
1257 return candidate;
1258
1259 /* Otherwise, push the second epsilon-transition on the fail stack. */
1260 else if (fs != NULL
1261 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 eps_via_nodes))
1263 return -2;
1264
1265 /* We know we are going to exit. */
1266 break;
1267 }
1268 }
1269 return dest_node;
1270 }
1271 else
1272 {
1273 Idx naccepted = 0;
1274 re_token_type_t type = dfa->nodes[node].type;
1275
1276 #ifdef RE_ENABLE_I18N
1277 if (dfa->nodes[node].accept_mb)
1278 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279 else
1280 #endif /* RE_ENABLE_I18N */
1281 if (type == OP_BACK_REF)
1282 {
1283 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1284 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 if (fs != NULL)
1286 {
1287 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 return -1;
1289 else if (naccepted)
1290 {
1291 char *buf = (char *) re_string_get_buffer (&mctx->input);
1292 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293 naccepted) != 0)
1294 return -1;
1295 }
1296 }
1297
1298 if (naccepted == 0)
1299 {
1300 Idx dest_node;
1301 ok = re_node_set_insert (eps_via_nodes, node);
1302 if (BE (! ok, 0))
1303 return -2;
1304 dest_node = dfa->edests[node].elems[0];
1305 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306 dest_node))
1307 return dest_node;
1308 }
1309 }
1310
1311 if (naccepted != 0
1312 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1313 {
1314 Idx dest_node = dfa->nexts[node];
1315 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1316 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1317 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 dest_node)))
1319 return -1;
1320 re_node_set_empty (eps_via_nodes);
1321 return dest_node;
1322 }
1323 }
1324 return -1;
1325 }
1326
1327 static reg_errcode_t
1328 __attribute_warn_unused_result__
1329 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1330 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1331 {
1332 reg_errcode_t err;
1333 Idx num = fs->num++;
1334 if (fs->num == fs->alloc)
1335 {
1336 struct re_fail_stack_ent_t *new_array;
1337 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1338 fs->alloc * 2);
1339 if (new_array == NULL)
1340 return REG_ESPACE;
1341 fs->alloc *= 2;
1342 fs->stack = new_array;
1343 }
1344 fs->stack[num].idx = str_idx;
1345 fs->stack[num].node = dest_node;
1346 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1347 if (fs->stack[num].regs == NULL)
1348 return REG_ESPACE;
1349 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1350 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1351 return err;
1352 }
1353
1354 static Idx
1355 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1356 regmatch_t *regs, re_node_set *eps_via_nodes)
1357 {
1358 Idx num = --fs->num;
1359 assert (num >= 0);
1360 *pidx = fs->stack[num].idx;
1361 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1362 re_node_set_free (eps_via_nodes);
1363 re_free (fs->stack[num].regs);
1364 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1365 return fs->stack[num].node;
1366 }
1367
1368 /* Set the positions where the subexpressions are starts/ends to registers
1369 PMATCH.
1370 Note: We assume that pmatch[0] is already set, and
1371 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1372
1373 static reg_errcode_t
1374 __attribute_warn_unused_result__
1375 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1376 regmatch_t *pmatch, bool fl_backtrack)
1377 {
1378 const re_dfa_t *dfa = preg->buffer;
1379 Idx idx, cur_node;
1380 re_node_set eps_via_nodes;
1381 struct re_fail_stack_t *fs;
1382 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1383 regmatch_t *prev_idx_match;
1384 bool prev_idx_match_malloced = false;
1385
1386 #ifdef DEBUG
1387 assert (nmatch > 1);
1388 assert (mctx->state_log != NULL);
1389 #endif
1390 if (fl_backtrack)
1391 {
1392 fs = &fs_body;
1393 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394 if (fs->stack == NULL)
1395 return REG_ESPACE;
1396 }
1397 else
1398 fs = NULL;
1399
1400 cur_node = dfa->init_node;
1401 re_node_set_init_empty (&eps_via_nodes);
1402
1403 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1404 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1405 else
1406 {
1407 prev_idx_match = re_malloc (regmatch_t, nmatch);
1408 if (prev_idx_match == NULL)
1409 {
1410 free_fail_stack_return (fs);
1411 return REG_ESPACE;
1412 }
1413 prev_idx_match_malloced = true;
1414 }
1415 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1416
1417 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1418 {
1419 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1420
1421 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1422 {
1423 Idx reg_idx;
1424 if (fs)
1425 {
1426 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428 break;
1429 if (reg_idx == nmatch)
1430 {
1431 re_node_set_free (&eps_via_nodes);
1432 if (prev_idx_match_malloced)
1433 re_free (prev_idx_match);
1434 return free_fail_stack_return (fs);
1435 }
1436 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437 &eps_via_nodes);
1438 }
1439 else
1440 {
1441 re_node_set_free (&eps_via_nodes);
1442 if (prev_idx_match_malloced)
1443 re_free (prev_idx_match);
1444 return REG_NOERROR;
1445 }
1446 }
1447
1448 /* Proceed to next node. */
1449 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1450 &eps_via_nodes, fs);
1451
1452 if (BE (cur_node < 0, 0))
1453 {
1454 if (BE (cur_node == -2, 0))
1455 {
1456 re_node_set_free (&eps_via_nodes);
1457 if (prev_idx_match_malloced)
1458 re_free (prev_idx_match);
1459 free_fail_stack_return (fs);
1460 return REG_ESPACE;
1461 }
1462 if (fs)
1463 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464 &eps_via_nodes);
1465 else
1466 {
1467 re_node_set_free (&eps_via_nodes);
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
1470 return REG_NOMATCH;
1471 }
1472 }
1473 }
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1477 return free_fail_stack_return (fs);
1478 }
1479
1480 static reg_errcode_t
1481 free_fail_stack_return (struct re_fail_stack_t *fs)
1482 {
1483 if (fs)
1484 {
1485 Idx fs_idx;
1486 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1487 {
1488 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1489 re_free (fs->stack[fs_idx].regs);
1490 }
1491 re_free (fs->stack);
1492 }
1493 return REG_NOERROR;
1494 }
1495
1496 static void
1497 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1498 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1499 {
1500 int type = dfa->nodes[cur_node].type;
1501 if (type == OP_OPEN_SUBEXP)
1502 {
1503 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504
1505 /* We are at the first node of this sub expression. */
1506 if (reg_num < nmatch)
1507 {
1508 pmatch[reg_num].rm_so = cur_idx;
1509 pmatch[reg_num].rm_eo = -1;
1510 }
1511 }
1512 else if (type == OP_CLOSE_SUBEXP)
1513 {
1514 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1515 if (reg_num < nmatch)
1516 {
1517 /* We are at the last node of this sub expression. */
1518 if (pmatch[reg_num].rm_so < cur_idx)
1519 {
1520 pmatch[reg_num].rm_eo = cur_idx;
1521 /* This is a non-empty match or we are not inside an optional
1522 subexpression. Accept this right away. */
1523 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524 }
1525 else
1526 {
1527 if (dfa->nodes[cur_node].opt_subexp
1528 && prev_idx_match[reg_num].rm_so != -1)
1529 /* We transited through an empty match for an optional
1530 subexpression, like (a?)*, and this is not the subexp's
1531 first match. Copy back the old content of the registers
1532 so that matches of an inner subexpression are undone as
1533 well, like in ((a?))*. */
1534 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1535 else
1536 /* We completed a subexpression, but it may be part of
1537 an optional one, so do not update PREV_IDX_MATCH. */
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 }
1540 }
1541 }
1542 }
1543
1544 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1545 and sift the nodes in each states according to the following rules.
1546 Updated state_log will be wrote to STATE_LOG.
1547
1548 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1549 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1550 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1551 the LAST_NODE, we throw away the node 'a'.
1552 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1553 string 's' and transit to 'b':
1554 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1555 away the node 'a'.
1556 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1557 thrown away, we throw away the node 'a'.
1558 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1559 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1560 node 'a'.
1561 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1562 we throw away the node 'a'. */
1563
1564 #define STATE_NODE_CONTAINS(state,node) \
1565 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1566
1567 static reg_errcode_t
1568 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1569 {
1570 reg_errcode_t err;
1571 int null_cnt = 0;
1572 Idx str_idx = sctx->last_str_idx;
1573 re_node_set cur_dest;
1574
1575 #ifdef DEBUG
1576 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577 #endif
1578
1579 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1580 transit to the last_node and the last_node itself. */
1581 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582 if (BE (err != REG_NOERROR, 0))
1583 return err;
1584 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585 if (BE (err != REG_NOERROR, 0))
1586 goto free_return;
1587
1588 /* Then check each states in the state_log. */
1589 while (str_idx > 0)
1590 {
1591 /* Update counters. */
1592 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593 if (null_cnt > mctx->max_mb_elem_len)
1594 {
1595 memset (sctx->sifted_states, '\0',
1596 sizeof (re_dfastate_t *) * str_idx);
1597 re_node_set_free (&cur_dest);
1598 return REG_NOERROR;
1599 }
1600 re_node_set_empty (&cur_dest);
1601 --str_idx;
1602
1603 if (mctx->state_log[str_idx])
1604 {
1605 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1608 }
1609
1610 /* Add all the nodes which satisfy the following conditions:
1611 - It can epsilon transit to a node in CUR_DEST.
1612 - It is in CUR_SRC.
1613 And update state_log. */
1614 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615 if (BE (err != REG_NOERROR, 0))
1616 goto free_return;
1617 }
1618 err = REG_NOERROR;
1619 free_return:
1620 re_node_set_free (&cur_dest);
1621 return err;
1622 }
1623
1624 static reg_errcode_t
1625 __attribute_warn_unused_result__
1626 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1627 Idx str_idx, re_node_set *cur_dest)
1628 {
1629 const re_dfa_t *const dfa = mctx->dfa;
1630 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631 Idx i;
1632
1633 /* Then build the next sifted state.
1634 We build the next sifted state on 'cur_dest', and update
1635 'sifted_states[str_idx]' with 'cur_dest'.
1636 Note:
1637 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1638 'cur_src' points the node_set of the old 'state_log[str_idx]'
1639 (with the epsilon nodes pre-filtered out). */
1640 for (i = 0; i < cur_src->nelem; i++)
1641 {
1642 Idx prev_node = cur_src->elems[i];
1643 int naccepted = 0;
1644 bool ok;
1645
1646 #ifdef DEBUG
1647 re_token_type_t type = dfa->nodes[prev_node].type;
1648 assert (!IS_EPSILON_NODE (type));
1649 #endif
1650 #ifdef RE_ENABLE_I18N
1651 /* If the node may accept "multi byte". */
1652 if (dfa->nodes[prev_node].accept_mb)
1653 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1654 str_idx, sctx->last_str_idx);
1655 #endif /* RE_ENABLE_I18N */
1656
1657 /* We don't check backreferences here.
1658 See update_cur_sifted_state(). */
1659 if (!naccepted
1660 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1661 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1662 dfa->nexts[prev_node]))
1663 naccepted = 1;
1664
1665 if (naccepted == 0)
1666 continue;
1667
1668 if (sctx->limits.nelem)
1669 {
1670 Idx to_idx = str_idx + naccepted;
1671 if (check_dst_limits (mctx, &sctx->limits,
1672 dfa->nexts[prev_node], to_idx,
1673 prev_node, str_idx))
1674 continue;
1675 }
1676 ok = re_node_set_insert (cur_dest, prev_node);
1677 if (BE (! ok, 0))
1678 return REG_ESPACE;
1679 }
1680
1681 return REG_NOERROR;
1682 }
1683
1684 /* Helper functions. */
1685
1686 static reg_errcode_t
1687 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1688 {
1689 Idx top = mctx->state_log_top;
1690
1691 if ((next_state_log_idx >= mctx->input.bufs_len
1692 && mctx->input.bufs_len < mctx->input.len)
1693 || (next_state_log_idx >= mctx->input.valid_len
1694 && mctx->input.valid_len < mctx->input.len))
1695 {
1696 reg_errcode_t err;
1697 err = extend_buffers (mctx, next_state_log_idx + 1);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700 }
1701
1702 if (top < next_state_log_idx)
1703 {
1704 memset (mctx->state_log + top + 1, '\0',
1705 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706 mctx->state_log_top = next_state_log_idx;
1707 }
1708 return REG_NOERROR;
1709 }
1710
1711 static reg_errcode_t
1712 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1713 re_dfastate_t **src, Idx num)
1714 {
1715 Idx st_idx;
1716 reg_errcode_t err;
1717 for (st_idx = 0; st_idx < num; ++st_idx)
1718 {
1719 if (dst[st_idx] == NULL)
1720 dst[st_idx] = src[st_idx];
1721 else if (src[st_idx] != NULL)
1722 {
1723 re_node_set merged_set;
1724 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1725 &src[st_idx]->nodes);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1729 re_node_set_free (&merged_set);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733 }
1734 return REG_NOERROR;
1735 }
1736
1737 static reg_errcode_t
1738 update_cur_sifted_state (const re_match_context_t *mctx,
1739 re_sift_context_t *sctx, Idx str_idx,
1740 re_node_set *dest_nodes)
1741 {
1742 const re_dfa_t *const dfa = mctx->dfa;
1743 reg_errcode_t err = REG_NOERROR;
1744 const re_node_set *candidates;
1745 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1746 : &mctx->state_log[str_idx]->nodes);
1747
1748 if (dest_nodes->nelem == 0)
1749 sctx->sifted_states[str_idx] = NULL;
1750 else
1751 {
1752 if (candidates)
1753 {
1754 /* At first, add the nodes which can epsilon transit to a node in
1755 DEST_NODE. */
1756 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1757 if (BE (err != REG_NOERROR, 0))
1758 return err;
1759
1760 /* Then, check the limitations in the current sift_context. */
1761 if (sctx->limits.nelem)
1762 {
1763 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1764 mctx->bkref_ents, str_idx);
1765 if (BE (err != REG_NOERROR, 0))
1766 return err;
1767 }
1768 }
1769
1770 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1773 }
1774
1775 if (candidates && mctx->state_log[str_idx]->has_backref)
1776 {
1777 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1778 if (BE (err != REG_NOERROR, 0))
1779 return err;
1780 }
1781 return REG_NOERROR;
1782 }
1783
1784 static reg_errcode_t
1785 __attribute_warn_unused_result__
1786 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1787 const re_node_set *candidates)
1788 {
1789 reg_errcode_t err = REG_NOERROR;
1790 Idx i;
1791
1792 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1795
1796 if (!state->inveclosure.alloc)
1797 {
1798 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1799 if (BE (err != REG_NOERROR, 0))
1800 return REG_ESPACE;
1801 for (i = 0; i < dest_nodes->nelem; i++)
1802 {
1803 err = re_node_set_merge (&state->inveclosure,
1804 dfa->inveclosures + dest_nodes->elems[i]);
1805 if (BE (err != REG_NOERROR, 0))
1806 return REG_ESPACE;
1807 }
1808 }
1809 return re_node_set_add_intersect (dest_nodes, candidates,
1810 &state->inveclosure);
1811 }
1812
1813 static reg_errcode_t
1814 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1815 const re_node_set *candidates)
1816 {
1817 Idx ecl_idx;
1818 reg_errcode_t err;
1819 re_node_set *inv_eclosure = dfa->inveclosures + node;
1820 re_node_set except_nodes;
1821 re_node_set_init_empty (&except_nodes);
1822 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823 {
1824 Idx cur_node = inv_eclosure->elems[ecl_idx];
1825 if (cur_node == node)
1826 continue;
1827 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1828 {
1829 Idx edst1 = dfa->edests[cur_node].elems[0];
1830 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1831 ? dfa->edests[cur_node].elems[1] : -1);
1832 if ((!re_node_set_contains (inv_eclosure, edst1)
1833 && re_node_set_contains (dest_nodes, edst1))
1834 || (edst2 > 0
1835 && !re_node_set_contains (inv_eclosure, edst2)
1836 && re_node_set_contains (dest_nodes, edst2)))
1837 {
1838 err = re_node_set_add_intersect (&except_nodes, candidates,
1839 dfa->inveclosures + cur_node);
1840 if (BE (err != REG_NOERROR, 0))
1841 {
1842 re_node_set_free (&except_nodes);
1843 return err;
1844 }
1845 }
1846 }
1847 }
1848 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849 {
1850 Idx cur_node = inv_eclosure->elems[ecl_idx];
1851 if (!re_node_set_contains (&except_nodes, cur_node))
1852 {
1853 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1854 re_node_set_remove_at (dest_nodes, idx);
1855 }
1856 }
1857 re_node_set_free (&except_nodes);
1858 return REG_NOERROR;
1859 }
1860
1861 static bool
1862 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1863 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1864 {
1865 const re_dfa_t *const dfa = mctx->dfa;
1866 Idx lim_idx, src_pos, dst_pos;
1867
1868 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1869 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1870 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1871 {
1872 Idx subexp_idx;
1873 struct re_backref_cache_entry *ent;
1874 ent = mctx->bkref_ents + limits->elems[lim_idx];
1875 subexp_idx = dfa->nodes[ent->node].opr.idx;
1876
1877 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878 subexp_idx, dst_node, dst_idx,
1879 dst_bkref_idx);
1880 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 subexp_idx, src_node, src_idx,
1882 src_bkref_idx);
1883
1884 /* In case of:
1885 <src> <dst> ( <subexp> )
1886 ( <subexp> ) <src> <dst>
1887 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1888 if (src_pos == dst_pos)
1889 continue; /* This is unrelated limitation. */
1890 else
1891 return true;
1892 }
1893 return false;
1894 }
1895
1896 static int
1897 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1898 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1899 {
1900 const re_dfa_t *const dfa = mctx->dfa;
1901 const re_node_set *eclosures = dfa->eclosures + from_node;
1902 Idx node_idx;
1903
1904 /* Else, we are on the boundary: examine the nodes on the epsilon
1905 closure. */
1906 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1907 {
1908 Idx node = eclosures->elems[node_idx];
1909 switch (dfa->nodes[node].type)
1910 {
1911 case OP_BACK_REF:
1912 if (bkref_idx != -1)
1913 {
1914 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1915 do
1916 {
1917 Idx dst;
1918 int cpos;
1919
1920 if (ent->node != node)
1921 continue;
1922
1923 if (subexp_idx < BITSET_WORD_BITS
1924 && !(ent->eps_reachable_subexps_map
1925 & ((bitset_word_t) 1 << subexp_idx)))
1926 continue;
1927
1928 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1929 OP_CLOSE_SUBEXP cases below. But, if the
1930 destination node is the same node as the source
1931 node, don't recurse because it would cause an
1932 infinite loop: a regex that exhibits this behavior
1933 is ()\1*\1* */
1934 dst = dfa->edests[node].elems[0];
1935 if (dst == from_node)
1936 {
1937 if (boundaries & 1)
1938 return -1;
1939 else /* if (boundaries & 2) */
1940 return 0;
1941 }
1942
1943 cpos =
1944 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1945 dst, bkref_idx);
1946 if (cpos == -1 /* && (boundaries & 1) */)
1947 return -1;
1948 if (cpos == 0 && (boundaries & 2))
1949 return 0;
1950
1951 if (subexp_idx < BITSET_WORD_BITS)
1952 ent->eps_reachable_subexps_map
1953 &= ~((bitset_word_t) 1 << subexp_idx);
1954 }
1955 while (ent++->more);
1956 }
1957 break;
1958
1959 case OP_OPEN_SUBEXP:
1960 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 return -1;
1962 break;
1963
1964 case OP_CLOSE_SUBEXP:
1965 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 return 0;
1967 break;
1968
1969 default:
1970 break;
1971 }
1972 }
1973
1974 return (boundaries & 2) ? 1 : 0;
1975 }
1976
1977 static int
1978 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1979 Idx subexp_idx, Idx from_node, Idx str_idx,
1980 Idx bkref_idx)
1981 {
1982 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983 int boundaries;
1984
1985 /* If we are outside the range of the subexpression, return -1 or 1. */
1986 if (str_idx < lim->subexp_from)
1987 return -1;
1988
1989 if (lim->subexp_to < str_idx)
1990 return 1;
1991
1992 /* If we are within the subexpression, return 0. */
1993 boundaries = (str_idx == lim->subexp_from);
1994 boundaries |= (str_idx == lim->subexp_to) << 1;
1995 if (boundaries == 0)
1996 return 0;
1997
1998 /* Else, examine epsilon closure. */
1999 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 from_node, bkref_idx);
2001 }
2002
2003 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2004 which are against limitations from DEST_NODES. */
2005
2006 static reg_errcode_t
2007 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2008 const re_node_set *candidates, re_node_set *limits,
2009 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2010 {
2011 reg_errcode_t err;
2012 Idx node_idx, lim_idx;
2013
2014 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015 {
2016 Idx subexp_idx;
2017 struct re_backref_cache_entry *ent;
2018 ent = bkref_ents + limits->elems[lim_idx];
2019
2020 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021 continue; /* This is unrelated limitation. */
2022
2023 subexp_idx = dfa->nodes[ent->node].opr.idx;
2024 if (ent->subexp_to == str_idx)
2025 {
2026 Idx ops_node = -1;
2027 Idx cls_node = -1;
2028 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029 {
2030 Idx node = dest_nodes->elems[node_idx];
2031 re_token_type_t type = dfa->nodes[node].type;
2032 if (type == OP_OPEN_SUBEXP
2033 && subexp_idx == dfa->nodes[node].opr.idx)
2034 ops_node = node;
2035 else if (type == OP_CLOSE_SUBEXP
2036 && subexp_idx == dfa->nodes[node].opr.idx)
2037 cls_node = node;
2038 }
2039
2040 /* Check the limitation of the open subexpression. */
2041 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2042 if (ops_node >= 0)
2043 {
2044 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045 candidates);
2046 if (BE (err != REG_NOERROR, 0))
2047 return err;
2048 }
2049
2050 /* Check the limitation of the close subexpression. */
2051 if (cls_node >= 0)
2052 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053 {
2054 Idx node = dest_nodes->elems[node_idx];
2055 if (!re_node_set_contains (dfa->inveclosures + node,
2056 cls_node)
2057 && !re_node_set_contains (dfa->eclosures + node,
2058 cls_node))
2059 {
2060 /* It is against this limitation.
2061 Remove it form the current sifted state. */
2062 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063 candidates);
2064 if (BE (err != REG_NOERROR, 0))
2065 return err;
2066 --node_idx;
2067 }
2068 }
2069 }
2070 else /* (ent->subexp_to != str_idx) */
2071 {
2072 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073 {
2074 Idx node = dest_nodes->elems[node_idx];
2075 re_token_type_t type = dfa->nodes[node].type;
2076 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077 {
2078 if (subexp_idx != dfa->nodes[node].opr.idx)
2079 continue;
2080 /* It is against this limitation.
2081 Remove it form the current sifted state. */
2082 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083 candidates);
2084 if (BE (err != REG_NOERROR, 0))
2085 return err;
2086 }
2087 }
2088 }
2089 }
2090 return REG_NOERROR;
2091 }
2092
2093 static reg_errcode_t
2094 __attribute_warn_unused_result__
2095 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2096 Idx str_idx, const re_node_set *candidates)
2097 {
2098 const re_dfa_t *const dfa = mctx->dfa;
2099 reg_errcode_t err;
2100 Idx node_idx, node;
2101 re_sift_context_t local_sctx;
2102 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103
2104 if (first_idx == -1)
2105 return REG_NOERROR;
2106
2107 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2108
2109 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110 {
2111 Idx enabled_idx;
2112 re_token_type_t type;
2113 struct re_backref_cache_entry *entry;
2114 node = candidates->elems[node_idx];
2115 type = dfa->nodes[node].type;
2116 /* Avoid infinite loop for the REs like "()\1+". */
2117 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118 continue;
2119 if (type != OP_BACK_REF)
2120 continue;
2121
2122 entry = mctx->bkref_ents + first_idx;
2123 enabled_idx = first_idx;
2124 do
2125 {
2126 Idx subexp_len;
2127 Idx to_idx;
2128 Idx dst_node;
2129 bool ok;
2130 re_dfastate_t *cur_state;
2131
2132 if (entry->node != node)
2133 continue;
2134 subexp_len = entry->subexp_to - entry->subexp_from;
2135 to_idx = str_idx + subexp_len;
2136 dst_node = (subexp_len ? dfa->nexts[node]
2137 : dfa->edests[node].elems[0]);
2138
2139 if (to_idx > sctx->last_str_idx
2140 || sctx->sifted_states[to_idx] == NULL
2141 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2142 || check_dst_limits (mctx, &sctx->limits, node,
2143 str_idx, dst_node, to_idx))
2144 continue;
2145
2146 if (local_sctx.sifted_states == NULL)
2147 {
2148 local_sctx = *sctx;
2149 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2150 if (BE (err != REG_NOERROR, 0))
2151 goto free_return;
2152 }
2153 local_sctx.last_node = node;
2154 local_sctx.last_str_idx = str_idx;
2155 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2156 if (BE (! ok, 0))
2157 {
2158 err = REG_ESPACE;
2159 goto free_return;
2160 }
2161 cur_state = local_sctx.sifted_states[str_idx];
2162 err = sift_states_backward (mctx, &local_sctx);
2163 if (BE (err != REG_NOERROR, 0))
2164 goto free_return;
2165 if (sctx->limited_states != NULL)
2166 {
2167 err = merge_state_array (dfa, sctx->limited_states,
2168 local_sctx.sifted_states,
2169 str_idx + 1);
2170 if (BE (err != REG_NOERROR, 0))
2171 goto free_return;
2172 }
2173 local_sctx.sifted_states[str_idx] = cur_state;
2174 re_node_set_remove (&local_sctx.limits, enabled_idx);
2175
2176 /* mctx->bkref_ents may have changed, reload the pointer. */
2177 entry = mctx->bkref_ents + enabled_idx;
2178 }
2179 while (enabled_idx++, entry++->more);
2180 }
2181 err = REG_NOERROR;
2182 free_return:
2183 if (local_sctx.sifted_states != NULL)
2184 {
2185 re_node_set_free (&local_sctx.limits);
2186 }
2187
2188 return err;
2189 }
2190
2191
2192 #ifdef RE_ENABLE_I18N
2193 static int
2194 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2195 Idx node_idx, Idx str_idx, Idx max_str_idx)
2196 {
2197 const re_dfa_t *const dfa = mctx->dfa;
2198 int naccepted;
2199 /* Check the node can accept "multi byte". */
2200 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2201 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2202 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2203 dfa->nexts[node_idx]))
2204 /* The node can't accept the "multi byte", or the
2205 destination was already thrown away, then the node
2206 could't accept the current input "multi byte". */
2207 naccepted = 0;
2208 /* Otherwise, it is sure that the node could accept
2209 'naccepted' bytes input. */
2210 return naccepted;
2211 }
2212 #endif /* RE_ENABLE_I18N */
2213
2214 \f
2215 /* Functions for state transition. */
2216
2217 /* Return the next state to which the current state STATE will transit by
2218 accepting the current input byte, and update STATE_LOG if necessary.
2219 If STATE can accept a multibyte char/collating element/back reference
2220 update the destination of STATE_LOG. */
2221
2222 static re_dfastate_t *
2223 __attribute_warn_unused_result__
2224 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2225 re_dfastate_t *state)
2226 {
2227 re_dfastate_t **trtable;
2228 unsigned char ch;
2229
2230 #ifdef RE_ENABLE_I18N
2231 /* If the current state can accept multibyte. */
2232 if (BE (state->accept_mb, 0))
2233 {
2234 *err = transit_state_mb (mctx, state);
2235 if (BE (*err != REG_NOERROR, 0))
2236 return NULL;
2237 }
2238 #endif /* RE_ENABLE_I18N */
2239
2240 /* Then decide the next state with the single byte. */
2241 #if 0
2242 if (0)
2243 /* don't use transition table */
2244 return transit_state_sb (err, mctx, state);
2245 #endif
2246
2247 /* Use transition table */
2248 ch = re_string_fetch_byte (&mctx->input);
2249 for (;;)
2250 {
2251 trtable = state->trtable;
2252 if (BE (trtable != NULL, 1))
2253 return trtable[ch];
2254
2255 trtable = state->word_trtable;
2256 if (BE (trtable != NULL, 1))
2257 {
2258 unsigned int context;
2259 context
2260 = re_string_context_at (&mctx->input,
2261 re_string_cur_idx (&mctx->input) - 1,
2262 mctx->eflags);
2263 if (IS_WORD_CONTEXT (context))
2264 return trtable[ch + SBC_MAX];
2265 else
2266 return trtable[ch];
2267 }
2268
2269 if (!build_trtable (mctx->dfa, state))
2270 {
2271 *err = REG_ESPACE;
2272 return NULL;
2273 }
2274
2275 /* Retry, we now have a transition table. */
2276 }
2277 }
2278
2279 /* Update the state_log if we need */
2280 static re_dfastate_t *
2281 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2282 re_dfastate_t *next_state)
2283 {
2284 const re_dfa_t *const dfa = mctx->dfa;
2285 Idx cur_idx = re_string_cur_idx (&mctx->input);
2286
2287 if (cur_idx > mctx->state_log_top)
2288 {
2289 mctx->state_log[cur_idx] = next_state;
2290 mctx->state_log_top = cur_idx;
2291 }
2292 else if (mctx->state_log[cur_idx] == 0)
2293 {
2294 mctx->state_log[cur_idx] = next_state;
2295 }
2296 else
2297 {
2298 re_dfastate_t *pstate;
2299 unsigned int context;
2300 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 the destination of a multibyte char/collating element/
2303 back reference. Then the next state is the union set of
2304 these destinations and the results of the transition table. */
2305 pstate = mctx->state_log[cur_idx];
2306 log_nodes = pstate->entrance_nodes;
2307 if (next_state != NULL)
2308 {
2309 table_nodes = next_state->entrance_nodes;
2310 *err = re_node_set_init_union (&next_nodes, table_nodes,
2311 log_nodes);
2312 if (BE (*err != REG_NOERROR, 0))
2313 return NULL;
2314 }
2315 else
2316 next_nodes = *log_nodes;
2317 /* Note: We already add the nodes of the initial state,
2318 then we don't need to add them here. */
2319
2320 context = re_string_context_at (&mctx->input,
2321 re_string_cur_idx (&mctx->input) - 1,
2322 mctx->eflags);
2323 next_state = mctx->state_log[cur_idx]
2324 = re_acquire_state_context (err, dfa, &next_nodes, context);
2325 /* We don't need to check errors here, since the return value of
2326 this function is next_state and ERR is already set. */
2327
2328 if (table_nodes != NULL)
2329 re_node_set_free (&next_nodes);
2330 }
2331
2332 if (BE (dfa->nbackref, 0) && next_state != NULL)
2333 {
2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 later. We must check them here, since the back references in the
2336 next state might use them. */
2337 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338 cur_idx);
2339 if (BE (*err != REG_NOERROR, 0))
2340 return NULL;
2341
2342 /* If the next state has back references. */
2343 if (next_state->has_backref)
2344 {
2345 *err = transit_state_bkref (mctx, &next_state->nodes);
2346 if (BE (*err != REG_NOERROR, 0))
2347 return NULL;
2348 next_state = mctx->state_log[cur_idx];
2349 }
2350 }
2351
2352 return next_state;
2353 }
2354
2355 /* Skip bytes in the input that correspond to part of a
2356 multi-byte match, then look in the log for a state
2357 from which to restart matching. */
2358 static re_dfastate_t *
2359 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360 {
2361 re_dfastate_t *cur_state;
2362 do
2363 {
2364 Idx max = mctx->state_log_top;
2365 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366
2367 do
2368 {
2369 if (++cur_str_idx > max)
2370 return NULL;
2371 re_string_skip_bytes (&mctx->input, 1);
2372 }
2373 while (mctx->state_log[cur_str_idx] == NULL);
2374
2375 cur_state = merge_state_with_log (err, mctx, NULL);
2376 }
2377 while (*err == REG_NOERROR && cur_state == NULL);
2378 return cur_state;
2379 }
2380
2381 /* Helper functions for transit_state. */
2382
2383 /* From the node set CUR_NODES, pick up the nodes whose types are
2384 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385 expression. And register them to use them later for evaluating the
2386 corresponding back references. */
2387
2388 static reg_errcode_t
2389 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2390 Idx str_idx)
2391 {
2392 const re_dfa_t *const dfa = mctx->dfa;
2393 Idx node_idx;
2394 reg_errcode_t err;
2395
2396 /* TODO: This isn't efficient.
2397 Because there might be more than one nodes whose types are
2398 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2399 nodes.
2400 E.g. RE: (a){2} */
2401 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402 {
2403 Idx node = cur_nodes->elems[node_idx];
2404 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2405 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2406 && (dfa->used_bkref_map
2407 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2408 {
2409 err = match_ctx_add_subtop (mctx, node, str_idx);
2410 if (BE (err != REG_NOERROR, 0))
2411 return err;
2412 }
2413 }
2414 return REG_NOERROR;
2415 }
2416
2417 #if 0
2418 /* Return the next state to which the current state STATE will transit by
2419 accepting the current input byte. */
2420
2421 static re_dfastate_t *
2422 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423 re_dfastate_t *state)
2424 {
2425 const re_dfa_t *const dfa = mctx->dfa;
2426 re_node_set next_nodes;
2427 re_dfastate_t *next_state;
2428 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429 unsigned int context;
2430
2431 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432 if (BE (*err != REG_NOERROR, 0))
2433 return NULL;
2434 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435 {
2436 Idx cur_node = state->nodes.elems[node_cnt];
2437 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2438 {
2439 *err = re_node_set_merge (&next_nodes,
2440 dfa->eclosures + dfa->nexts[cur_node]);
2441 if (BE (*err != REG_NOERROR, 0))
2442 {
2443 re_node_set_free (&next_nodes);
2444 return NULL;
2445 }
2446 }
2447 }
2448 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450 /* We don't need to check errors here, since the return value of
2451 this function is next_state and ERR is already set. */
2452
2453 re_node_set_free (&next_nodes);
2454 re_string_skip_bytes (&mctx->input, 1);
2455 return next_state;
2456 }
2457 #endif
2458
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2461 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2462 {
2463 const re_dfa_t *const dfa = mctx->dfa;
2464 reg_errcode_t err;
2465 Idx i;
2466
2467 for (i = 0; i < pstate->nodes.nelem; ++i)
2468 {
2469 re_node_set dest_nodes, *new_nodes;
2470 Idx cur_node_idx = pstate->nodes.elems[i];
2471 int naccepted;
2472 Idx dest_idx;
2473 unsigned int context;
2474 re_dfastate_t *dest_state;
2475
2476 if (!dfa->nodes[cur_node_idx].accept_mb)
2477 continue;
2478
2479 if (dfa->nodes[cur_node_idx].constraint)
2480 {
2481 context = re_string_context_at (&mctx->input,
2482 re_string_cur_idx (&mctx->input),
2483 mctx->eflags);
2484 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2485 context))
2486 continue;
2487 }
2488
2489 /* How many bytes the node can accept? */
2490 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2491 re_string_cur_idx (&mctx->input));
2492 if (naccepted == 0)
2493 continue;
2494
2495 /* The node can accepts 'naccepted' bytes. */
2496 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2497 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2498 : mctx->max_mb_elem_len);
2499 err = clean_state_log_if_needed (mctx, dest_idx);
2500 if (BE (err != REG_NOERROR, 0))
2501 return err;
2502 #ifdef DEBUG
2503 assert (dfa->nexts[cur_node_idx] != -1);
2504 #endif
2505 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2506
2507 dest_state = mctx->state_log[dest_idx];
2508 if (dest_state == NULL)
2509 dest_nodes = *new_nodes;
2510 else
2511 {
2512 err = re_node_set_init_union (&dest_nodes,
2513 dest_state->entrance_nodes, new_nodes);
2514 if (BE (err != REG_NOERROR, 0))
2515 return err;
2516 }
2517 context = re_string_context_at (&mctx->input, dest_idx - 1,
2518 mctx->eflags);
2519 mctx->state_log[dest_idx]
2520 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521 if (dest_state != NULL)
2522 re_node_set_free (&dest_nodes);
2523 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524 return err;
2525 }
2526 return REG_NOERROR;
2527 }
2528 #endif /* RE_ENABLE_I18N */
2529
2530 static reg_errcode_t
2531 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2532 {
2533 const re_dfa_t *const dfa = mctx->dfa;
2534 reg_errcode_t err;
2535 Idx i;
2536 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2537
2538 for (i = 0; i < nodes->nelem; ++i)
2539 {
2540 Idx dest_str_idx, prev_nelem, bkc_idx;
2541 Idx node_idx = nodes->elems[i];
2542 unsigned int context;
2543 const re_token_t *node = dfa->nodes + node_idx;
2544 re_node_set *new_dest_nodes;
2545
2546 /* Check whether 'node' is a backreference or not. */
2547 if (node->type != OP_BACK_REF)
2548 continue;
2549
2550 if (node->constraint)
2551 {
2552 context = re_string_context_at (&mctx->input, cur_str_idx,
2553 mctx->eflags);
2554 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2555 continue;
2556 }
2557
2558 /* 'node' is a backreference.
2559 Check the substring which the substring matched. */
2560 bkc_idx = mctx->nbkref_ents;
2561 err = get_subexp (mctx, node_idx, cur_str_idx);
2562 if (BE (err != REG_NOERROR, 0))
2563 goto free_return;
2564
2565 /* And add the epsilon closures (which is 'new_dest_nodes') of
2566 the backreference to appropriate state_log. */
2567 #ifdef DEBUG
2568 assert (dfa->nexts[node_idx] != -1);
2569 #endif
2570 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2571 {
2572 Idx subexp_len;
2573 re_dfastate_t *dest_state;
2574 struct re_backref_cache_entry *bkref_ent;
2575 bkref_ent = mctx->bkref_ents + bkc_idx;
2576 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2577 continue;
2578 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2579 new_dest_nodes = (subexp_len == 0
2580 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2581 : dfa->eclosures + dfa->nexts[node_idx]);
2582 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2583 - bkref_ent->subexp_from);
2584 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2585 mctx->eflags);
2586 dest_state = mctx->state_log[dest_str_idx];
2587 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2588 : mctx->state_log[cur_str_idx]->nodes.nelem);
2589 /* Add 'new_dest_node' to state_log. */
2590 if (dest_state == NULL)
2591 {
2592 mctx->state_log[dest_str_idx]
2593 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2594 context);
2595 if (BE (mctx->state_log[dest_str_idx] == NULL
2596 && err != REG_NOERROR, 0))
2597 goto free_return;
2598 }
2599 else
2600 {
2601 re_node_set dest_nodes;
2602 err = re_node_set_init_union (&dest_nodes,
2603 dest_state->entrance_nodes,
2604 new_dest_nodes);
2605 if (BE (err != REG_NOERROR, 0))
2606 {
2607 re_node_set_free (&dest_nodes);
2608 goto free_return;
2609 }
2610 mctx->state_log[dest_str_idx]
2611 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2612 re_node_set_free (&dest_nodes);
2613 if (BE (mctx->state_log[dest_str_idx] == NULL
2614 && err != REG_NOERROR, 0))
2615 goto free_return;
2616 }
2617 /* We need to check recursively if the backreference can epsilon
2618 transit. */
2619 if (subexp_len == 0
2620 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2621 {
2622 err = check_subexp_matching_top (mctx, new_dest_nodes,
2623 cur_str_idx);
2624 if (BE (err != REG_NOERROR, 0))
2625 goto free_return;
2626 err = transit_state_bkref (mctx, new_dest_nodes);
2627 if (BE (err != REG_NOERROR, 0))
2628 goto free_return;
2629 }
2630 }
2631 }
2632 err = REG_NOERROR;
2633 free_return:
2634 return err;
2635 }
2636
2637 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2638 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2639 Note that we might collect inappropriate candidates here.
2640 However, the cost of checking them strictly here is too high, then we
2641 delay these checking for prune_impossible_nodes(). */
2642
2643 static reg_errcode_t
2644 __attribute_warn_unused_result__
2645 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2646 {
2647 const re_dfa_t *const dfa = mctx->dfa;
2648 Idx subexp_num, sub_top_idx;
2649 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2650 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2651 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2652 if (cache_idx != -1)
2653 {
2654 const struct re_backref_cache_entry *entry
2655 = mctx->bkref_ents + cache_idx;
2656 do
2657 if (entry->node == bkref_node)
2658 return REG_NOERROR; /* We already checked it. */
2659 while (entry++->more);
2660 }
2661
2662 subexp_num = dfa->nodes[bkref_node].opr.idx;
2663
2664 /* For each sub expression */
2665 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666 {
2667 reg_errcode_t err;
2668 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669 re_sub_match_last_t *sub_last;
2670 Idx sub_last_idx, sl_str, bkref_str_off;
2671
2672 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673 continue; /* It isn't related. */
2674
2675 sl_str = sub_top->str_idx;
2676 bkref_str_off = bkref_str_idx;
2677 /* At first, check the last node of sub expressions we already
2678 evaluated. */
2679 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680 {
2681 regoff_t sl_str_diff;
2682 sub_last = sub_top->lasts[sub_last_idx];
2683 sl_str_diff = sub_last->str_idx - sl_str;
2684 /* The matched string by the sub expression match with the substring
2685 at the back reference? */
2686 if (sl_str_diff > 0)
2687 {
2688 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689 {
2690 /* Not enough chars for a successful match. */
2691 if (bkref_str_off + sl_str_diff > mctx->input.len)
2692 break;
2693
2694 err = clean_state_log_if_needed (mctx,
2695 bkref_str_off
2696 + sl_str_diff);
2697 if (BE (err != REG_NOERROR, 0))
2698 return err;
2699 buf = (const char *) re_string_get_buffer (&mctx->input);
2700 }
2701 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702 /* We don't need to search this sub expression any more. */
2703 break;
2704 }
2705 bkref_str_off += sl_str_diff;
2706 sl_str += sl_str_diff;
2707 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2708 bkref_str_idx);
2709
2710 /* Reload buf, since the preceding call might have reallocated
2711 the buffer. */
2712 buf = (const char *) re_string_get_buffer (&mctx->input);
2713
2714 if (err == REG_NOMATCH)
2715 continue;
2716 if (BE (err != REG_NOERROR, 0))
2717 return err;
2718 }
2719
2720 if (sub_last_idx < sub_top->nlasts)
2721 continue;
2722 if (sub_last_idx > 0)
2723 ++sl_str;
2724 /* Then, search for the other last nodes of the sub expression. */
2725 for (; sl_str <= bkref_str_idx; ++sl_str)
2726 {
2727 Idx cls_node;
2728 regoff_t sl_str_off;
2729 const re_node_set *nodes;
2730 sl_str_off = sl_str - sub_top->str_idx;
2731 /* The matched string by the sub expression match with the substring
2732 at the back reference? */
2733 if (sl_str_off > 0)
2734 {
2735 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736 {
2737 /* If we are at the end of the input, we cannot match. */
2738 if (bkref_str_off >= mctx->input.len)
2739 break;
2740
2741 err = extend_buffers (mctx, bkref_str_off + 1);
2742 if (BE (err != REG_NOERROR, 0))
2743 return err;
2744
2745 buf = (const char *) re_string_get_buffer (&mctx->input);
2746 }
2747 if (buf [bkref_str_off++] != buf[sl_str - 1])
2748 break; /* We don't need to search this sub expression
2749 any more. */
2750 }
2751 if (mctx->state_log[sl_str] == NULL)
2752 continue;
2753 /* Does this state have a ')' of the sub expression? */
2754 nodes = &mctx->state_log[sl_str]->nodes;
2755 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2756 OP_CLOSE_SUBEXP);
2757 if (cls_node == -1)
2758 continue; /* No. */
2759 if (sub_top->path == NULL)
2760 {
2761 sub_top->path = calloc (sizeof (state_array_t),
2762 sl_str - sub_top->str_idx + 1);
2763 if (sub_top->path == NULL)
2764 return REG_ESPACE;
2765 }
2766 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2767 in the current context? */
2768 err = check_arrival (mctx, sub_top->path, sub_top->node,
2769 sub_top->str_idx, cls_node, sl_str,
2770 OP_CLOSE_SUBEXP);
2771 if (err == REG_NOMATCH)
2772 continue;
2773 if (BE (err != REG_NOERROR, 0))
2774 return err;
2775 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2776 if (BE (sub_last == NULL, 0))
2777 return REG_ESPACE;
2778 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2779 bkref_str_idx);
2780 if (err == REG_NOMATCH)
2781 continue;
2782 }
2783 }
2784 return REG_NOERROR;
2785 }
2786
2787 /* Helper functions for get_subexp(). */
2788
2789 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2790 If it can arrive, register the sub expression expressed with SUB_TOP
2791 and SUB_LAST. */
2792
2793 static reg_errcode_t
2794 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2795 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2796 {
2797 reg_errcode_t err;
2798 Idx to_idx;
2799 /* Can the subexpression arrive the back reference? */
2800 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2801 sub_last->str_idx, bkref_node, bkref_str,
2802 OP_OPEN_SUBEXP);
2803 if (err != REG_NOERROR)
2804 return err;
2805 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806 sub_last->str_idx);
2807 if (BE (err != REG_NOERROR, 0))
2808 return err;
2809 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810 return clean_state_log_if_needed (mctx, to_idx);
2811 }
2812
2813 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814 Search '(' if FL_OPEN, or search ')' otherwise.
2815 TODO: This function isn't efficient...
2816 Because there might be more than one nodes whose types are
2817 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818 nodes.
2819 E.g. RE: (a){2} */
2820
2821 static Idx
2822 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2823 Idx subexp_idx, int type)
2824 {
2825 Idx cls_idx;
2826 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827 {
2828 Idx cls_node = nodes->elems[cls_idx];
2829 const re_token_t *node = dfa->nodes + cls_node;
2830 if (node->type == type
2831 && node->opr.idx == subexp_idx)
2832 return cls_node;
2833 }
2834 return -1;
2835 }
2836
2837 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2839 heavily reused.
2840 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2841
2842 static reg_errcode_t
2843 __attribute_warn_unused_result__
2844 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845 Idx top_str, Idx last_node, Idx last_str, int type)
2846 {
2847 const re_dfa_t *const dfa = mctx->dfa;
2848 reg_errcode_t err = REG_NOERROR;
2849 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2850 re_dfastate_t *cur_state = NULL;
2851 re_node_set *cur_nodes, next_nodes;
2852 re_dfastate_t **backup_state_log;
2853 unsigned int context;
2854
2855 subexp_num = dfa->nodes[top_node].opr.idx;
2856 /* Extend the buffer if we need. */
2857 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858 {
2859 re_dfastate_t **new_array;
2860 Idx old_alloc = path->alloc;
2861 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2862 Idx new_alloc;
2863 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2864 return REG_ESPACE;
2865 new_alloc = old_alloc + incr_alloc;
2866 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2867 return REG_ESPACE;
2868 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2869 if (BE (new_array == NULL, 0))
2870 return REG_ESPACE;
2871 path->array = new_array;
2872 path->alloc = new_alloc;
2873 memset (new_array + old_alloc, '\0',
2874 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875 }
2876
2877 str_idx = path->next_idx ? path->next_idx : top_str;
2878
2879 /* Temporary modify MCTX. */
2880 backup_state_log = mctx->state_log;
2881 backup_cur_idx = mctx->input.cur_idx;
2882 mctx->state_log = path->array;
2883 mctx->input.cur_idx = str_idx;
2884
2885 /* Setup initial node set. */
2886 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887 if (str_idx == top_str)
2888 {
2889 err = re_node_set_init_1 (&next_nodes, top_node);
2890 if (BE (err != REG_NOERROR, 0))
2891 return err;
2892 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893 if (BE (err != REG_NOERROR, 0))
2894 {
2895 re_node_set_free (&next_nodes);
2896 return err;
2897 }
2898 }
2899 else
2900 {
2901 cur_state = mctx->state_log[str_idx];
2902 if (cur_state && cur_state->has_backref)
2903 {
2904 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905 if (BE (err != REG_NOERROR, 0))
2906 return err;
2907 }
2908 else
2909 re_node_set_init_empty (&next_nodes);
2910 }
2911 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912 {
2913 if (next_nodes.nelem)
2914 {
2915 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916 subexp_num, type);
2917 if (BE (err != REG_NOERROR, 0))
2918 {
2919 re_node_set_free (&next_nodes);
2920 return err;
2921 }
2922 }
2923 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925 {
2926 re_node_set_free (&next_nodes);
2927 return err;
2928 }
2929 mctx->state_log[str_idx] = cur_state;
2930 }
2931
2932 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933 {
2934 re_node_set_empty (&next_nodes);
2935 if (mctx->state_log[str_idx + 1])
2936 {
2937 err = re_node_set_merge (&next_nodes,
2938 &mctx->state_log[str_idx + 1]->nodes);
2939 if (BE (err != REG_NOERROR, 0))
2940 {
2941 re_node_set_free (&next_nodes);
2942 return err;
2943 }
2944 }
2945 if (cur_state)
2946 {
2947 err = check_arrival_add_next_nodes (mctx, str_idx,
2948 &cur_state->non_eps_nodes,
2949 &next_nodes);
2950 if (BE (err != REG_NOERROR, 0))
2951 {
2952 re_node_set_free (&next_nodes);
2953 return err;
2954 }
2955 }
2956 ++str_idx;
2957 if (next_nodes.nelem)
2958 {
2959 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2960 if (BE (err != REG_NOERROR, 0))
2961 {
2962 re_node_set_free (&next_nodes);
2963 return err;
2964 }
2965 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2966 subexp_num, type);
2967 if (BE (err != REG_NOERROR, 0))
2968 {
2969 re_node_set_free (&next_nodes);
2970 return err;
2971 }
2972 }
2973 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2974 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976 {
2977 re_node_set_free (&next_nodes);
2978 return err;
2979 }
2980 mctx->state_log[str_idx] = cur_state;
2981 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982 }
2983 re_node_set_free (&next_nodes);
2984 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2985 : &mctx->state_log[last_str]->nodes);
2986 path->next_idx = str_idx;
2987
2988 /* Fix MCTX. */
2989 mctx->state_log = backup_state_log;
2990 mctx->input.cur_idx = backup_cur_idx;
2991
2992 /* Then check the current node set has the node LAST_NODE. */
2993 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2994 return REG_NOERROR;
2995
2996 return REG_NOMATCH;
2997 }
2998
2999 /* Helper functions for check_arrival. */
3000
3001 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3002 to NEXT_NODES.
3003 TODO: This function is similar to the functions transit_state*(),
3004 however this function has many additional works.
3005 Can't we unify them? */
3006
3007 static reg_errcode_t
3008 __attribute_warn_unused_result__
3009 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3010 re_node_set *cur_nodes, re_node_set *next_nodes)
3011 {
3012 const re_dfa_t *const dfa = mctx->dfa;
3013 bool ok;
3014 Idx cur_idx;
3015 #ifdef RE_ENABLE_I18N
3016 reg_errcode_t err = REG_NOERROR;
3017 #endif
3018 re_node_set union_set;
3019 re_node_set_init_empty (&union_set);
3020 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3021 {
3022 int naccepted = 0;
3023 Idx cur_node = cur_nodes->elems[cur_idx];
3024 #ifdef DEBUG
3025 re_token_type_t type = dfa->nodes[cur_node].type;
3026 assert (!IS_EPSILON_NODE (type));
3027 #endif
3028 #ifdef RE_ENABLE_I18N
3029 /* If the node may accept "multi byte". */
3030 if (dfa->nodes[cur_node].accept_mb)
3031 {
3032 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033 str_idx);
3034 if (naccepted > 1)
3035 {
3036 re_dfastate_t *dest_state;
3037 Idx next_node = dfa->nexts[cur_node];
3038 Idx next_idx = str_idx + naccepted;
3039 dest_state = mctx->state_log[next_idx];
3040 re_node_set_empty (&union_set);
3041 if (dest_state)
3042 {
3043 err = re_node_set_merge (&union_set, &dest_state->nodes);
3044 if (BE (err != REG_NOERROR, 0))
3045 {
3046 re_node_set_free (&union_set);
3047 return err;
3048 }
3049 }
3050 ok = re_node_set_insert (&union_set, next_node);
3051 if (BE (! ok, 0))
3052 {
3053 re_node_set_free (&union_set);
3054 return REG_ESPACE;
3055 }
3056 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057 &union_set);
3058 if (BE (mctx->state_log[next_idx] == NULL
3059 && err != REG_NOERROR, 0))
3060 {
3061 re_node_set_free (&union_set);
3062 return err;
3063 }
3064 }
3065 }
3066 #endif /* RE_ENABLE_I18N */
3067 if (naccepted
3068 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3069 {
3070 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071 if (BE (! ok, 0))
3072 {
3073 re_node_set_free (&union_set);
3074 return REG_ESPACE;
3075 }
3076 }
3077 }
3078 re_node_set_free (&union_set);
3079 return REG_NOERROR;
3080 }
3081
3082 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083 CUR_NODES, however exclude the nodes which are:
3084 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086 */
3087
3088 static reg_errcode_t
3089 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3090 Idx ex_subexp, int type)
3091 {
3092 reg_errcode_t err;
3093 Idx idx, outside_node;
3094 re_node_set new_nodes;
3095 #ifdef DEBUG
3096 assert (cur_nodes->nelem);
3097 #endif
3098 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3099 if (BE (err != REG_NOERROR, 0))
3100 return err;
3101 /* Create a new node set NEW_NODES with the nodes which are epsilon
3102 closures of the node in CUR_NODES. */
3103
3104 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3105 {
3106 Idx cur_node = cur_nodes->elems[idx];
3107 const re_node_set *eclosure = dfa->eclosures + cur_node;
3108 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3109 if (outside_node == -1)
3110 {
3111 /* There are no problematic nodes, just merge them. */
3112 err = re_node_set_merge (&new_nodes, eclosure);
3113 if (BE (err != REG_NOERROR, 0))
3114 {
3115 re_node_set_free (&new_nodes);
3116 return err;
3117 }
3118 }
3119 else
3120 {
3121 /* There are problematic nodes, re-calculate incrementally. */
3122 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3123 ex_subexp, type);
3124 if (BE (err != REG_NOERROR, 0))
3125 {
3126 re_node_set_free (&new_nodes);
3127 return err;
3128 }
3129 }
3130 }
3131 re_node_set_free (cur_nodes);
3132 *cur_nodes = new_nodes;
3133 return REG_NOERROR;
3134 }
3135
3136 /* Helper function for check_arrival_expand_ecl.
3137 Check incrementally the epsilon closure of TARGET, and if it isn't
3138 problematic append it to DST_NODES. */
3139
3140 static reg_errcode_t
3141 __attribute_warn_unused_result__
3142 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3143 Idx target, Idx ex_subexp, int type)
3144 {
3145 Idx cur_node;
3146 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3147 {
3148 bool ok;
3149
3150 if (dfa->nodes[cur_node].type == type
3151 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3152 {
3153 if (type == OP_CLOSE_SUBEXP)
3154 {
3155 ok = re_node_set_insert (dst_nodes, cur_node);
3156 if (BE (! ok, 0))
3157 return REG_ESPACE;
3158 }
3159 break;
3160 }
3161 ok = re_node_set_insert (dst_nodes, cur_node);
3162 if (BE (! ok, 0))
3163 return REG_ESPACE;
3164 if (dfa->edests[cur_node].nelem == 0)
3165 break;
3166 if (dfa->edests[cur_node].nelem == 2)
3167 {
3168 reg_errcode_t err;
3169 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3170 dfa->edests[cur_node].elems[1],
3171 ex_subexp, type);
3172 if (BE (err != REG_NOERROR, 0))
3173 return err;
3174 }
3175 cur_node = dfa->edests[cur_node].elems[0];
3176 }
3177 return REG_NOERROR;
3178 }
3179
3180
3181 /* For all the back references in the current state, calculate the
3182 destination of the back references by the appropriate entry
3183 in MCTX->BKREF_ENTS. */
3184
3185 static reg_errcode_t
3186 __attribute_warn_unused_result__
3187 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3188 Idx cur_str, Idx subexp_num, int type)
3189 {
3190 const re_dfa_t *const dfa = mctx->dfa;
3191 reg_errcode_t err;
3192 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3193 struct re_backref_cache_entry *ent;
3194
3195 if (cache_idx_start == -1)
3196 return REG_NOERROR;
3197
3198 restart:
3199 ent = mctx->bkref_ents + cache_idx_start;
3200 do
3201 {
3202 Idx to_idx, next_node;
3203
3204 /* Is this entry ENT is appropriate? */
3205 if (!re_node_set_contains (cur_nodes, ent->node))
3206 continue; /* No. */
3207
3208 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3209 /* Calculate the destination of the back reference, and append it
3210 to MCTX->STATE_LOG. */
3211 if (to_idx == cur_str)
3212 {
3213 /* The backreference did epsilon transit, we must re-check all the
3214 node in the current state. */
3215 re_node_set new_dests;
3216 reg_errcode_t err2, err3;
3217 next_node = dfa->edests[ent->node].elems[0];
3218 if (re_node_set_contains (cur_nodes, next_node))
3219 continue;
3220 err = re_node_set_init_1 (&new_dests, next_node);
3221 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3222 err3 = re_node_set_merge (cur_nodes, &new_dests);
3223 re_node_set_free (&new_dests);
3224 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3225 || err3 != REG_NOERROR, 0))
3226 {
3227 err = (err != REG_NOERROR ? err
3228 : (err2 != REG_NOERROR ? err2 : err3));
3229 return err;
3230 }
3231 /* TODO: It is still inefficient... */
3232 goto restart;
3233 }
3234 else
3235 {
3236 re_node_set union_set;
3237 next_node = dfa->nexts[ent->node];
3238 if (mctx->state_log[to_idx])
3239 {
3240 bool ok;
3241 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3242 next_node))
3243 continue;
3244 err = re_node_set_init_copy (&union_set,
3245 &mctx->state_log[to_idx]->nodes);
3246 ok = re_node_set_insert (&union_set, next_node);
3247 if (BE (err != REG_NOERROR || ! ok, 0))
3248 {
3249 re_node_set_free (&union_set);
3250 err = err != REG_NOERROR ? err : REG_ESPACE;
3251 return err;
3252 }
3253 }
3254 else
3255 {
3256 err = re_node_set_init_1 (&union_set, next_node);
3257 if (BE (err != REG_NOERROR, 0))
3258 return err;
3259 }
3260 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3261 re_node_set_free (&union_set);
3262 if (BE (mctx->state_log[to_idx] == NULL
3263 && err != REG_NOERROR, 0))
3264 return err;
3265 }
3266 }
3267 while (ent++->more);
3268 return REG_NOERROR;
3269 }
3270
3271 /* Build transition table for the state.
3272 Return true if successful. */
3273
3274 static bool
3275 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3276 {
3277 reg_errcode_t err;
3278 Idx i, j;
3279 int ch;
3280 bool need_word_trtable = false;
3281 bitset_word_t elem, mask;
3282 bool dests_node_malloced = false;
3283 bool dest_states_malloced = false;
3284 Idx ndests; /* Number of the destination states from 'state'. */
3285 re_dfastate_t **trtable;
3286 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3287 re_node_set follows, *dests_node;
3288 bitset_t *dests_ch;
3289 bitset_t acceptable;
3290
3291 struct dests_alloc
3292 {
3293 re_node_set dests_node[SBC_MAX];
3294 bitset_t dests_ch[SBC_MAX];
3295 } *dests_alloc;
3296
3297 /* We build DFA states which corresponds to the destination nodes
3298 from 'state'. 'dests_node[i]' represents the nodes which i-th
3299 destination state contains, and 'dests_ch[i]' represents the
3300 characters which i-th destination state accepts. */
3301 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3302 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3303 else
3304 {
3305 dests_alloc = re_malloc (struct dests_alloc, 1);
3306 if (BE (dests_alloc == NULL, 0))
3307 return false;
3308 dests_node_malloced = true;
3309 }
3310 dests_node = dests_alloc->dests_node;
3311 dests_ch = dests_alloc->dests_ch;
3312
3313 /* Initialize transition table. */
3314 state->word_trtable = state->trtable = NULL;
3315
3316 /* At first, group all nodes belonging to 'state' into several
3317 destinations. */
3318 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319 if (BE (ndests <= 0, 0))
3320 {
3321 if (dests_node_malloced)
3322 re_free (dests_alloc);
3323 /* Return false in case of an error, true otherwise. */
3324 if (ndests == 0)
3325 {
3326 state->trtable = (re_dfastate_t **)
3327 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3328 if (BE (state->trtable == NULL, 0))
3329 return false;
3330 return true;
3331 }
3332 return false;
3333 }
3334
3335 err = re_node_set_alloc (&follows, ndests + 1);
3336 if (BE (err != REG_NOERROR, 0))
3337 goto out_free;
3338
3339 /* Avoid arithmetic overflow in size calculation. */
3340 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3341 / (3 * sizeof (re_dfastate_t *)))
3342 < ndests),
3343 0))
3344 goto out_free;
3345
3346 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3347 + ndests * 3 * sizeof (re_dfastate_t *)))
3348 dest_states = (re_dfastate_t **)
3349 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3350 else
3351 {
3352 dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3353 if (BE (dest_states == NULL, 0))
3354 {
3355 out_free:
3356 if (dest_states_malloced)
3357 re_free (dest_states);
3358 re_node_set_free (&follows);
3359 for (i = 0; i < ndests; ++i)
3360 re_node_set_free (dests_node + i);
3361 if (dests_node_malloced)
3362 re_free (dests_alloc);
3363 return false;
3364 }
3365 dest_states_malloced = true;
3366 }
3367 dest_states_word = dest_states + ndests;
3368 dest_states_nl = dest_states_word + ndests;
3369 bitset_empty (acceptable);
3370
3371 /* Then build the states for all destinations. */
3372 for (i = 0; i < ndests; ++i)
3373 {
3374 Idx next_node;
3375 re_node_set_empty (&follows);
3376 /* Merge the follows of this destination states. */
3377 for (j = 0; j < dests_node[i].nelem; ++j)
3378 {
3379 next_node = dfa->nexts[dests_node[i].elems[j]];
3380 if (next_node != -1)
3381 {
3382 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3383 if (BE (err != REG_NOERROR, 0))
3384 goto out_free;
3385 }
3386 }
3387 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3388 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3389 goto out_free;
3390 /* If the new state has context constraint,
3391 build appropriate states for these contexts. */
3392 if (dest_states[i]->has_constraint)
3393 {
3394 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3395 CONTEXT_WORD);
3396 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3397 goto out_free;
3398
3399 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3400 need_word_trtable = true;
3401
3402 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3403 CONTEXT_NEWLINE);
3404 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3405 goto out_free;
3406 }
3407 else
3408 {
3409 dest_states_word[i] = dest_states[i];
3410 dest_states_nl[i] = dest_states[i];
3411 }
3412 bitset_merge (acceptable, dests_ch[i]);
3413 }
3414
3415 if (!BE (need_word_trtable, 0))
3416 {
3417 /* We don't care about whether the following character is a word
3418 character, or we are in a single-byte character set so we can
3419 discern by looking at the character code: allocate a
3420 256-entry transition table. */
3421 trtable = state->trtable =
3422 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3423 if (BE (trtable == NULL, 0))
3424 goto out_free;
3425
3426 /* For all characters ch...: */
3427 for (i = 0; i < BITSET_WORDS; ++i)
3428 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3429 elem;
3430 mask <<= 1, elem >>= 1, ++ch)
3431 if (BE (elem & 1, 0))
3432 {
3433 /* There must be exactly one destination which accepts
3434 character ch. See group_nodes_into_DFAstates. */
3435 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3436 ;
3437
3438 /* j-th destination accepts the word character ch. */
3439 if (dfa->word_char[i] & mask)
3440 trtable[ch] = dest_states_word[j];
3441 else
3442 trtable[ch] = dest_states[j];
3443 }
3444 }
3445 else
3446 {
3447 /* We care about whether the following character is a word
3448 character, and we are in a multi-byte character set: discern
3449 by looking at the character code: build two 256-entry
3450 transition tables, one starting at trtable[0] and one
3451 starting at trtable[SBC_MAX]. */
3452 trtable = state->word_trtable =
3453 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3454 if (BE (trtable == NULL, 0))
3455 goto out_free;
3456
3457 /* For all characters ch...: */
3458 for (i = 0; i < BITSET_WORDS; ++i)
3459 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3460 elem;
3461 mask <<= 1, elem >>= 1, ++ch)
3462 if (BE (elem & 1, 0))
3463 {
3464 /* There must be exactly one destination which accepts
3465 character ch. See group_nodes_into_DFAstates. */
3466 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3467 ;
3468
3469 /* j-th destination accepts the word character ch. */
3470 trtable[ch] = dest_states[j];
3471 trtable[ch + SBC_MAX] = dest_states_word[j];
3472 }
3473 }
3474
3475 /* new line */
3476 if (bitset_contain (acceptable, NEWLINE_CHAR))
3477 {
3478 /* The current state accepts newline character. */
3479 for (j = 0; j < ndests; ++j)
3480 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3481 {
3482 /* k-th destination accepts newline character. */
3483 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3484 if (need_word_trtable)
3485 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3486 /* There must be only one destination which accepts
3487 newline. See group_nodes_into_DFAstates. */
3488 break;
3489 }
3490 }
3491
3492 if (dest_states_malloced)
3493 re_free (dest_states);
3494
3495 re_node_set_free (&follows);
3496 for (i = 0; i < ndests; ++i)
3497 re_node_set_free (dests_node + i);
3498
3499 if (dests_node_malloced)
3500 re_free (dests_alloc);
3501
3502 return true;
3503 }
3504
3505 /* Group all nodes belonging to STATE into several destinations.
3506 Then for all destinations, set the nodes belonging to the destination
3507 to DESTS_NODE[i] and set the characters accepted by the destination
3508 to DEST_CH[i]. This function return the number of destinations. */
3509
3510 static Idx
3511 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3512 re_node_set *dests_node, bitset_t *dests_ch)
3513 {
3514 reg_errcode_t err;
3515 bool ok;
3516 Idx i, j, k;
3517 Idx ndests; /* Number of the destinations from 'state'. */
3518 bitset_t accepts; /* Characters a node can accept. */
3519 const re_node_set *cur_nodes = &state->nodes;
3520 bitset_empty (accepts);
3521 ndests = 0;
3522
3523 /* For all the nodes belonging to 'state', */
3524 for (i = 0; i < cur_nodes->nelem; ++i)
3525 {
3526 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527 re_token_type_t type = node->type;
3528 unsigned int constraint = node->constraint;
3529
3530 /* Enumerate all single byte character this node can accept. */
3531 if (type == CHARACTER)
3532 bitset_set (accepts, node->opr.c);
3533 else if (type == SIMPLE_BRACKET)
3534 {
3535 bitset_merge (accepts, node->opr.sbcset);
3536 }
3537 else if (type == OP_PERIOD)
3538 {
3539 #ifdef RE_ENABLE_I18N
3540 if (dfa->mb_cur_max > 1)
3541 bitset_merge (accepts, dfa->sb_char);
3542 else
3543 #endif
3544 bitset_set_all (accepts);
3545 if (!(dfa->syntax & RE_DOT_NEWLINE))
3546 bitset_clear (accepts, '\n');
3547 if (dfa->syntax & RE_DOT_NOT_NULL)
3548 bitset_clear (accepts, '\0');
3549 }
3550 #ifdef RE_ENABLE_I18N
3551 else if (type == OP_UTF8_PERIOD)
3552 {
3553 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3554 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3555 else
3556 bitset_merge (accepts, utf8_sb_map);
3557 if (!(dfa->syntax & RE_DOT_NEWLINE))
3558 bitset_clear (accepts, '\n');
3559 if (dfa->syntax & RE_DOT_NOT_NULL)
3560 bitset_clear (accepts, '\0');
3561 }
3562 #endif
3563 else
3564 continue;
3565
3566 /* Check the 'accepts' and sift the characters which are not
3567 match it the context. */
3568 if (constraint)
3569 {
3570 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3571 {
3572 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3573 bitset_empty (accepts);
3574 if (accepts_newline)
3575 bitset_set (accepts, NEWLINE_CHAR);
3576 else
3577 continue;
3578 }
3579 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3580 {
3581 bitset_empty (accepts);
3582 continue;
3583 }
3584
3585 if (constraint & NEXT_WORD_CONSTRAINT)
3586 {
3587 bitset_word_t any_set = 0;
3588 if (type == CHARACTER && !node->word_char)
3589 {
3590 bitset_empty (accepts);
3591 continue;
3592 }
3593 #ifdef RE_ENABLE_I18N
3594 if (dfa->mb_cur_max > 1)
3595 for (j = 0; j < BITSET_WORDS; ++j)
3596 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3597 else
3598 #endif
3599 for (j = 0; j < BITSET_WORDS; ++j)
3600 any_set |= (accepts[j] &= dfa->word_char[j]);
3601 if (!any_set)
3602 continue;
3603 }
3604 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3605 {
3606 bitset_word_t any_set = 0;
3607 if (type == CHARACTER && node->word_char)
3608 {
3609 bitset_empty (accepts);
3610 continue;
3611 }
3612 #ifdef RE_ENABLE_I18N
3613 if (dfa->mb_cur_max > 1)
3614 for (j = 0; j < BITSET_WORDS; ++j)
3615 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3616 else
3617 #endif
3618 for (j = 0; j < BITSET_WORDS; ++j)
3619 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3620 if (!any_set)
3621 continue;
3622 }
3623 }
3624
3625 /* Then divide 'accepts' into DFA states, or create a new
3626 state. Above, we make sure that accepts is not empty. */
3627 for (j = 0; j < ndests; ++j)
3628 {
3629 bitset_t intersec; /* Intersection sets, see below. */
3630 bitset_t remains;
3631 /* Flags, see below. */
3632 bitset_word_t has_intersec, not_subset, not_consumed;
3633
3634 /* Optimization, skip if this state doesn't accept the character. */
3635 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3636 continue;
3637
3638 /* Enumerate the intersection set of this state and 'accepts'. */
3639 has_intersec = 0;
3640 for (k = 0; k < BITSET_WORDS; ++k)
3641 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3642 /* And skip if the intersection set is empty. */
3643 if (!has_intersec)
3644 continue;
3645
3646 /* Then check if this state is a subset of 'accepts'. */
3647 not_subset = not_consumed = 0;
3648 for (k = 0; k < BITSET_WORDS; ++k)
3649 {
3650 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3651 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3652 }
3653
3654 /* If this state isn't a subset of 'accepts', create a
3655 new group state, which has the 'remains'. */
3656 if (not_subset)
3657 {
3658 bitset_copy (dests_ch[ndests], remains);
3659 bitset_copy (dests_ch[j], intersec);
3660 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3661 if (BE (err != REG_NOERROR, 0))
3662 goto error_return;
3663 ++ndests;
3664 }
3665
3666 /* Put the position in the current group. */
3667 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3668 if (BE (! ok, 0))
3669 goto error_return;
3670
3671 /* If all characters are consumed, go to next node. */
3672 if (!not_consumed)
3673 break;
3674 }
3675 /* Some characters remain, create a new group. */
3676 if (j == ndests)
3677 {
3678 bitset_copy (dests_ch[ndests], accepts);
3679 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3680 if (BE (err != REG_NOERROR, 0))
3681 goto error_return;
3682 ++ndests;
3683 bitset_empty (accepts);
3684 }
3685 }
3686 return ndests;
3687 error_return:
3688 for (j = 0; j < ndests; ++j)
3689 re_node_set_free (dests_node + j);
3690 return -1;
3691 }
3692
3693 #ifdef RE_ENABLE_I18N
3694 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3695 Return the number of the bytes the node accepts.
3696 STR_IDX is the current index of the input string.
3697
3698 This function handles the nodes which can accept one character, or
3699 one collating element like '.', '[a-z]', opposite to the other nodes
3700 can only accept one byte. */
3701
3702 # ifdef _LIBC
3703 # include <locale/weight.h>
3704 # endif
3705
3706 static int
3707 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3708 const re_string_t *input, Idx str_idx)
3709 {
3710 const re_token_t *node = dfa->nodes + node_idx;
3711 int char_len, elem_len;
3712 Idx i;
3713
3714 if (BE (node->type == OP_UTF8_PERIOD, 0))
3715 {
3716 unsigned char c = re_string_byte_at (input, str_idx), d;
3717 if (BE (c < 0xc2, 1))
3718 return 0;
3719
3720 if (str_idx + 2 > input->len)
3721 return 0;
3722
3723 d = re_string_byte_at (input, str_idx + 1);
3724 if (c < 0xe0)
3725 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3726 else if (c < 0xf0)
3727 {
3728 char_len = 3;
3729 if (c == 0xe0 && d < 0xa0)
3730 return 0;
3731 }
3732 else if (c < 0xf8)
3733 {
3734 char_len = 4;
3735 if (c == 0xf0 && d < 0x90)
3736 return 0;
3737 }
3738 else if (c < 0xfc)
3739 {
3740 char_len = 5;
3741 if (c == 0xf8 && d < 0x88)
3742 return 0;
3743 }
3744 else if (c < 0xfe)
3745 {
3746 char_len = 6;
3747 if (c == 0xfc && d < 0x84)
3748 return 0;
3749 }
3750 else
3751 return 0;
3752
3753 if (str_idx + char_len > input->len)
3754 return 0;
3755
3756 for (i = 1; i < char_len; ++i)
3757 {
3758 d = re_string_byte_at (input, str_idx + i);
3759 if (d < 0x80 || d > 0xbf)
3760 return 0;
3761 }
3762 return char_len;
3763 }
3764
3765 char_len = re_string_char_size_at (input, str_idx);
3766 if (node->type == OP_PERIOD)
3767 {
3768 if (char_len <= 1)
3769 return 0;
3770 /* FIXME: I don't think this if is needed, as both '\n'
3771 and '\0' are char_len == 1. */
3772 /* '.' accepts any one character except the following two cases. */
3773 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3774 re_string_byte_at (input, str_idx) == '\n') ||
3775 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3776 re_string_byte_at (input, str_idx) == '\0'))
3777 return 0;
3778 return char_len;
3779 }
3780
3781 elem_len = re_string_elem_size_at (input, str_idx);
3782 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783 return 0;
3784
3785 if (node->type == COMPLEX_BRACKET)
3786 {
3787 const re_charset_t *cset = node->opr.mbcset;
3788 # ifdef _LIBC
3789 const unsigned char *pin
3790 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3791 Idx j;
3792 uint32_t nrules;
3793 # endif /* _LIBC */
3794 int match_len = 0;
3795 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3796 ? re_string_wchar_at (input, str_idx) : 0);
3797
3798 /* match with multibyte character? */
3799 for (i = 0; i < cset->nmbchars; ++i)
3800 if (wc == cset->mbchars[i])
3801 {
3802 match_len = char_len;
3803 goto check_node_accept_bytes_match;
3804 }
3805 /* match with character_class? */
3806 for (i = 0; i < cset->nchar_classes; ++i)
3807 {
3808 wctype_t wt = cset->char_classes[i];
3809 if (__iswctype (wc, wt))
3810 {
3811 match_len = char_len;
3812 goto check_node_accept_bytes_match;
3813 }
3814 }
3815
3816 # ifdef _LIBC
3817 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3818 if (nrules != 0)
3819 {
3820 unsigned int in_collseq = 0;
3821 const int32_t *table, *indirect;
3822 const unsigned char *weights, *extra;
3823 const char *collseqwc;
3824
3825 /* match with collating_symbol? */
3826 if (cset->ncoll_syms)
3827 extra = (const unsigned char *)
3828 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3829 for (i = 0; i < cset->ncoll_syms; ++i)
3830 {
3831 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3832 /* Compare the length of input collating element and
3833 the length of current collating element. */
3834 if (*coll_sym != elem_len)
3835 continue;
3836 /* Compare each bytes. */
3837 for (j = 0; j < *coll_sym; j++)
3838 if (pin[j] != coll_sym[1 + j])
3839 break;
3840 if (j == *coll_sym)
3841 {
3842 /* Match if every bytes is equal. */
3843 match_len = j;
3844 goto check_node_accept_bytes_match;
3845 }
3846 }
3847
3848 if (cset->nranges)
3849 {
3850 if (elem_len <= char_len)
3851 {
3852 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3853 in_collseq = __collseq_table_lookup (collseqwc, wc);
3854 }
3855 else
3856 in_collseq = find_collation_sequence_value (pin, elem_len);
3857 }
3858 /* match with range expression? */
3859 /* FIXME: Implement rational ranges here, too. */
3860 for (i = 0; i < cset->nranges; ++i)
3861 if (cset->range_starts[i] <= in_collseq
3862 && in_collseq <= cset->range_ends[i])
3863 {
3864 match_len = elem_len;
3865 goto check_node_accept_bytes_match;
3866 }
3867
3868 /* match with equivalence_class? */
3869 if (cset->nequiv_classes)
3870 {
3871 const unsigned char *cp = pin;
3872 table = (const int32_t *)
3873 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874 weights = (const unsigned char *)
3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876 extra = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878 indirect = (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3880 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3881 int32_t rule = idx >> 24;
3882 idx &= 0xffffff;
3883 if (idx > 0)
3884 {
3885 size_t weight_len = weights[idx];
3886 for (i = 0; i < cset->nequiv_classes; ++i)
3887 {
3888 int32_t equiv_class_idx = cset->equiv_classes[i];
3889 int32_t equiv_class_rule = equiv_class_idx >> 24;
3890 equiv_class_idx &= 0xffffff;
3891 if (weights[equiv_class_idx] == weight_len
3892 && equiv_class_rule == rule
3893 && memcmp (weights + idx + 1,
3894 weights + equiv_class_idx + 1,
3895 weight_len) == 0)
3896 {
3897 match_len = elem_len;
3898 goto check_node_accept_bytes_match;
3899 }
3900 }
3901 }
3902 }
3903 }
3904 else
3905 # endif /* _LIBC */
3906 {
3907 /* match with range expression? */
3908 for (i = 0; i < cset->nranges; ++i)
3909 {
3910 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3911 {
3912 match_len = char_len;
3913 goto check_node_accept_bytes_match;
3914 }
3915 }
3916 }
3917 check_node_accept_bytes_match:
3918 if (!cset->non_match)
3919 return match_len;
3920 else
3921 {
3922 if (match_len > 0)
3923 return 0;
3924 else
3925 return (elem_len > char_len) ? elem_len : char_len;
3926 }
3927 }
3928 return 0;
3929 }
3930
3931 # ifdef _LIBC
3932 static unsigned int
3933 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3934 {
3935 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3936 if (nrules == 0)
3937 {
3938 if (mbs_len == 1)
3939 {
3940 /* No valid character. Match it as a single byte character. */
3941 const unsigned char *collseq = (const unsigned char *)
3942 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3943 return collseq[mbs[0]];
3944 }
3945 return UINT_MAX;
3946 }
3947 else
3948 {
3949 int32_t idx;
3950 const unsigned char *extra = (const unsigned char *)
3951 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3952 int32_t extrasize = (const unsigned char *)
3953 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3954
3955 for (idx = 0; idx < extrasize;)
3956 {
3957 int mbs_cnt;
3958 bool found = false;
3959 int32_t elem_mbs_len;
3960 /* Skip the name of collating element name. */
3961 idx = idx + extra[idx] + 1;
3962 elem_mbs_len = extra[idx++];
3963 if (mbs_len == elem_mbs_len)
3964 {
3965 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3966 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3967 break;
3968 if (mbs_cnt == elem_mbs_len)
3969 /* Found the entry. */
3970 found = true;
3971 }
3972 /* Skip the byte sequence of the collating element. */
3973 idx += elem_mbs_len;
3974 /* Adjust for the alignment. */
3975 idx = (idx + 3) & ~3;
3976 /* Skip the collation sequence value. */
3977 idx += sizeof (uint32_t);
3978 /* Skip the wide char sequence of the collating element. */
3979 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3980 /* If we found the entry, return the sequence value. */
3981 if (found)
3982 return *(uint32_t *) (extra + idx);
3983 /* Skip the collation sequence value. */
3984 idx += sizeof (uint32_t);
3985 }
3986 return UINT_MAX;
3987 }
3988 }
3989 # endif /* _LIBC */
3990 #endif /* RE_ENABLE_I18N */
3991
3992 /* Check whether the node accepts the byte which is IDX-th
3993 byte of the INPUT. */
3994
3995 static bool
3996 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3997 Idx idx)
3998 {
3999 unsigned char ch;
4000 ch = re_string_byte_at (&mctx->input, idx);
4001 switch (node->type)
4002 {
4003 case CHARACTER:
4004 if (node->opr.c != ch)
4005 return false;
4006 break;
4007
4008 case SIMPLE_BRACKET:
4009 if (!bitset_contain (node->opr.sbcset, ch))
4010 return false;
4011 break;
4012
4013 #ifdef RE_ENABLE_I18N
4014 case OP_UTF8_PERIOD:
4015 if (ch >= ASCII_CHARS)
4016 return false;
4017 FALLTHROUGH;
4018 #endif
4019 case OP_PERIOD:
4020 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4021 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4022 return false;
4023 break;
4024
4025 default:
4026 return false;
4027 }
4028
4029 if (node->constraint)
4030 {
4031 /* The node has constraints. Check whether the current context
4032 satisfies the constraints. */
4033 unsigned int context = re_string_context_at (&mctx->input, idx,
4034 mctx->eflags);
4035 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4036 return false;
4037 }
4038
4039 return true;
4040 }
4041
4042 /* Extend the buffers, if the buffers have run out. */
4043
4044 static reg_errcode_t
4045 __attribute_warn_unused_result__
4046 extend_buffers (re_match_context_t *mctx, int min_len)
4047 {
4048 reg_errcode_t ret;
4049 re_string_t *pstr = &mctx->input;
4050
4051 /* Avoid overflow. */
4052 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4053 <= pstr->bufs_len, 0))
4054 return REG_ESPACE;
4055
4056 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4057 ret = re_string_realloc_buffers (pstr,
4058 MAX (min_len,
4059 MIN (pstr->len, pstr->bufs_len * 2)));
4060 if (BE (ret != REG_NOERROR, 0))
4061 return ret;
4062
4063 if (mctx->state_log != NULL)
4064 {
4065 /* And double the length of state_log. */
4066 /* XXX We have no indication of the size of this buffer. If this
4067 allocation fail we have no indication that the state_log array
4068 does not have the right size. */
4069 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070 pstr->bufs_len + 1);
4071 if (BE (new_array == NULL, 0))
4072 return REG_ESPACE;
4073 mctx->state_log = new_array;
4074 }
4075
4076 /* Then reconstruct the buffers. */
4077 if (pstr->icase)
4078 {
4079 #ifdef RE_ENABLE_I18N
4080 if (pstr->mb_cur_max > 1)
4081 {
4082 ret = build_wcs_upper_buffer (pstr);
4083 if (BE (ret != REG_NOERROR, 0))
4084 return ret;
4085 }
4086 else
4087 #endif /* RE_ENABLE_I18N */
4088 build_upper_buffer (pstr);
4089 }
4090 else
4091 {
4092 #ifdef RE_ENABLE_I18N
4093 if (pstr->mb_cur_max > 1)
4094 build_wcs_buffer (pstr);
4095 else
4096 #endif /* RE_ENABLE_I18N */
4097 {
4098 if (pstr->trans != NULL)
4099 re_string_translate_buffer (pstr);
4100 }
4101 }
4102 return REG_NOERROR;
4103 }
4104
4105 \f
4106 /* Functions for matching context. */
4107
4108 /* Initialize MCTX. */
4109
4110 static reg_errcode_t
4111 __attribute_warn_unused_result__
4112 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4113 {
4114 mctx->eflags = eflags;
4115 mctx->match_last = -1;
4116 if (n > 0)
4117 {
4118 /* Avoid overflow. */
4119 size_t max_object_size =
4120 MAX (sizeof (struct re_backref_cache_entry),
4121 sizeof (re_sub_match_top_t *));
4122 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4123 return REG_ESPACE;
4124
4125 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128 return REG_ESPACE;
4129 }
4130 /* Already zero-ed by the caller.
4131 else
4132 mctx->bkref_ents = NULL;
4133 mctx->nbkref_ents = 0;
4134 mctx->nsub_tops = 0; */
4135 mctx->abkref_ents = n;
4136 mctx->max_mb_elem_len = 1;
4137 mctx->asub_tops = n;
4138 return REG_NOERROR;
4139 }
4140
4141 /* Clean the entries which depend on the current input in MCTX.
4142 This function must be invoked when the matcher changes the start index
4143 of the input, or changes the input string. */
4144
4145 static void
4146 match_ctx_clean (re_match_context_t *mctx)
4147 {
4148 Idx st_idx;
4149 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4150 {
4151 Idx sl_idx;
4152 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4153 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4154 {
4155 re_sub_match_last_t *last = top->lasts[sl_idx];
4156 re_free (last->path.array);
4157 re_free (last);
4158 }
4159 re_free (top->lasts);
4160 if (top->path)
4161 {
4162 re_free (top->path->array);
4163 re_free (top->path);
4164 }
4165 re_free (top);
4166 }
4167
4168 mctx->nsub_tops = 0;
4169 mctx->nbkref_ents = 0;
4170 }
4171
4172 /* Free all the memory associated with MCTX. */
4173
4174 static void
4175 match_ctx_free (re_match_context_t *mctx)
4176 {
4177 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4178 match_ctx_clean (mctx);
4179 re_free (mctx->sub_tops);
4180 re_free (mctx->bkref_ents);
4181 }
4182
4183 /* Add a new backreference entry to MCTX.
4184 Note that we assume that caller never call this function with duplicate
4185 entry, and call with STR_IDX which isn't smaller than any existing entry.
4186 */
4187
4188 static reg_errcode_t
4189 __attribute_warn_unused_result__
4190 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4191 Idx to)
4192 {
4193 if (mctx->nbkref_ents >= mctx->abkref_ents)
4194 {
4195 struct re_backref_cache_entry* new_entry;
4196 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4197 mctx->abkref_ents * 2);
4198 if (BE (new_entry == NULL, 0))
4199 {
4200 re_free (mctx->bkref_ents);
4201 return REG_ESPACE;
4202 }
4203 mctx->bkref_ents = new_entry;
4204 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4205 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4206 mctx->abkref_ents *= 2;
4207 }
4208 if (mctx->nbkref_ents > 0
4209 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4210 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4211
4212 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4213 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4214 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4215 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4216
4217 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4218 If bit N is clear, means that this entry won't epsilon-transition to
4219 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4220 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4221 such node.
4222
4223 A backreference does not epsilon-transition unless it is empty, so set
4224 to all zeros if FROM != TO. */
4225 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4226 = (from == to ? -1 : 0);
4227
4228 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4229 if (mctx->max_mb_elem_len < to - from)
4230 mctx->max_mb_elem_len = to - from;
4231 return REG_NOERROR;
4232 }
4233
4234 /* Return the first entry with the same str_idx, or -1 if none is
4235 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4236
4237 static Idx
4238 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4239 {
4240 Idx left, right, mid, last;
4241 last = right = mctx->nbkref_ents;
4242 for (left = 0; left < right;)
4243 {
4244 mid = (left + right) / 2;
4245 if (mctx->bkref_ents[mid].str_idx < str_idx)
4246 left = mid + 1;
4247 else
4248 right = mid;
4249 }
4250 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4251 return left;
4252 else
4253 return -1;
4254 }
4255
4256 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257 at STR_IDX. */
4258
4259 static reg_errcode_t
4260 __attribute_warn_unused_result__
4261 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4262 {
4263 #ifdef DEBUG
4264 assert (mctx->sub_tops != NULL);
4265 assert (mctx->asub_tops > 0);
4266 #endif
4267 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4268 {
4269 Idx new_asub_tops = mctx->asub_tops * 2;
4270 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4271 re_sub_match_top_t *,
4272 new_asub_tops);
4273 if (BE (new_array == NULL, 0))
4274 return REG_ESPACE;
4275 mctx->sub_tops = new_array;
4276 mctx->asub_tops = new_asub_tops;
4277 }
4278 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4279 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4280 return REG_ESPACE;
4281 mctx->sub_tops[mctx->nsub_tops]->node = node;
4282 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283 return REG_NOERROR;
4284 }
4285
4286 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4287 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4288
4289 static re_sub_match_last_t *
4290 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4291 {
4292 re_sub_match_last_t *new_entry;
4293 if (BE (subtop->nlasts == subtop->alasts, 0))
4294 {
4295 Idx new_alasts = 2 * subtop->alasts + 1;
4296 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4297 re_sub_match_last_t *,
4298 new_alasts);
4299 if (BE (new_array == NULL, 0))
4300 return NULL;
4301 subtop->lasts = new_array;
4302 subtop->alasts = new_alasts;
4303 }
4304 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4305 if (BE (new_entry != NULL, 1))
4306 {
4307 subtop->lasts[subtop->nlasts] = new_entry;
4308 new_entry->node = node;
4309 new_entry->str_idx = str_idx;
4310 ++subtop->nlasts;
4311 }
4312 return new_entry;
4313 }
4314
4315 static void
4316 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4317 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4318 {
4319 sctx->sifted_states = sifted_sts;
4320 sctx->limited_states = limited_sts;
4321 sctx->last_node = last_node;
4322 sctx->last_str_idx = last_str_idx;
4323 re_node_set_init_empty (&sctx->limits);
4324 }