]> git.proxmox.com Git - mirror_smartmontools-debian.git/blame - regex/regexec.c
import smartmontools 7.0
[mirror_smartmontools-debian.git] / regex / regexec.c
CommitLineData
832b75ed 1/* Extended regular expression matching and search library.
ff28b140 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
832b75ed
GG
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
ff28b140
TL
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
832b75ed
GG
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
ff28b140 21 Idx n);
832b75ed
GG
22static void match_ctx_clean (re_match_context_t *mctx);
23static void match_ctx_free (re_match_context_t *cache);
ff28b140
TL
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
832b75ed 29static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
ff28b140 30 Idx node, Idx str_idx);
832b75ed 31static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
ff28b140
TL
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
832b75ed 34static reg_errcode_t re_search_internal (const regex_t *preg,
ff28b140
TL
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
832b75ed
GG
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
ff28b140
TL
39static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
832b75ed 50static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
ff28b140
TL
51 Idx nregs, int regs_allocated);
52static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
832b75ed 60static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
ff28b140 61 Idx str_idx, Idx dest_node, Idx nregs,
832b75ed
GG
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
832b75ed
GG
64static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
ff28b140 67 bool fl_backtrack);
832b75ed
GG
68static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70#ifdef RE_ENABLE_I18N
ff28b140 71static int sift_states_iter_mb (const re_match_context_t *mctx,
832b75ed 72 re_sift_context_t *sctx,
ff28b140 73 Idx node_idx, Idx str_idx, Idx max_str_idx);
832b75ed 74#endif /* RE_ENABLE_I18N */
ff28b140 75static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
832b75ed 76 re_sift_context_t *sctx);
ff28b140
TL
77static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
832b75ed 81 re_sift_context_t *sctx,
ff28b140 82 Idx str_idx,
832b75ed 83 re_node_set *dest_nodes);
ff28b140 84static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
832b75ed
GG
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
ff28b140
TL
87static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
832b75ed
GG
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
ff28b140
TL
103 Idx str_idx);
104static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
832b75ed 105 re_sift_context_t *sctx,
ff28b140
TL
106 Idx str_idx, const re_node_set *candidates);
107static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112static re_dfastate_t *transit_state (reg_errcode_t *err,
832b75ed 113 re_match_context_t *mctx,
ff28b140
TL
114 re_dfastate_t *state);
115static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
832b75ed 119 re_node_set *cur_nodes,
ff28b140
TL
120 Idx str_idx);
121#if 0
122static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125#endif
832b75ed 126#ifdef RE_ENABLE_I18N
ff28b140
TL
127static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
832b75ed 129#endif /* RE_ENABLE_I18N */
ff28b140
TL
130static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
832b75ed 136 re_sub_match_last_t *sub_last,
ff28b140
TL
137 Idx bkref_node, Idx bkref_str);
138static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
832b75ed
GG
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
ff28b140 148static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
832b75ed 149 re_node_set *cur_nodes,
ff28b140
TL
150 Idx ex_subexp, int type);
151static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
832b75ed 152 re_node_set *dst_nodes,
ff28b140
TL
153 Idx target, Idx ex_subexp,
154 int type);
155static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
832b75ed 159#ifdef RE_ENABLE_I18N
ff28b140
TL
160static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
832b75ed
GG
162# ifdef _LIBC
163static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165# endif /* _LIBC */
166#endif /* RE_ENABLE_I18N */
ff28b140 167static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
832b75ed
GG
168 const re_dfastate_t *state,
169 re_node_set *states_node,
ff28b140
TL
170 bitset_t *states_ch);
171static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
832b75ed
GG
174\f
175/* Entry point for POSIX code. */
176
177/* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
179
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
ff28b140 181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
832b75ed
GG
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
184
ff28b140 185 EFLAGS specifies "execution flags" which affect matching: if
832b75ed
GG
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
188
189 We return 0 if we find a match and REG_NOMATCH if not. */
190
191int
ff28b140
TL
192regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
832b75ed
GG
194{
195 reg_errcode_t err;
ff28b140
TL
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
198
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
201
202 if (eflags & REG_STARTEND)
203 {
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
206 }
207 else
208 {
209 start = 0;
210 length = strlen (string);
211 }
212
213 lock_lock (dfa->lock);
832b75ed 214 if (preg->no_sub)
ff28b140
TL
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
832b75ed 217 else
ff28b140
TL
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
832b75ed
GG
221 return err != REG_NOERROR;
222}
ff28b140 223
832b75ed 224#ifdef _LIBC
ff28b140
TL
225libc_hidden_def (__regexec)
226
227# include <shlib-compat.h>
228versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
229
230# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231__typeof__ (__regexec) __compat_regexec;
232
233int
234attribute_compat_text_section
235__compat_regexec (const regex_t *_Restrict_ preg,
236 const char *_Restrict_ string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
238{
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
241}
242compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243# endif
832b75ed
GG
244#endif
245
246/* Entry points for GNU code. */
247
248/* re_match, re_search, re_match_2, re_search_2
249
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
253
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
256
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
261
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
265
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
ff28b140 267 and all groups is stored in REGS. (For the "_2" variants, the offsets are
832b75ed
GG
268 computed relative to the concatenation, not relative to the individual
269 strings.)
270
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
274
ff28b140
TL
275regoff_t
276re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
832b75ed 278{
ff28b140 279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
832b75ed
GG
280}
281#ifdef _LIBC
282weak_alias (__re_match, re_match)
283#endif
284
ff28b140
TL
285regoff_t
286re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
832b75ed 288{
ff28b140
TL
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
832b75ed
GG
291}
292#ifdef _LIBC
293weak_alias (__re_search, re_search)
294#endif
295
ff28b140
TL
296regoff_t
297re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
832b75ed
GG
300{
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
ff28b140 302 start, 0, regs, stop, true);
832b75ed
GG
303}
304#ifdef _LIBC
305weak_alias (__re_match_2, re_match_2)
306#endif
307
ff28b140
TL
308regoff_t
309re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
832b75ed
GG
312{
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
ff28b140 314 start, range, regs, stop, false);
832b75ed
GG
315}
316#ifdef _LIBC
317weak_alias (__re_search_2, re_search_2)
318#endif
319
ff28b140
TL
320static regoff_t
321re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
832b75ed
GG
325{
326 const char *str;
ff28b140
TL
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
832b75ed 330
ff28b140
TL
331 if (BE ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len)),
333 0))
832b75ed
GG
334 return -2;
335
336 /* Concatenate the strings. */
337 if (length2 > 0)
338 if (length1 > 0)
339 {
ff28b140 340 s = re_malloc (char, len);
832b75ed
GG
341
342 if (BE (s == NULL, 0))
343 return -2;
ff28b140
TL
344#ifdef _LIBC
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
346#else
832b75ed
GG
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
ff28b140 349#endif
832b75ed 350 str = s;
832b75ed
GG
351 }
352 else
353 str = string2;
354 else
355 str = string1;
356
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 ret_len);
ff28b140 359 re_free (s);
832b75ed
GG
360 return rval;
361}
362
363/* The parameters have the same meaning as those of re_search.
364 Additional parameters:
ff28b140 365 If RET_LEN is true the length of the match is returned (re_match style);
832b75ed
GG
366 otherwise the position of the match is returned. */
367
ff28b140
TL
368static regoff_t
369re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 bool ret_len)
832b75ed
GG
372{
373 reg_errcode_t result;
374 regmatch_t *pmatch;
ff28b140
TL
375 Idx nregs;
376 regoff_t rval;
832b75ed 377 int eflags = 0;
ff28b140
TL
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
832b75ed
GG
380
381 /* Check for out-of-range. */
382 if (BE (start < 0 || start > length, 0))
383 return -1;
ff28b140
TL
384 if (BE (length < last_start || (0 <= range && last_start < start), 0))
385 last_start = length;
386 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
387 last_start = 0;
388
389 lock_lock (dfa->lock);
832b75ed
GG
390
391 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
392 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
393
394 /* Compile fastmap if we haven't yet. */
ff28b140 395 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
832b75ed
GG
396 re_compile_fastmap (bufp);
397
398 if (BE (bufp->no_sub, 0))
399 regs = NULL;
400
401 /* We need at least 1 register. */
402 if (regs == NULL)
403 nregs = 1;
ff28b140
TL
404 else if (BE (bufp->regs_allocated == REGS_FIXED
405 && regs->num_regs <= bufp->re_nsub, 0))
832b75ed
GG
406 {
407 nregs = regs->num_regs;
408 if (BE (nregs < 1, 0))
409 {
410 /* Nothing can be copied to regs. */
411 regs = NULL;
412 nregs = 1;
413 }
414 }
415 else
416 nregs = bufp->re_nsub + 1;
417 pmatch = re_malloc (regmatch_t, nregs);
418 if (BE (pmatch == NULL, 0))
ff28b140
TL
419 {
420 rval = -2;
421 goto out;
422 }
832b75ed 423
ff28b140 424 result = re_search_internal (bufp, string, length, start, last_start, stop,
832b75ed
GG
425 nregs, pmatch, eflags);
426
427 rval = 0;
428
ff28b140 429 /* I hope we needn't fill their regs with -1's when no match was found. */
832b75ed 430 if (result != REG_NOERROR)
ff28b140 431 rval = result == REG_NOMATCH ? -1 : -2;
832b75ed
GG
432 else if (regs != NULL)
433 {
434 /* If caller wants register contents data back, copy them. */
435 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
436 bufp->regs_allocated);
437 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
438 rval = -2;
439 }
440
441 if (BE (rval == 0, 1))
442 {
443 if (ret_len)
444 {
445 assert (pmatch[0].rm_so == start);
446 rval = pmatch[0].rm_eo - start;
447 }
448 else
449 rval = pmatch[0].rm_so;
450 }
451 re_free (pmatch);
ff28b140
TL
452 out:
453 lock_unlock (dfa->lock);
832b75ed
GG
454 return rval;
455}
456
457static unsigned
ff28b140
TL
458re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
459 int regs_allocated)
832b75ed
GG
460{
461 int rval = REGS_REALLOCATE;
ff28b140
TL
462 Idx i;
463 Idx need_regs = nregs + 1;
464 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
832b75ed
GG
465 uses. */
466
467 /* Have the register data arrays been allocated? */
468 if (regs_allocated == REGS_UNALLOCATED)
469 { /* No. So allocate them with malloc. */
470 regs->start = re_malloc (regoff_t, need_regs);
471 if (BE (regs->start == NULL, 0))
472 return REGS_UNALLOCATED;
473 regs->end = re_malloc (regoff_t, need_regs);
474 if (BE (regs->end == NULL, 0))
475 {
476 re_free (regs->start);
477 return REGS_UNALLOCATED;
478 }
479 regs->num_regs = need_regs;
480 }
481 else if (regs_allocated == REGS_REALLOCATE)
482 { /* Yes. If we need more elements than were already
483 allocated, reallocate them. If we need fewer, just
484 leave it alone. */
ff28b140 485 if (BE (need_regs > regs->num_regs, 0))
832b75ed 486 {
ff28b140
TL
487 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
488 regoff_t *new_end;
489 if (BE (new_start == NULL, 0))
490 return REGS_UNALLOCATED;
491 new_end = re_realloc (regs->end, regoff_t, need_regs);
492 if (BE (new_end == NULL, 0))
832b75ed 493 {
ff28b140 494 re_free (new_start);
832b75ed
GG
495 return REGS_UNALLOCATED;
496 }
ff28b140
TL
497 regs->start = new_start;
498 regs->end = new_end;
832b75ed
GG
499 regs->num_regs = need_regs;
500 }
501 }
502 else
503 {
504 assert (regs_allocated == REGS_FIXED);
505 /* This function may not be called with REGS_FIXED and nregs too big. */
506 assert (regs->num_regs >= nregs);
507 rval = REGS_FIXED;
508 }
509
510 /* Copy the regs. */
511 for (i = 0; i < nregs; ++i)
512 {
513 regs->start[i] = pmatch[i].rm_so;
514 regs->end[i] = pmatch[i].rm_eo;
515 }
516 for ( ; i < regs->num_regs; ++i)
517 regs->start[i] = regs->end[i] = -1;
518
519 return rval;
520}
521
522/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
523 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
524 this memory for recording register information. STARTS and ENDS
525 must be allocated using the malloc library routine, and must each
526 be at least NUM_REGS * sizeof (regoff_t) bytes long.
527
528 If NUM_REGS == 0, then subsequent matches should allocate their own
529 register data.
530
531 Unless this function is called, the first search or match using
532 PATTERN_BUFFER will allocate its own register data, without
533 freeing the old data. */
534
535void
ff28b140
TL
536re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
537 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
832b75ed
GG
538{
539 if (num_regs)
540 {
541 bufp->regs_allocated = REGS_REALLOCATE;
542 regs->num_regs = num_regs;
543 regs->start = starts;
544 regs->end = ends;
545 }
546 else
547 {
548 bufp->regs_allocated = REGS_UNALLOCATED;
549 regs->num_regs = 0;
ff28b140 550 regs->start = regs->end = NULL;
832b75ed
GG
551 }
552}
553#ifdef _LIBC
554weak_alias (__re_set_registers, re_set_registers)
555#endif
556\f
557/* Entry points compatible with 4.2 BSD regex library. We don't define
558 them unless specifically requested. */
559
560#if defined _REGEX_RE_COMP || defined _LIBC
561int
562# ifdef _LIBC
563weak_function
564# endif
ff28b140 565re_exec (const char *s)
832b75ed
GG
566{
567 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
568}
569#endif /* _REGEX_RE_COMP */
570\f
832b75ed
GG
571/* Internal entry point. */
572
573/* Searches for a compiled pattern PREG in the string STRING, whose
574 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
ff28b140
TL
575 meaning as with regexec. LAST_START is START + RANGE, where
576 START and RANGE have the same meaning as with re_search.
832b75ed
GG
577 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
578 otherwise return the error code.
579 Note: We assume front end functions already check ranges.
ff28b140 580 (0 <= LAST_START && LAST_START <= LENGTH) */
832b75ed
GG
581
582static reg_errcode_t
ff28b140
TL
583__attribute_warn_unused_result__
584re_search_internal (const regex_t *preg, const char *string, Idx length,
585 Idx start, Idx last_start, Idx stop, size_t nmatch,
586 regmatch_t pmatch[], int eflags)
832b75ed
GG
587{
588 reg_errcode_t err;
ff28b140
TL
589 const re_dfa_t *dfa = preg->buffer;
590 Idx left_lim, right_lim;
591 int incr;
592 bool fl_longest_match;
593 int match_kind;
594 Idx match_first;
595 Idx match_last = -1;
596 Idx extra_nmatch;
597 bool sb;
598 int ch;
599#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
600 re_match_context_t mctx = { .dfa = dfa };
601#else
832b75ed 602 re_match_context_t mctx;
ff28b140 603#endif
832b75ed 604 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
ff28b140
TL
605 && start != last_start && !preg->can_be_null)
606 ? preg->fastmap : NULL);
607 RE_TRANSLATE_TYPE t = preg->translate;
608
609#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
610 memset (&mctx, '\0', sizeof (re_match_context_t));
611 mctx.dfa = dfa;
612#endif
613
614 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
615 nmatch -= extra_nmatch;
832b75ed
GG
616
617 /* Check if the DFA haven't been compiled. */
618 if (BE (preg->used == 0 || dfa->init_state == NULL
619 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
620 || dfa->init_state_begbuf == NULL, 0))
621 return REG_NOMATCH;
622
ff28b140
TL
623#ifdef DEBUG
624 /* We assume front-end functions already check them. */
625 assert (0 <= last_start && last_start <= length);
626#endif
627
628 /* If initial states with non-begbuf contexts have no elements,
629 the regex must be anchored. If preg->newline_anchor is set,
630 we'll never use init_state_nl, so do not check it. */
631 if (dfa->init_state->nodes.nelem == 0
632 && dfa->init_state_word->nodes.nelem == 0
633 && (dfa->init_state_nl->nodes.nelem == 0
634 || !preg->newline_anchor))
635 {
636 if (start != 0 && last_start != 0)
637 return REG_NOMATCH;
638 start = last_start = 0;
639 }
832b75ed
GG
640
641 /* We must check the longest matching, if nmatch > 0. */
642 fl_longest_match = (nmatch != 0 || dfa->nbackref);
643
ff28b140
TL
644 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
645 preg->translate, (preg->syntax & RE_ICASE) != 0,
646 dfa);
832b75ed
GG
647 if (BE (err != REG_NOERROR, 0))
648 goto free_return;
ff28b140
TL
649 mctx.input.stop = stop;
650 mctx.input.raw_stop = stop;
651 mctx.input.newline_anchor = preg->newline_anchor;
832b75ed 652
ff28b140 653 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
832b75ed
GG
654 if (BE (err != REG_NOERROR, 0))
655 goto free_return;
656
657 /* We will log all the DFA states through which the dfa pass,
658 if nmatch > 1, or this dfa has "multibyte node", which is a
659 back-reference or a node which can accept multibyte character or
660 multi character collating element. */
661 if (nmatch > 1 || dfa->has_mb_node)
662 {
ff28b140
TL
663 /* Avoid overflow. */
664 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
665 <= mctx.input.bufs_len), 0))
666 {
667 err = REG_ESPACE;
668 goto free_return;
669 }
670
671 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
832b75ed
GG
672 if (BE (mctx.state_log == NULL, 0))
673 {
674 err = REG_ESPACE;
675 goto free_return;
676 }
677 }
678 else
679 mctx.state_log = NULL;
680
832b75ed 681 match_first = start;
ff28b140
TL
682 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
683 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684
685 /* Check incrementally whether the input string matches. */
686 incr = (last_start < start) ? -1 : 1;
687 left_lim = (last_start < start) ? last_start : start;
688 right_lim = (last_start < start) ? start : last_start;
689 sb = dfa->mb_cur_max == 1;
690 match_kind =
691 (fastmap
692 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
693 | (start <= last_start ? 2 : 0)
694 | (t != NULL ? 1 : 0))
695 : 8);
696
697 for (;; match_first += incr)
832b75ed 698 {
ff28b140
TL
699 err = REG_NOMATCH;
700 if (match_first < left_lim || right_lim < match_first)
701 goto free_return;
702
703 /* Advance as rapidly as possible through the string, until we
704 find a plausible place to start matching. This may be done
705 with varying efficiency, so there are various possibilities:
706 only the most common of them are specialized, in order to
707 save on code size. We use a switch statement for speed. */
708 switch (match_kind)
832b75ed 709 {
ff28b140
TL
710 case 8:
711 /* No fastmap. */
712 break;
713
714 case 7:
715 /* Fastmap with single-byte translation, match forward. */
716 while (BE (match_first < right_lim, 1)
717 && !fastmap[t[(unsigned char) string[match_first]]])
718 ++match_first;
719 goto forward_match_found_start_or_reached_end;
720
721 case 6:
722 /* Fastmap without translation, match forward. */
723 while (BE (match_first < right_lim, 1)
724 && !fastmap[(unsigned char) string[match_first]])
725 ++match_first;
726
727 forward_match_found_start_or_reached_end:
728 if (BE (match_first == right_lim, 0))
832b75ed 729 {
ff28b140
TL
730 ch = match_first >= length
731 ? 0 : (unsigned char) string[match_first];
732 if (!fastmap[t ? t[ch] : ch])
733 goto free_return;
832b75ed 734 }
ff28b140
TL
735 break;
736
737 case 4:
738 case 5:
739 /* Fastmap without multi-byte translation, match backwards. */
740 while (match_first >= left_lim)
832b75ed 741 {
ff28b140
TL
742 ch = match_first >= length
743 ? 0 : (unsigned char) string[match_first];
744 if (fastmap[t ? t[ch] : ch])
745 break;
746 --match_first;
747 }
748 if (match_first < left_lim)
749 goto free_return;
750 break;
832b75ed 751
ff28b140
TL
752 default:
753 /* In this case, we can't determine easily the current byte,
754 since it might be a component byte of a multibyte
755 character. Then we use the constructed buffer instead. */
756 for (;;)
757 {
758 /* If MATCH_FIRST is out of the valid range, reconstruct the
759 buffers. */
760 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
761 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
832b75ed 762 {
ff28b140
TL
763 err = re_string_reconstruct (&mctx.input, match_first,
764 eflags);
765 if (BE (err != REG_NOERROR, 0))
766 goto free_return;
767
768 offset = match_first - mctx.input.raw_mbs_idx;
832b75ed 769 }
ff28b140
TL
770 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
771 Note that MATCH_FIRST must not be smaller than 0. */
772 ch = (match_first >= length
773 ? 0 : re_string_byte_at (&mctx.input, offset));
774 if (fastmap[ch])
832b75ed 775 break;
ff28b140
TL
776 match_first += incr;
777 if (match_first < left_lim || match_first > right_lim)
778 {
779 err = REG_NOMATCH;
780 goto free_return;
781 }
832b75ed 782 }
ff28b140 783 break;
832b75ed
GG
784 }
785
786 /* Reconstruct the buffers so that the matcher can assume that
ff28b140
TL
787 the matching starts from the beginning of the buffer. */
788 err = re_string_reconstruct (&mctx.input, match_first, eflags);
832b75ed
GG
789 if (BE (err != REG_NOERROR, 0))
790 goto free_return;
ff28b140 791
832b75ed 792#ifdef RE_ENABLE_I18N
ff28b140
TL
793 /* Don't consider this char as a possible match start if it part,
794 yet isn't the head, of a multibyte character. */
795 if (!sb && !re_string_first_byte (&mctx.input, 0))
796 continue;
832b75ed 797#endif
ff28b140
TL
798
799 /* It seems to be appropriate one, then use the matcher. */
800 /* We assume that the matching starts from 0. */
801 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
802 match_last = check_matching (&mctx, fl_longest_match,
803 start <= last_start ? &match_first : NULL);
804 if (match_last != -1)
832b75ed 805 {
ff28b140
TL
806 if (BE (match_last == -2, 0))
807 {
808 err = REG_ESPACE;
809 goto free_return;
810 }
811 else
832b75ed 812 {
ff28b140
TL
813 mctx.match_last = match_last;
814 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
832b75ed 815 {
ff28b140
TL
816 re_dfastate_t *pstate = mctx.state_log[match_last];
817 mctx.last_node = check_halt_state_context (&mctx, pstate,
818 match_last);
832b75ed 819 }
ff28b140
TL
820 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
821 || dfa->nbackref)
832b75ed 822 {
ff28b140
TL
823 err = prune_impossible_nodes (&mctx);
824 if (err == REG_NOERROR)
825 break;
826 if (BE (err != REG_NOMATCH, 0))
827 goto free_return;
828 match_last = -1;
832b75ed 829 }
ff28b140
TL
830 else
831 break; /* We found a match. */
832b75ed 832 }
832b75ed 833 }
ff28b140
TL
834
835 match_ctx_clean (&mctx);
832b75ed
GG
836 }
837
ff28b140
TL
838#ifdef DEBUG
839 assert (match_last != -1);
840 assert (err == REG_NOERROR);
841#endif
842
832b75ed 843 /* Set pmatch[] if we need. */
ff28b140 844 if (nmatch > 0)
832b75ed 845 {
ff28b140 846 Idx reg_idx;
832b75ed
GG
847
848 /* Initialize registers. */
ff28b140 849 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
832b75ed
GG
850 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851
852 /* Set the points where matching start/end. */
853 pmatch[0].rm_so = 0;
854 pmatch[0].rm_eo = mctx.match_last;
ff28b140
TL
855 /* FIXME: This function should fail if mctx.match_last exceeds
856 the maximum possible regoff_t value. We need a new error
857 code REG_OVERFLOW. */
832b75ed
GG
858
859 if (!preg->no_sub && nmatch > 1)
860 {
861 err = set_regs (preg, &mctx, nmatch, pmatch,
862 dfa->has_plural_match && dfa->nbackref > 0);
863 if (BE (err != REG_NOERROR, 0))
864 goto free_return;
865 }
866
ff28b140
TL
867 /* At last, add the offset to each register, since we slid
868 the buffers so that we could assume that the matching starts
869 from 0. */
832b75ed
GG
870 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
871 if (pmatch[reg_idx].rm_so != -1)
872 {
ff28b140
TL
873#ifdef RE_ENABLE_I18N
874 if (BE (mctx.input.offsets_needed != 0, 0))
875 {
876 pmatch[reg_idx].rm_so =
877 (pmatch[reg_idx].rm_so == mctx.input.valid_len
878 ? mctx.input.valid_raw_len
879 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
880 pmatch[reg_idx].rm_eo =
881 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
882 ? mctx.input.valid_raw_len
883 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
884 }
885#else
886 assert (mctx.input.offsets_needed == 0);
887#endif
832b75ed
GG
888 pmatch[reg_idx].rm_so += match_first;
889 pmatch[reg_idx].rm_eo += match_first;
890 }
ff28b140
TL
891 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
892 {
893 pmatch[nmatch + reg_idx].rm_so = -1;
894 pmatch[nmatch + reg_idx].rm_eo = -1;
895 }
896
897 if (dfa->subexp_map)
898 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
899 if (dfa->subexp_map[reg_idx] != reg_idx)
900 {
901 pmatch[reg_idx + 1].rm_so
902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
903 pmatch[reg_idx + 1].rm_eo
904 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 }
832b75ed 906 }
ff28b140 907
832b75ed
GG
908 free_return:
909 re_free (mctx.state_log);
910 if (dfa->nbackref)
911 match_ctx_free (&mctx);
ff28b140 912 re_string_destruct (&mctx.input);
832b75ed
GG
913 return err;
914}
915
916static reg_errcode_t
ff28b140
TL
917__attribute_warn_unused_result__
918prune_impossible_nodes (re_match_context_t *mctx)
832b75ed 919{
ff28b140
TL
920 const re_dfa_t *const dfa = mctx->dfa;
921 Idx halt_node, match_last;
832b75ed 922 reg_errcode_t ret;
832b75ed
GG
923 re_dfastate_t **sifted_states;
924 re_dfastate_t **lim_states = NULL;
925 re_sift_context_t sctx;
926#ifdef DEBUG
927 assert (mctx->state_log != NULL);
928#endif
929 match_last = mctx->match_last;
930 halt_node = mctx->last_node;
ff28b140
TL
931
932 /* Avoid overflow. */
933 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
934 return REG_ESPACE;
935
832b75ed
GG
936 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
937 if (BE (sifted_states == NULL, 0))
938 {
939 ret = REG_ESPACE;
940 goto free_return;
941 }
942 if (dfa->nbackref)
943 {
944 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
945 if (BE (lim_states == NULL, 0))
946 {
947 ret = REG_ESPACE;
948 goto free_return;
949 }
950 while (1)
951 {
952 memset (lim_states, '\0',
953 sizeof (re_dfastate_t *) * (match_last + 1));
832b75ed 954 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
ff28b140
TL
955 match_last);
956 ret = sift_states_backward (mctx, &sctx);
832b75ed
GG
957 re_node_set_free (&sctx.limits);
958 if (BE (ret != REG_NOERROR, 0))
959 goto free_return;
960 if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 break;
962 do
963 {
964 --match_last;
965 if (match_last < 0)
966 {
967 ret = REG_NOMATCH;
968 goto free_return;
969 }
ff28b140
TL
970 } while (mctx->state_log[match_last] == NULL
971 || !mctx->state_log[match_last]->halt);
972 halt_node = check_halt_state_context (mctx,
832b75ed 973 mctx->state_log[match_last],
ff28b140 974 match_last);
832b75ed
GG
975 }
976 ret = merge_state_array (dfa, sifted_states, lim_states,
977 match_last + 1);
978 re_free (lim_states);
979 lim_states = NULL;
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 }
983 else
984 {
ff28b140
TL
985 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
986 ret = sift_states_backward (mctx, &sctx);
832b75ed
GG
987 re_node_set_free (&sctx.limits);
988 if (BE (ret != REG_NOERROR, 0))
989 goto free_return;
ff28b140
TL
990 if (sifted_states[0] == NULL)
991 {
992 ret = REG_NOMATCH;
993 goto free_return;
994 }
832b75ed
GG
995 }
996 re_free (mctx->state_log);
997 mctx->state_log = sifted_states;
998 sifted_states = NULL;
999 mctx->last_node = halt_node;
1000 mctx->match_last = match_last;
1001 ret = REG_NOERROR;
1002 free_return:
1003 re_free (sifted_states);
1004 re_free (lim_states);
1005 return ret;
1006}
1007
1008/* Acquire an initial state and return it.
1009 We must select appropriate initial state depending on the context,
1010 since initial states may have constraints like "\<", "^", etc.. */
1011
1012static inline re_dfastate_t *
ff28b140
TL
1013__attribute__ ((always_inline))
1014acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015 Idx idx)
832b75ed 1016{
ff28b140 1017 const re_dfa_t *const dfa = mctx->dfa;
832b75ed
GG
1018 if (dfa->init_state->has_constraint)
1019 {
1020 unsigned int context;
ff28b140 1021 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
832b75ed
GG
1022 if (IS_WORD_CONTEXT (context))
1023 return dfa->init_state_word;
1024 else if (IS_ORDINARY_CONTEXT (context))
1025 return dfa->init_state;
1026 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027 return dfa->init_state_begbuf;
1028 else if (IS_NEWLINE_CONTEXT (context))
1029 return dfa->init_state_nl;
1030 else if (IS_BEGBUF_CONTEXT (context))
1031 {
1032 /* It is relatively rare case, then calculate on demand. */
ff28b140
TL
1033 return re_acquire_state_context (err, dfa,
1034 dfa->init_state->entrance_nodes,
1035 context);
832b75ed
GG
1036 }
1037 else
1038 /* Must not happen? */
1039 return dfa->init_state;
1040 }
1041 else
1042 return dfa->init_state;
1043}
1044
1045/* Check whether the regular expression match input string INPUT or not,
ff28b140
TL
1046 and return the index where the matching end. Return -1 if
1047 there is no match, and return -2 in case of an error.
832b75ed 1048 FL_LONGEST_MATCH means we want the POSIX longest matching.
ff28b140
TL
1049 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050 next place where we may want to try matching.
1051 Note that the matcher assumes that the matching starts from the current
832b75ed
GG
1052 index of the buffer. */
1053
ff28b140
TL
1054static Idx
1055__attribute_warn_unused_result__
1056check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057 Idx *p_match_first)
832b75ed 1058{
ff28b140 1059 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 1060 reg_errcode_t err;
ff28b140
TL
1061 Idx match = 0;
1062 Idx match_last = -1;
1063 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
832b75ed 1064 re_dfastate_t *cur_state;
ff28b140
TL
1065 bool at_init_state = p_match_first != NULL;
1066 Idx next_start_idx = cur_str_idx;
832b75ed 1067
ff28b140
TL
1068 err = REG_NOERROR;
1069 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070 /* An initial state must not be NULL (invalid). */
832b75ed 1071 if (BE (cur_state == NULL, 0))
832b75ed 1072 {
ff28b140
TL
1073 assert (err == REG_ESPACE);
1074 return -2;
832b75ed
GG
1075 }
1076
ff28b140 1077 if (mctx->state_log != NULL)
832b75ed 1078 {
ff28b140
TL
1079 mctx->state_log[cur_str_idx] = cur_state;
1080
1081 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082 later. E.g. Processing back references. */
1083 if (BE (dfa->nbackref, 0))
1084 {
1085 at_init_state = false;
1086 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087 if (BE (err != REG_NOERROR, 0))
1088 return err;
1089
1090 if (cur_state->has_backref)
1091 {
1092 err = transit_state_bkref (mctx, &cur_state->nodes);
1093 if (BE (err != REG_NOERROR, 0))
1094 return err;
1095 }
1096 }
832b75ed
GG
1097 }
1098
1099 /* If the RE accepts NULL string. */
ff28b140 1100 if (BE (cur_state->halt, 0))
832b75ed
GG
1101 {
1102 if (!cur_state->has_constraint
ff28b140 1103 || check_halt_state_context (mctx, cur_state, cur_str_idx))
832b75ed
GG
1104 {
1105 if (!fl_longest_match)
1106 return cur_str_idx;
1107 else
1108 {
1109 match_last = cur_str_idx;
1110 match = 1;
1111 }
1112 }
1113 }
1114
ff28b140 1115 while (!re_string_eoi (&mctx->input))
832b75ed 1116 {
ff28b140
TL
1117 re_dfastate_t *old_state = cur_state;
1118 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119
1120 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121 && mctx->input.bufs_len < mctx->input.len)
1122 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123 && mctx->input.valid_len < mctx->input.len))
832b75ed 1124 {
ff28b140 1125 err = extend_buffers (mctx, next_char_idx + 1);
832b75ed 1126 if (BE (err != REG_NOERROR, 0))
832b75ed 1127 {
ff28b140
TL
1128 assert (err == REG_ESPACE);
1129 return -2;
832b75ed 1130 }
ff28b140
TL
1131 }
1132
1133 cur_state = transit_state (&err, mctx, cur_state);
1134 if (mctx->state_log != NULL)
1135 cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137 if (cur_state == NULL)
1138 {
1139 /* Reached the invalid state or an error. Try to recover a valid
1140 state using the state log, if available and if we have not
1141 already found a valid (even if not the longest) match. */
1142 if (BE (err != REG_NOERROR, 0))
1143 return -2;
1144
1145 if (mctx->state_log == NULL
1146 || (match && !fl_longest_match)
1147 || (cur_state = find_recover_state (&err, mctx)) == NULL)
832b75ed 1148 break;
832b75ed
GG
1149 }
1150
ff28b140 1151 if (BE (at_init_state, 0))
832b75ed 1152 {
ff28b140
TL
1153 if (old_state == cur_state)
1154 next_start_idx = next_char_idx;
1155 else
1156 at_init_state = false;
1157 }
1158
1159 if (cur_state->halt)
1160 {
1161 /* Reached a halt state.
832b75ed
GG
1162 Check the halt state can satisfy the current context. */
1163 if (!cur_state->has_constraint
ff28b140
TL
1164 || check_halt_state_context (mctx, cur_state,
1165 re_string_cur_idx (&mctx->input)))
832b75ed
GG
1166 {
1167 /* We found an appropriate halt state. */
ff28b140 1168 match_last = re_string_cur_idx (&mctx->input);
832b75ed 1169 match = 1;
ff28b140
TL
1170
1171 /* We found a match, do not modify match_first below. */
1172 p_match_first = NULL;
832b75ed
GG
1173 if (!fl_longest_match)
1174 break;
1175 }
1176 }
ff28b140
TL
1177 }
1178
1179 if (p_match_first)
1180 *p_match_first += next_start_idx;
1181
832b75ed
GG
1182 return match_last;
1183}
1184
1185/* Check NODE match the current context. */
1186
ff28b140
TL
1187static bool
1188check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
832b75ed
GG
1189{
1190 re_token_type_t type = dfa->nodes[node].type;
1191 unsigned int constraint = dfa->nodes[node].constraint;
1192 if (type != END_OF_RE)
ff28b140 1193 return false;
832b75ed 1194 if (!constraint)
ff28b140 1195 return true;
832b75ed 1196 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
ff28b140
TL
1197 return false;
1198 return true;
832b75ed
GG
1199}
1200
1201/* Check the halt state STATE match the current context.
1202 Return 0 if not match, if the node, STATE has, is a halt node and
1203 match the context, return the node. */
1204
ff28b140
TL
1205static Idx
1206check_halt_state_context (const re_match_context_t *mctx,
1207 const re_dfastate_t *state, Idx idx)
1208{
1209 Idx i;
832b75ed
GG
1210 unsigned int context;
1211#ifdef DEBUG
1212 assert (state->halt);
1213#endif
ff28b140 1214 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
832b75ed 1215 for (i = 0; i < state->nodes.nelem; ++i)
ff28b140 1216 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
832b75ed
GG
1217 return state->nodes.elems[i];
1218 return 0;
1219}
1220
1221/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222 corresponding to the DFA).
ff28b140
TL
1223 Return the destination node, and update EPS_VIA_NODES;
1224 return -1 in case of errors. */
832b75ed 1225
ff28b140
TL
1226static Idx
1227proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229 struct re_fail_stack_t *fs)
1230{
1231 const re_dfa_t *const dfa = mctx->dfa;
1232 Idx i;
1233 bool ok;
832b75ed
GG
1234 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235 {
1236 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
ff28b140
TL
1237 re_node_set *edests = &dfa->edests[node];
1238 Idx dest_node;
1239 ok = re_node_set_insert (eps_via_nodes, node);
1240 if (BE (! ok, 0))
1241 return -2;
1242 /* Pick up a valid destination, or return -1 if none
1243 is found. */
1244 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245 {
1246 Idx candidate = edests->elems[i];
832b75ed
GG
1247 if (!re_node_set_contains (cur_nodes, candidate))
1248 continue;
ff28b140
TL
1249 if (dest_node == -1)
1250 dest_node = candidate;
1251
1252 else
1253 {
1254 /* In order to avoid infinite loop like "(a*)*", return the second
1255 epsilon-transition if the first was already considered. */
1256 if (re_node_set_contains (eps_via_nodes, dest_node))
1257 return candidate;
1258
1259 /* Otherwise, push the second epsilon-transition on the fail stack. */
1260 else if (fs != NULL
1261 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 eps_via_nodes))
1263 return -2;
1264
1265 /* We know we are going to exit. */
1266 break;
1267 }
1268 }
1269 return dest_node;
832b75ed
GG
1270 }
1271 else
1272 {
ff28b140 1273 Idx naccepted = 0;
832b75ed
GG
1274 re_token_type_t type = dfa->nodes[node].type;
1275
1276#ifdef RE_ENABLE_I18N
ff28b140
TL
1277 if (dfa->nodes[node].accept_mb)
1278 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
832b75ed
GG
1279 else
1280#endif /* RE_ENABLE_I18N */
1281 if (type == OP_BACK_REF)
1282 {
ff28b140 1283 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
832b75ed
GG
1284 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 if (fs != NULL)
1286 {
1287 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 return -1;
1289 else if (naccepted)
1290 {
ff28b140 1291 char *buf = (char *) re_string_get_buffer (&mctx->input);
832b75ed
GG
1292 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293 naccepted) != 0)
1294 return -1;
1295 }
1296 }
1297
1298 if (naccepted == 0)
1299 {
ff28b140
TL
1300 Idx dest_node;
1301 ok = re_node_set_insert (eps_via_nodes, node);
1302 if (BE (! ok, 0))
832b75ed
GG
1303 return -2;
1304 dest_node = dfa->edests[node].elems[0];
1305 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306 dest_node))
1307 return dest_node;
1308 }
1309 }
1310
1311 if (naccepted != 0
ff28b140 1312 || check_node_accept (mctx, dfa->nodes + node, *pidx))
832b75ed 1313 {
ff28b140 1314 Idx dest_node = dfa->nexts[node];
832b75ed
GG
1315 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1316 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1317 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 dest_node)))
1319 return -1;
1320 re_node_set_empty (eps_via_nodes);
1321 return dest_node;
1322 }
1323 }
1324 return -1;
1325}
1326
1327static reg_errcode_t
ff28b140
TL
1328__attribute_warn_unused_result__
1329push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1330 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
832b75ed
GG
1331{
1332 reg_errcode_t err;
ff28b140 1333 Idx num = fs->num++;
832b75ed
GG
1334 if (fs->num == fs->alloc)
1335 {
1336 struct re_fail_stack_ent_t *new_array;
ff28b140
TL
1337 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1338 fs->alloc * 2);
832b75ed
GG
1339 if (new_array == NULL)
1340 return REG_ESPACE;
ff28b140 1341 fs->alloc *= 2;
832b75ed
GG
1342 fs->stack = new_array;
1343 }
1344 fs->stack[num].idx = str_idx;
ff28b140 1345 fs->stack[num].node = dest_node;
832b75ed 1346 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
ff28b140
TL
1347 if (fs->stack[num].regs == NULL)
1348 return REG_ESPACE;
832b75ed
GG
1349 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1350 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1351 return err;
1352}
1353
ff28b140
TL
1354static Idx
1355pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1356 regmatch_t *regs, re_node_set *eps_via_nodes)
832b75ed 1357{
ff28b140 1358 Idx num = --fs->num;
832b75ed 1359 assert (num >= 0);
ff28b140 1360 *pidx = fs->stack[num].idx;
832b75ed
GG
1361 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1362 re_node_set_free (eps_via_nodes);
1363 re_free (fs->stack[num].regs);
1364 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1365 return fs->stack[num].node;
1366}
1367
1368/* Set the positions where the subexpressions are starts/ends to registers
1369 PMATCH.
1370 Note: We assume that pmatch[0] is already set, and
ff28b140 1371 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
832b75ed
GG
1372
1373static reg_errcode_t
ff28b140
TL
1374__attribute_warn_unused_result__
1375set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1376 regmatch_t *pmatch, bool fl_backtrack)
1377{
1378 const re_dfa_t *dfa = preg->buffer;
1379 Idx idx, cur_node;
832b75ed
GG
1380 re_node_set eps_via_nodes;
1381 struct re_fail_stack_t *fs;
ff28b140
TL
1382 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1383 regmatch_t *prev_idx_match;
1384 bool prev_idx_match_malloced = false;
1385
832b75ed
GG
1386#ifdef DEBUG
1387 assert (nmatch > 1);
1388 assert (mctx->state_log != NULL);
1389#endif
1390 if (fl_backtrack)
1391 {
1392 fs = &fs_body;
1393 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
ff28b140
TL
1394 if (fs->stack == NULL)
1395 return REG_ESPACE;
832b75ed
GG
1396 }
1397 else
1398 fs = NULL;
ff28b140 1399
832b75ed 1400 cur_node = dfa->init_node;
832b75ed 1401 re_node_set_init_empty (&eps_via_nodes);
ff28b140
TL
1402
1403 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1404 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1405 else
1406 {
1407 prev_idx_match = re_malloc (regmatch_t, nmatch);
1408 if (prev_idx_match == NULL)
1409 {
1410 free_fail_stack_return (fs);
1411 return REG_ESPACE;
1412 }
1413 prev_idx_match_malloced = true;
1414 }
1415 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1416
832b75ed
GG
1417 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1418 {
ff28b140
TL
1419 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1420
832b75ed
GG
1421 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1422 {
ff28b140 1423 Idx reg_idx;
832b75ed
GG
1424 if (fs)
1425 {
1426 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428 break;
1429 if (reg_idx == nmatch)
1430 {
1431 re_node_set_free (&eps_via_nodes);
ff28b140
TL
1432 if (prev_idx_match_malloced)
1433 re_free (prev_idx_match);
832b75ed
GG
1434 return free_fail_stack_return (fs);
1435 }
1436 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437 &eps_via_nodes);
1438 }
1439 else
1440 {
1441 re_node_set_free (&eps_via_nodes);
ff28b140
TL
1442 if (prev_idx_match_malloced)
1443 re_free (prev_idx_match);
832b75ed
GG
1444 return REG_NOERROR;
1445 }
1446 }
1447
1448 /* Proceed to next node. */
ff28b140 1449 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
832b75ed
GG
1450 &eps_via_nodes, fs);
1451
1452 if (BE (cur_node < 0, 0))
1453 {
ff28b140
TL
1454 if (BE (cur_node == -2, 0))
1455 {
1456 re_node_set_free (&eps_via_nodes);
1457 if (prev_idx_match_malloced)
1458 re_free (prev_idx_match);
1459 free_fail_stack_return (fs);
1460 return REG_ESPACE;
1461 }
832b75ed
GG
1462 if (fs)
1463 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464 &eps_via_nodes);
1465 else
1466 {
1467 re_node_set_free (&eps_via_nodes);
ff28b140
TL
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
832b75ed
GG
1470 return REG_NOMATCH;
1471 }
1472 }
1473 }
1474 re_node_set_free (&eps_via_nodes);
ff28b140
TL
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
832b75ed
GG
1477 return free_fail_stack_return (fs);
1478}
1479
1480static reg_errcode_t
ff28b140 1481free_fail_stack_return (struct re_fail_stack_t *fs)
832b75ed
GG
1482{
1483 if (fs)
1484 {
ff28b140 1485 Idx fs_idx;
832b75ed
GG
1486 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1487 {
1488 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1489 re_free (fs->stack[fs_idx].regs);
1490 }
1491 re_free (fs->stack);
1492 }
1493 return REG_NOERROR;
1494}
1495
1496static void
ff28b140
TL
1497update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1498 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
832b75ed
GG
1499{
1500 int type = dfa->nodes[cur_node].type;
832b75ed
GG
1501 if (type == OP_OPEN_SUBEXP)
1502 {
ff28b140
TL
1503 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504
832b75ed 1505 /* We are at the first node of this sub expression. */
ff28b140
TL
1506 if (reg_num < nmatch)
1507 {
1508 pmatch[reg_num].rm_so = cur_idx;
1509 pmatch[reg_num].rm_eo = -1;
1510 }
832b75ed
GG
1511 }
1512 else if (type == OP_CLOSE_SUBEXP)
ff28b140
TL
1513 {
1514 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1515 if (reg_num < nmatch)
1516 {
1517 /* We are at the last node of this sub expression. */
1518 if (pmatch[reg_num].rm_so < cur_idx)
1519 {
1520 pmatch[reg_num].rm_eo = cur_idx;
1521 /* This is a non-empty match or we are not inside an optional
1522 subexpression. Accept this right away. */
1523 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524 }
1525 else
1526 {
1527 if (dfa->nodes[cur_node].opt_subexp
1528 && prev_idx_match[reg_num].rm_so != -1)
1529 /* We transited through an empty match for an optional
1530 subexpression, like (a?)*, and this is not the subexp's
1531 first match. Copy back the old content of the registers
1532 so that matches of an inner subexpression are undone as
1533 well, like in ((a?))*. */
1534 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1535 else
1536 /* We completed a subexpression, but it may be part of
1537 an optional one, so do not update PREV_IDX_MATCH. */
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 }
1540 }
1541 }
832b75ed
GG
1542}
1543
832b75ed
GG
1544/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1545 and sift the nodes in each states according to the following rules.
1546 Updated state_log will be wrote to STATE_LOG.
1547
ff28b140 1548 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
832b75ed 1549 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
ff28b140
TL
1550 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1551 the LAST_NODE, we throw away the node 'a'.
1552 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1553 string 's' and transit to 'b':
832b75ed 1554 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
ff28b140 1555 away the node 'a'.
832b75ed 1556 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
ff28b140
TL
1557 thrown away, we throw away the node 'a'.
1558 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
832b75ed 1559 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
ff28b140
TL
1560 node 'a'.
1561 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1562 we throw away the node 'a'. */
832b75ed
GG
1563
1564#define STATE_NODE_CONTAINS(state,node) \
1565 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1566
1567static reg_errcode_t
ff28b140 1568sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
832b75ed
GG
1569{
1570 reg_errcode_t err;
832b75ed 1571 int null_cnt = 0;
ff28b140 1572 Idx str_idx = sctx->last_str_idx;
832b75ed 1573 re_node_set cur_dest;
832b75ed
GG
1574
1575#ifdef DEBUG
1576 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577#endif
832b75ed
GG
1578
1579 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1580 transit to the last_node and the last_node itself. */
1581 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582 if (BE (err != REG_NOERROR, 0))
1583 return err;
ff28b140 1584 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
832b75ed
GG
1585 if (BE (err != REG_NOERROR, 0))
1586 goto free_return;
1587
1588 /* Then check each states in the state_log. */
1589 while (str_idx > 0)
1590 {
832b75ed
GG
1591 /* Update counters. */
1592 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593 if (null_cnt > mctx->max_mb_elem_len)
1594 {
1595 memset (sctx->sifted_states, '\0',
1596 sizeof (re_dfastate_t *) * str_idx);
1597 re_node_set_free (&cur_dest);
1598 return REG_NOERROR;
1599 }
1600 re_node_set_empty (&cur_dest);
1601 --str_idx;
832b75ed 1602
ff28b140
TL
1603 if (mctx->state_log[str_idx])
1604 {
1605 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
832b75ed
GG
1608 }
1609
1610 /* Add all the nodes which satisfy the following conditions:
1611 - It can epsilon transit to a node in CUR_DEST.
1612 - It is in CUR_SRC.
1613 And update state_log. */
ff28b140 1614 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
832b75ed
GG
1615 if (BE (err != REG_NOERROR, 0))
1616 goto free_return;
1617 }
1618 err = REG_NOERROR;
1619 free_return:
1620 re_node_set_free (&cur_dest);
1621 return err;
1622}
1623
ff28b140
TL
1624static reg_errcode_t
1625__attribute_warn_unused_result__
1626build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1627 Idx str_idx, re_node_set *cur_dest)
1628{
1629 const re_dfa_t *const dfa = mctx->dfa;
1630 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631 Idx i;
1632
1633 /* Then build the next sifted state.
1634 We build the next sifted state on 'cur_dest', and update
1635 'sifted_states[str_idx]' with 'cur_dest'.
1636 Note:
1637 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1638 'cur_src' points the node_set of the old 'state_log[str_idx]'
1639 (with the epsilon nodes pre-filtered out). */
1640 for (i = 0; i < cur_src->nelem; i++)
1641 {
1642 Idx prev_node = cur_src->elems[i];
1643 int naccepted = 0;
1644 bool ok;
1645
1646#ifdef DEBUG
1647 re_token_type_t type = dfa->nodes[prev_node].type;
1648 assert (!IS_EPSILON_NODE (type));
1649#endif
1650#ifdef RE_ENABLE_I18N
1651 /* If the node may accept "multi byte". */
1652 if (dfa->nodes[prev_node].accept_mb)
1653 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1654 str_idx, sctx->last_str_idx);
1655#endif /* RE_ENABLE_I18N */
1656
1657 /* We don't check backreferences here.
1658 See update_cur_sifted_state(). */
1659 if (!naccepted
1660 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1661 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1662 dfa->nexts[prev_node]))
1663 naccepted = 1;
1664
1665 if (naccepted == 0)
1666 continue;
1667
1668 if (sctx->limits.nelem)
1669 {
1670 Idx to_idx = str_idx + naccepted;
1671 if (check_dst_limits (mctx, &sctx->limits,
1672 dfa->nexts[prev_node], to_idx,
1673 prev_node, str_idx))
1674 continue;
1675 }
1676 ok = re_node_set_insert (cur_dest, prev_node);
1677 if (BE (! ok, 0))
1678 return REG_ESPACE;
1679 }
1680
1681 return REG_NOERROR;
1682}
1683
832b75ed
GG
1684/* Helper functions. */
1685
ff28b140
TL
1686static reg_errcode_t
1687clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
832b75ed 1688{
ff28b140 1689 Idx top = mctx->state_log_top;
832b75ed 1690
ff28b140
TL
1691 if ((next_state_log_idx >= mctx->input.bufs_len
1692 && mctx->input.bufs_len < mctx->input.len)
1693 || (next_state_log_idx >= mctx->input.valid_len
1694 && mctx->input.valid_len < mctx->input.len))
832b75ed
GG
1695 {
1696 reg_errcode_t err;
ff28b140 1697 err = extend_buffers (mctx, next_state_log_idx + 1);
832b75ed
GG
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700 }
1701
1702 if (top < next_state_log_idx)
1703 {
1704 memset (mctx->state_log + top + 1, '\0',
1705 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706 mctx->state_log_top = next_state_log_idx;
1707 }
1708 return REG_NOERROR;
1709}
1710
1711static reg_errcode_t
ff28b140
TL
1712merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1713 re_dfastate_t **src, Idx num)
832b75ed 1714{
ff28b140 1715 Idx st_idx;
832b75ed
GG
1716 reg_errcode_t err;
1717 for (st_idx = 0; st_idx < num; ++st_idx)
1718 {
1719 if (dst[st_idx] == NULL)
1720 dst[st_idx] = src[st_idx];
1721 else if (src[st_idx] != NULL)
1722 {
1723 re_node_set merged_set;
1724 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1725 &src[st_idx]->nodes);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1729 re_node_set_free (&merged_set);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733 }
1734 return REG_NOERROR;
1735}
1736
1737static reg_errcode_t
ff28b140
TL
1738update_cur_sifted_state (const re_match_context_t *mctx,
1739 re_sift_context_t *sctx, Idx str_idx,
1740 re_node_set *dest_nodes)
832b75ed 1741{
ff28b140
TL
1742 const re_dfa_t *const dfa = mctx->dfa;
1743 reg_errcode_t err = REG_NOERROR;
832b75ed 1744 const re_node_set *candidates;
ff28b140 1745 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
832b75ed
GG
1746 : &mctx->state_log[str_idx]->nodes);
1747
ff28b140
TL
1748 if (dest_nodes->nelem == 0)
1749 sctx->sifted_states[str_idx] = NULL;
1750 else
832b75ed 1751 {
ff28b140
TL
1752 if (candidates)
1753 {
1754 /* At first, add the nodes which can epsilon transit to a node in
1755 DEST_NODE. */
1756 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1757 if (BE (err != REG_NOERROR, 0))
1758 return err;
832b75ed 1759
ff28b140
TL
1760 /* Then, check the limitations in the current sift_context. */
1761 if (sctx->limits.nelem)
1762 {
1763 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1764 mctx->bkref_ents, str_idx);
1765 if (BE (err != REG_NOERROR, 0))
1766 return err;
1767 }
1768 }
1769
1770 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
832b75ed
GG
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1773 }
1774
ff28b140 1775 if (candidates && mctx->state_log[str_idx]->has_backref)
832b75ed 1776 {
ff28b140 1777 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
832b75ed
GG
1778 if (BE (err != REG_NOERROR, 0))
1779 return err;
1780 }
1781 return REG_NOERROR;
1782}
1783
1784static reg_errcode_t
ff28b140
TL
1785__attribute_warn_unused_result__
1786add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1787 const re_node_set *candidates)
832b75ed 1788{
ff28b140
TL
1789 reg_errcode_t err = REG_NOERROR;
1790 Idx i;
832b75ed 1791
ff28b140 1792 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
832b75ed
GG
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
ff28b140
TL
1795
1796 if (!state->inveclosure.alloc)
832b75ed 1797 {
ff28b140 1798 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
832b75ed 1799 if (BE (err != REG_NOERROR, 0))
ff28b140
TL
1800 return REG_ESPACE;
1801 for (i = 0; i < dest_nodes->nelem; i++)
832b75ed 1802 {
ff28b140
TL
1803 err = re_node_set_merge (&state->inveclosure,
1804 dfa->inveclosures + dest_nodes->elems[i]);
1805 if (BE (err != REG_NOERROR, 0))
1806 return REG_ESPACE;
832b75ed
GG
1807 }
1808 }
ff28b140
TL
1809 return re_node_set_add_intersect (dest_nodes, candidates,
1810 &state->inveclosure);
832b75ed
GG
1811}
1812
1813static reg_errcode_t
ff28b140
TL
1814sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1815 const re_node_set *candidates)
832b75ed 1816{
ff28b140 1817 Idx ecl_idx;
832b75ed
GG
1818 reg_errcode_t err;
1819 re_node_set *inv_eclosure = dfa->inveclosures + node;
1820 re_node_set except_nodes;
1821 re_node_set_init_empty (&except_nodes);
1822 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823 {
ff28b140 1824 Idx cur_node = inv_eclosure->elems[ecl_idx];
832b75ed
GG
1825 if (cur_node == node)
1826 continue;
1827 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1828 {
ff28b140
TL
1829 Idx edst1 = dfa->edests[cur_node].elems[0];
1830 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
832b75ed
GG
1831 ? dfa->edests[cur_node].elems[1] : -1);
1832 if ((!re_node_set_contains (inv_eclosure, edst1)
1833 && re_node_set_contains (dest_nodes, edst1))
1834 || (edst2 > 0
1835 && !re_node_set_contains (inv_eclosure, edst2)
1836 && re_node_set_contains (dest_nodes, edst2)))
1837 {
1838 err = re_node_set_add_intersect (&except_nodes, candidates,
1839 dfa->inveclosures + cur_node);
1840 if (BE (err != REG_NOERROR, 0))
1841 {
1842 re_node_set_free (&except_nodes);
1843 return err;
1844 }
1845 }
1846 }
1847 }
1848 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849 {
ff28b140 1850 Idx cur_node = inv_eclosure->elems[ecl_idx];
832b75ed
GG
1851 if (!re_node_set_contains (&except_nodes, cur_node))
1852 {
ff28b140 1853 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
832b75ed
GG
1854 re_node_set_remove_at (dest_nodes, idx);
1855 }
1856 }
1857 re_node_set_free (&except_nodes);
1858 return REG_NOERROR;
1859}
1860
ff28b140
TL
1861static bool
1862check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1863 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
832b75ed 1864{
ff28b140
TL
1865 const re_dfa_t *const dfa = mctx->dfa;
1866 Idx lim_idx, src_pos, dst_pos;
832b75ed 1867
ff28b140
TL
1868 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1869 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
832b75ed
GG
1870 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1871 {
ff28b140 1872 Idx subexp_idx;
832b75ed
GG
1873 struct re_backref_cache_entry *ent;
1874 ent = mctx->bkref_ents + limits->elems[lim_idx];
ff28b140 1875 subexp_idx = dfa->nodes[ent->node].opr.idx;
832b75ed 1876
ff28b140
TL
1877 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878 subexp_idx, dst_node, dst_idx,
1879 dst_bkref_idx);
1880 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 subexp_idx, src_node, src_idx,
1882 src_bkref_idx);
832b75ed
GG
1883
1884 /* In case of:
1885 <src> <dst> ( <subexp> )
1886 ( <subexp> ) <src> <dst>
1887 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1888 if (src_pos == dst_pos)
1889 continue; /* This is unrelated limitation. */
1890 else
ff28b140 1891 return true;
832b75ed 1892 }
ff28b140 1893 return false;
832b75ed
GG
1894}
1895
1896static int
ff28b140
TL
1897check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1898 Idx subexp_idx, Idx from_node, Idx bkref_idx)
832b75ed 1899{
ff28b140
TL
1900 const re_dfa_t *const dfa = mctx->dfa;
1901 const re_node_set *eclosures = dfa->eclosures + from_node;
1902 Idx node_idx;
1903
1904 /* Else, we are on the boundary: examine the nodes on the epsilon
1905 closure. */
1906 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
832b75ed 1907 {
ff28b140
TL
1908 Idx node = eclosures->elems[node_idx];
1909 switch (dfa->nodes[node].type)
832b75ed 1910 {
ff28b140
TL
1911 case OP_BACK_REF:
1912 if (bkref_idx != -1)
832b75ed 1913 {
ff28b140
TL
1914 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1915 do
832b75ed 1916 {
ff28b140
TL
1917 Idx dst;
1918 int cpos;
1919
1920 if (ent->node != node)
1921 continue;
1922
1923 if (subexp_idx < BITSET_WORD_BITS
1924 && !(ent->eps_reachable_subexps_map
1925 & ((bitset_word_t) 1 << subexp_idx)))
1926 continue;
1927
1928 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1929 OP_CLOSE_SUBEXP cases below. But, if the
1930 destination node is the same node as the source
1931 node, don't recurse because it would cause an
1932 infinite loop: a regex that exhibits this behavior
1933 is ()\1*\1* */
1934 dst = dfa->edests[node].elems[0];
1935 if (dst == from_node)
832b75ed 1936 {
ff28b140
TL
1937 if (boundaries & 1)
1938 return -1;
1939 else /* if (boundaries & 2) */
1940 return 0;
832b75ed 1941 }
ff28b140
TL
1942
1943 cpos =
1944 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1945 dst, bkref_idx);
1946 if (cpos == -1 /* && (boundaries & 1) */)
1947 return -1;
1948 if (cpos == 0 && (boundaries & 2))
1949 return 0;
1950
1951 if (subexp_idx < BITSET_WORD_BITS)
1952 ent->eps_reachable_subexps_map
1953 &= ~((bitset_word_t) 1 << subexp_idx);
832b75ed 1954 }
ff28b140 1955 while (ent++->more);
832b75ed 1956 }
ff28b140
TL
1957 break;
1958
1959 case OP_OPEN_SUBEXP:
1960 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 return -1;
1962 break;
1963
1964 case OP_CLOSE_SUBEXP:
1965 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 return 0;
1967 break;
1968
1969 default:
832b75ed
GG
1970 break;
1971 }
832b75ed 1972 }
ff28b140
TL
1973
1974 return (boundaries & 2) ? 1 : 0;
1975}
1976
1977static int
1978check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1979 Idx subexp_idx, Idx from_node, Idx str_idx,
1980 Idx bkref_idx)
1981{
1982 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983 int boundaries;
1984
1985 /* If we are outside the range of the subexpression, return -1 or 1. */
1986 if (str_idx < lim->subexp_from)
1987 return -1;
1988
1989 if (lim->subexp_to < str_idx)
1990 return 1;
1991
1992 /* If we are within the subexpression, return 0. */
1993 boundaries = (str_idx == lim->subexp_from);
1994 boundaries |= (str_idx == lim->subexp_to) << 1;
1995 if (boundaries == 0)
1996 return 0;
1997
1998 /* Else, examine epsilon closure. */
1999 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 from_node, bkref_idx);
832b75ed
GG
2001}
2002
2003/* Check the limitations of sub expressions LIMITS, and remove the nodes
2004 which are against limitations from DEST_NODES. */
2005
2006static reg_errcode_t
ff28b140
TL
2007check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2008 const re_node_set *candidates, re_node_set *limits,
2009 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
832b75ed
GG
2010{
2011 reg_errcode_t err;
ff28b140 2012 Idx node_idx, lim_idx;
832b75ed
GG
2013
2014 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015 {
ff28b140 2016 Idx subexp_idx;
832b75ed
GG
2017 struct re_backref_cache_entry *ent;
2018 ent = bkref_ents + limits->elems[lim_idx];
2019
2020 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021 continue; /* This is unrelated limitation. */
2022
ff28b140 2023 subexp_idx = dfa->nodes[ent->node].opr.idx;
832b75ed
GG
2024 if (ent->subexp_to == str_idx)
2025 {
ff28b140
TL
2026 Idx ops_node = -1;
2027 Idx cls_node = -1;
832b75ed
GG
2028 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029 {
ff28b140
TL
2030 Idx node = dest_nodes->elems[node_idx];
2031 re_token_type_t type = dfa->nodes[node].type;
832b75ed
GG
2032 if (type == OP_OPEN_SUBEXP
2033 && subexp_idx == dfa->nodes[node].opr.idx)
2034 ops_node = node;
2035 else if (type == OP_CLOSE_SUBEXP
2036 && subexp_idx == dfa->nodes[node].opr.idx)
2037 cls_node = node;
2038 }
2039
2040 /* Check the limitation of the open subexpression. */
2041 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2042 if (ops_node >= 0)
2043 {
ff28b140
TL
2044 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045 candidates);
832b75ed
GG
2046 if (BE (err != REG_NOERROR, 0))
2047 return err;
2048 }
ff28b140 2049
832b75ed 2050 /* Check the limitation of the close subexpression. */
ff28b140
TL
2051 if (cls_node >= 0)
2052 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053 {
2054 Idx node = dest_nodes->elems[node_idx];
2055 if (!re_node_set_contains (dfa->inveclosures + node,
2056 cls_node)
2057 && !re_node_set_contains (dfa->eclosures + node,
2058 cls_node))
2059 {
2060 /* It is against this limitation.
2061 Remove it form the current sifted state. */
2062 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063 candidates);
2064 if (BE (err != REG_NOERROR, 0))
2065 return err;
2066 --node_idx;
2067 }
2068 }
832b75ed
GG
2069 }
2070 else /* (ent->subexp_to != str_idx) */
2071 {
2072 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073 {
ff28b140
TL
2074 Idx node = dest_nodes->elems[node_idx];
2075 re_token_type_t type = dfa->nodes[node].type;
832b75ed
GG
2076 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077 {
2078 if (subexp_idx != dfa->nodes[node].opr.idx)
2079 continue;
ff28b140
TL
2080 /* It is against this limitation.
2081 Remove it form the current sifted state. */
2082 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083 candidates);
2084 if (BE (err != REG_NOERROR, 0))
2085 return err;
832b75ed
GG
2086 }
2087 }
2088 }
2089 }
2090 return REG_NOERROR;
2091}
2092
2093static reg_errcode_t
ff28b140
TL
2094__attribute_warn_unused_result__
2095sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2096 Idx str_idx, const re_node_set *candidates)
832b75ed 2097{
ff28b140 2098 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 2099 reg_errcode_t err;
ff28b140 2100 Idx node_idx, node;
832b75ed 2101 re_sift_context_t local_sctx;
ff28b140
TL
2102 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103
2104 if (first_idx == -1)
2105 return REG_NOERROR;
2106
832b75ed
GG
2107 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2108
2109 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110 {
ff28b140 2111 Idx enabled_idx;
832b75ed 2112 re_token_type_t type;
ff28b140 2113 struct re_backref_cache_entry *entry;
832b75ed
GG
2114 node = candidates->elems[node_idx];
2115 type = dfa->nodes[node].type;
832b75ed
GG
2116 /* Avoid infinite loop for the REs like "()\1+". */
2117 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118 continue;
ff28b140
TL
2119 if (type != OP_BACK_REF)
2120 continue;
2121
2122 entry = mctx->bkref_ents + first_idx;
2123 enabled_idx = first_idx;
2124 do
832b75ed 2125 {
ff28b140
TL
2126 Idx subexp_len;
2127 Idx to_idx;
2128 Idx dst_node;
2129 bool ok;
2130 re_dfastate_t *cur_state;
832b75ed 2131
ff28b140
TL
2132 if (entry->node != node)
2133 continue;
2134 subexp_len = entry->subexp_to - entry->subexp_from;
2135 to_idx = str_idx + subexp_len;
2136 dst_node = (subexp_len ? dfa->nexts[node]
2137 : dfa->edests[node].elems[0]);
2138
2139 if (to_idx > sctx->last_str_idx
2140 || sctx->sifted_states[to_idx] == NULL
2141 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2142 || check_dst_limits (mctx, &sctx->limits, node,
2143 str_idx, dst_node, to_idx))
2144 continue;
2145
2146 if (local_sctx.sifted_states == NULL)
2147 {
2148 local_sctx = *sctx;
2149 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2150 if (BE (err != REG_NOERROR, 0))
2151 goto free_return;
832b75ed 2152 }
ff28b140
TL
2153 local_sctx.last_node = node;
2154 local_sctx.last_str_idx = str_idx;
2155 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2156 if (BE (! ok, 0))
832b75ed 2157 {
ff28b140
TL
2158 err = REG_ESPACE;
2159 goto free_return;
832b75ed 2160 }
ff28b140
TL
2161 cur_state = local_sctx.sifted_states[str_idx];
2162 err = sift_states_backward (mctx, &local_sctx);
2163 if (BE (err != REG_NOERROR, 0))
2164 goto free_return;
2165 if (sctx->limited_states != NULL)
2166 {
2167 err = merge_state_array (dfa, sctx->limited_states,
2168 local_sctx.sifted_states,
2169 str_idx + 1);
2170 if (BE (err != REG_NOERROR, 0))
2171 goto free_return;
2172 }
2173 local_sctx.sifted_states[str_idx] = cur_state;
2174 re_node_set_remove (&local_sctx.limits, enabled_idx);
2175
2176 /* mctx->bkref_ents may have changed, reload the pointer. */
2177 entry = mctx->bkref_ents + enabled_idx;
832b75ed 2178 }
ff28b140 2179 while (enabled_idx++, entry++->more);
832b75ed
GG
2180 }
2181 err = REG_NOERROR;
2182 free_return:
2183 if (local_sctx.sifted_states != NULL)
2184 {
2185 re_node_set_free (&local_sctx.limits);
2186 }
2187
2188 return err;
2189}
2190
2191
2192#ifdef RE_ENABLE_I18N
2193static int
ff28b140
TL
2194sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2195 Idx node_idx, Idx str_idx, Idx max_str_idx)
832b75ed 2196{
ff28b140 2197 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 2198 int naccepted;
ff28b140
TL
2199 /* Check the node can accept "multi byte". */
2200 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
832b75ed
GG
2201 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2202 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2203 dfa->nexts[node_idx]))
ff28b140
TL
2204 /* The node can't accept the "multi byte", or the
2205 destination was already thrown away, then the node
2206 could't accept the current input "multi byte". */
832b75ed
GG
2207 naccepted = 0;
2208 /* Otherwise, it is sure that the node could accept
ff28b140 2209 'naccepted' bytes input. */
832b75ed
GG
2210 return naccepted;
2211}
2212#endif /* RE_ENABLE_I18N */
2213
2214\f
2215/* Functions for state transition. */
2216
2217/* Return the next state to which the current state STATE will transit by
2218 accepting the current input byte, and update STATE_LOG if necessary.
2219 If STATE can accept a multibyte char/collating element/back reference
2220 update the destination of STATE_LOG. */
2221
2222static re_dfastate_t *
ff28b140
TL
2223__attribute_warn_unused_result__
2224transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2225 re_dfastate_t *state)
2226{
2227 re_dfastate_t **trtable;
832b75ed 2228 unsigned char ch;
832b75ed 2229
ff28b140
TL
2230#ifdef RE_ENABLE_I18N
2231 /* If the current state can accept multibyte. */
2232 if (BE (state->accept_mb, 0))
832b75ed 2233 {
ff28b140 2234 *err = transit_state_mb (mctx, state);
832b75ed
GG
2235 if (BE (*err != REG_NOERROR, 0))
2236 return NULL;
2237 }
ff28b140 2238#endif /* RE_ENABLE_I18N */
832b75ed 2239
ff28b140
TL
2240 /* Then decide the next state with the single byte. */
2241#if 0
2242 if (0)
2243 /* don't use transition table */
2244 return transit_state_sb (err, mctx, state);
2245#endif
2246
2247 /* Use transition table */
2248 ch = re_string_fetch_byte (&mctx->input);
2249 for (;;)
832b75ed 2250 {
ff28b140
TL
2251 trtable = state->trtable;
2252 if (BE (trtable != NULL, 1))
2253 return trtable[ch];
832b75ed 2254
ff28b140
TL
2255 trtable = state->word_trtable;
2256 if (BE (trtable != NULL, 1))
832b75ed 2257 {
ff28b140
TL
2258 unsigned int context;
2259 context
2260 = re_string_context_at (&mctx->input,
2261 re_string_cur_idx (&mctx->input) - 1,
2262 mctx->eflags);
2263 if (IS_WORD_CONTEXT (context))
2264 return trtable[ch + SBC_MAX];
2265 else
2266 return trtable[ch];
832b75ed 2267 }
ff28b140
TL
2268
2269 if (!build_trtable (mctx->dfa, state))
832b75ed 2270 {
ff28b140
TL
2271 *err = REG_ESPACE;
2272 return NULL;
832b75ed 2273 }
ff28b140
TL
2274
2275 /* Retry, we now have a transition table. */
832b75ed 2276 }
ff28b140 2277}
832b75ed 2278
ff28b140
TL
2279/* Update the state_log if we need */
2280static re_dfastate_t *
2281merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2282 re_dfastate_t *next_state)
2283{
2284 const re_dfa_t *const dfa = mctx->dfa;
2285 Idx cur_idx = re_string_cur_idx (&mctx->input);
2286
2287 if (cur_idx > mctx->state_log_top)
832b75ed 2288 {
ff28b140
TL
2289 mctx->state_log[cur_idx] = next_state;
2290 mctx->state_log_top = cur_idx;
2291 }
2292 else if (mctx->state_log[cur_idx] == 0)
2293 {
2294 mctx->state_log[cur_idx] = next_state;
2295 }
2296 else
2297 {
2298 re_dfastate_t *pstate;
2299 unsigned int context;
2300 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 the destination of a multibyte char/collating element/
2303 back reference. Then the next state is the union set of
2304 these destinations and the results of the transition table. */
2305 pstate = mctx->state_log[cur_idx];
2306 log_nodes = pstate->entrance_nodes;
2307 if (next_state != NULL)
832b75ed 2308 {
ff28b140
TL
2309 table_nodes = next_state->entrance_nodes;
2310 *err = re_node_set_init_union (&next_nodes, table_nodes,
832b75ed 2311 log_nodes);
ff28b140
TL
2312 if (BE (*err != REG_NOERROR, 0))
2313 return NULL;
832b75ed 2314 }
ff28b140
TL
2315 else
2316 next_nodes = *log_nodes;
2317 /* Note: We already add the nodes of the initial state,
2318 then we don't need to add them here. */
2319
2320 context = re_string_context_at (&mctx->input,
2321 re_string_cur_idx (&mctx->input) - 1,
2322 mctx->eflags);
2323 next_state = mctx->state_log[cur_idx]
2324 = re_acquire_state_context (err, dfa, &next_nodes, context);
2325 /* We don't need to check errors here, since the return value of
2326 this function is next_state and ERR is already set. */
2327
2328 if (table_nodes != NULL)
2329 re_node_set_free (&next_nodes);
832b75ed
GG
2330 }
2331
ff28b140 2332 if (BE (dfa->nbackref, 0) && next_state != NULL)
832b75ed 2333 {
ff28b140
TL
2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 later. We must check them here, since the back references in the
2336 next state might use them. */
2337 *err = check_subexp_matching_top (mctx, &next_state->nodes,
832b75ed
GG
2338 cur_idx);
2339 if (BE (*err != REG_NOERROR, 0))
2340 return NULL;
ff28b140
TL
2341
2342 /* If the next state has back references. */
2343 if (next_state->has_backref)
2344 {
2345 *err = transit_state_bkref (mctx, &next_state->nodes);
2346 if (BE (*err != REG_NOERROR, 0))
2347 return NULL;
2348 next_state = mctx->state_log[cur_idx];
2349 }
832b75ed
GG
2350 }
2351
ff28b140
TL
2352 return next_state;
2353}
2354
2355/* Skip bytes in the input that correspond to part of a
2356 multi-byte match, then look in the log for a state
2357 from which to restart matching. */
2358static re_dfastate_t *
2359find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360{
2361 re_dfastate_t *cur_state;
2362 do
832b75ed 2363 {
ff28b140
TL
2364 Idx max = mctx->state_log_top;
2365 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366
2367 do
2368 {
2369 if (++cur_str_idx > max)
2370 return NULL;
2371 re_string_skip_bytes (&mctx->input, 1);
2372 }
2373 while (mctx->state_log[cur_str_idx] == NULL);
2374
2375 cur_state = merge_state_with_log (err, mctx, NULL);
832b75ed 2376 }
ff28b140
TL
2377 while (*err == REG_NOERROR && cur_state == NULL);
2378 return cur_state;
832b75ed
GG
2379}
2380
2381/* Helper functions for transit_state. */
2382
2383/* From the node set CUR_NODES, pick up the nodes whose types are
2384 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385 expression. And register them to use them later for evaluating the
ff28b140 2386 corresponding back references. */
832b75ed
GG
2387
2388static reg_errcode_t
ff28b140
TL
2389check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2390 Idx str_idx)
832b75ed 2391{
ff28b140
TL
2392 const re_dfa_t *const dfa = mctx->dfa;
2393 Idx node_idx;
832b75ed
GG
2394 reg_errcode_t err;
2395
2396 /* TODO: This isn't efficient.
2397 Because there might be more than one nodes whose types are
2398 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2399 nodes.
2400 E.g. RE: (a){2} */
2401 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402 {
ff28b140 2403 Idx node = cur_nodes->elems[node_idx];
832b75ed 2404 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
ff28b140
TL
2405 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2406 && (dfa->used_bkref_map
2407 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
832b75ed
GG
2408 {
2409 err = match_ctx_add_subtop (mctx, node, str_idx);
2410 if (BE (err != REG_NOERROR, 0))
2411 return err;
2412 }
2413 }
2414 return REG_NOERROR;
2415}
2416
ff28b140 2417#if 0
832b75ed
GG
2418/* Return the next state to which the current state STATE will transit by
2419 accepting the current input byte. */
2420
2421static re_dfastate_t *
ff28b140
TL
2422transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423 re_dfastate_t *state)
2424{
2425 const re_dfa_t *const dfa = mctx->dfa;
832b75ed
GG
2426 re_node_set next_nodes;
2427 re_dfastate_t *next_state;
ff28b140 2428 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
832b75ed
GG
2429 unsigned int context;
2430
2431 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432 if (BE (*err != REG_NOERROR, 0))
2433 return NULL;
2434 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435 {
ff28b140
TL
2436 Idx cur_node = state->nodes.elems[node_cnt];
2437 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
832b75ed
GG
2438 {
2439 *err = re_node_set_merge (&next_nodes,
2440 dfa->eclosures + dfa->nexts[cur_node]);
2441 if (BE (*err != REG_NOERROR, 0))
2442 {
2443 re_node_set_free (&next_nodes);
2444 return NULL;
2445 }
2446 }
2447 }
ff28b140 2448 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
832b75ed
GG
2449 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450 /* We don't need to check errors here, since the return value of
2451 this function is next_state and ERR is already set. */
2452
2453 re_node_set_free (&next_nodes);
ff28b140 2454 re_string_skip_bytes (&mctx->input, 1);
832b75ed
GG
2455 return next_state;
2456}
ff28b140 2457#endif
832b75ed
GG
2458
2459#ifdef RE_ENABLE_I18N
2460static reg_errcode_t
ff28b140 2461transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
832b75ed 2462{
ff28b140 2463 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 2464 reg_errcode_t err;
ff28b140 2465 Idx i;
832b75ed
GG
2466
2467 for (i = 0; i < pstate->nodes.nelem; ++i)
2468 {
2469 re_node_set dest_nodes, *new_nodes;
ff28b140
TL
2470 Idx cur_node_idx = pstate->nodes.elems[i];
2471 int naccepted;
2472 Idx dest_idx;
832b75ed
GG
2473 unsigned int context;
2474 re_dfastate_t *dest_state;
2475
ff28b140
TL
2476 if (!dfa->nodes[cur_node_idx].accept_mb)
2477 continue;
2478
832b75ed
GG
2479 if (dfa->nodes[cur_node_idx].constraint)
2480 {
ff28b140
TL
2481 context = re_string_context_at (&mctx->input,
2482 re_string_cur_idx (&mctx->input),
2483 mctx->eflags);
832b75ed
GG
2484 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2485 context))
2486 continue;
2487 }
2488
ff28b140
TL
2489 /* How many bytes the node can accept? */
2490 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2491 re_string_cur_idx (&mctx->input));
832b75ed
GG
2492 if (naccepted == 0)
2493 continue;
2494
ff28b140
TL
2495 /* The node can accepts 'naccepted' bytes. */
2496 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
832b75ed
GG
2497 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2498 : mctx->max_mb_elem_len);
ff28b140 2499 err = clean_state_log_if_needed (mctx, dest_idx);
832b75ed
GG
2500 if (BE (err != REG_NOERROR, 0))
2501 return err;
2502#ifdef DEBUG
2503 assert (dfa->nexts[cur_node_idx] != -1);
2504#endif
ff28b140 2505 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
832b75ed
GG
2506
2507 dest_state = mctx->state_log[dest_idx];
2508 if (dest_state == NULL)
2509 dest_nodes = *new_nodes;
2510 else
2511 {
2512 err = re_node_set_init_union (&dest_nodes,
2513 dest_state->entrance_nodes, new_nodes);
2514 if (BE (err != REG_NOERROR, 0))
2515 return err;
2516 }
ff28b140
TL
2517 context = re_string_context_at (&mctx->input, dest_idx - 1,
2518 mctx->eflags);
832b75ed
GG
2519 mctx->state_log[dest_idx]
2520 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521 if (dest_state != NULL)
2522 re_node_set_free (&dest_nodes);
2523 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524 return err;
2525 }
2526 return REG_NOERROR;
2527}
2528#endif /* RE_ENABLE_I18N */
2529
2530static reg_errcode_t
ff28b140 2531transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
832b75ed 2532{
ff28b140 2533 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 2534 reg_errcode_t err;
ff28b140
TL
2535 Idx i;
2536 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
832b75ed
GG
2537
2538 for (i = 0; i < nodes->nelem; ++i)
2539 {
ff28b140
TL
2540 Idx dest_str_idx, prev_nelem, bkc_idx;
2541 Idx node_idx = nodes->elems[i];
832b75ed 2542 unsigned int context;
ff28b140 2543 const re_token_t *node = dfa->nodes + node_idx;
832b75ed
GG
2544 re_node_set *new_dest_nodes;
2545
ff28b140 2546 /* Check whether 'node' is a backreference or not. */
832b75ed
GG
2547 if (node->type != OP_BACK_REF)
2548 continue;
2549
2550 if (node->constraint)
2551 {
ff28b140
TL
2552 context = re_string_context_at (&mctx->input, cur_str_idx,
2553 mctx->eflags);
832b75ed
GG
2554 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2555 continue;
2556 }
2557
ff28b140 2558 /* 'node' is a backreference.
832b75ed
GG
2559 Check the substring which the substring matched. */
2560 bkc_idx = mctx->nbkref_ents;
ff28b140 2561 err = get_subexp (mctx, node_idx, cur_str_idx);
832b75ed
GG
2562 if (BE (err != REG_NOERROR, 0))
2563 goto free_return;
2564
ff28b140 2565 /* And add the epsilon closures (which is 'new_dest_nodes') of
832b75ed
GG
2566 the backreference to appropriate state_log. */
2567#ifdef DEBUG
2568 assert (dfa->nexts[node_idx] != -1);
2569#endif
2570 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2571 {
ff28b140 2572 Idx subexp_len;
832b75ed
GG
2573 re_dfastate_t *dest_state;
2574 struct re_backref_cache_entry *bkref_ent;
2575 bkref_ent = mctx->bkref_ents + bkc_idx;
2576 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2577 continue;
2578 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2579 new_dest_nodes = (subexp_len == 0
2580 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2581 : dfa->eclosures + dfa->nexts[node_idx]);
2582 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2583 - bkref_ent->subexp_from);
ff28b140
TL
2584 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2585 mctx->eflags);
832b75ed
GG
2586 dest_state = mctx->state_log[dest_str_idx];
2587 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2588 : mctx->state_log[cur_str_idx]->nodes.nelem);
ff28b140 2589 /* Add 'new_dest_node' to state_log. */
832b75ed
GG
2590 if (dest_state == NULL)
2591 {
2592 mctx->state_log[dest_str_idx]
2593 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2594 context);
2595 if (BE (mctx->state_log[dest_str_idx] == NULL
2596 && err != REG_NOERROR, 0))
2597 goto free_return;
2598 }
2599 else
2600 {
2601 re_node_set dest_nodes;
2602 err = re_node_set_init_union (&dest_nodes,
2603 dest_state->entrance_nodes,
2604 new_dest_nodes);
2605 if (BE (err != REG_NOERROR, 0))
2606 {
2607 re_node_set_free (&dest_nodes);
2608 goto free_return;
2609 }
2610 mctx->state_log[dest_str_idx]
2611 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2612 re_node_set_free (&dest_nodes);
2613 if (BE (mctx->state_log[dest_str_idx] == NULL
2614 && err != REG_NOERROR, 0))
2615 goto free_return;
2616 }
2617 /* We need to check recursively if the backreference can epsilon
2618 transit. */
2619 if (subexp_len == 0
2620 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2621 {
ff28b140 2622 err = check_subexp_matching_top (mctx, new_dest_nodes,
832b75ed
GG
2623 cur_str_idx);
2624 if (BE (err != REG_NOERROR, 0))
2625 goto free_return;
ff28b140 2626 err = transit_state_bkref (mctx, new_dest_nodes);
832b75ed
GG
2627 if (BE (err != REG_NOERROR, 0))
2628 goto free_return;
2629 }
2630 }
2631 }
2632 err = REG_NOERROR;
2633 free_return:
2634 return err;
2635}
2636
2637/* Enumerate all the candidates which the backreference BKREF_NODE can match
2638 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2639 Note that we might collect inappropriate candidates here.
2640 However, the cost of checking them strictly here is too high, then we
2641 delay these checking for prune_impossible_nodes(). */
2642
2643static reg_errcode_t
ff28b140
TL
2644__attribute_warn_unused_result__
2645get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2646{
2647 const re_dfa_t *const dfa = mctx->dfa;
2648 Idx subexp_num, sub_top_idx;
2649 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
832b75ed 2650 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
ff28b140
TL
2651 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2652 if (cache_idx != -1)
832b75ed 2653 {
ff28b140
TL
2654 const struct re_backref_cache_entry *entry
2655 = mctx->bkref_ents + cache_idx;
2656 do
2657 if (entry->node == bkref_node)
2658 return REG_NOERROR; /* We already checked it. */
2659 while (entry++->more);
832b75ed 2660 }
ff28b140
TL
2661
2662 subexp_num = dfa->nodes[bkref_node].opr.idx;
832b75ed
GG
2663
2664 /* For each sub expression */
2665 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666 {
2667 reg_errcode_t err;
2668 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669 re_sub_match_last_t *sub_last;
ff28b140 2670 Idx sub_last_idx, sl_str, bkref_str_off;
832b75ed
GG
2671
2672 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673 continue; /* It isn't related. */
2674
2675 sl_str = sub_top->str_idx;
ff28b140 2676 bkref_str_off = bkref_str_idx;
832b75ed
GG
2677 /* At first, check the last node of sub expressions we already
2678 evaluated. */
2679 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680 {
ff28b140 2681 regoff_t sl_str_diff;
832b75ed
GG
2682 sub_last = sub_top->lasts[sub_last_idx];
2683 sl_str_diff = sub_last->str_idx - sl_str;
2684 /* The matched string by the sub expression match with the substring
2685 at the back reference? */
ff28b140
TL
2686 if (sl_str_diff > 0)
2687 {
2688 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689 {
2690 /* Not enough chars for a successful match. */
2691 if (bkref_str_off + sl_str_diff > mctx->input.len)
2692 break;
2693
2694 err = clean_state_log_if_needed (mctx,
2695 bkref_str_off
2696 + sl_str_diff);
2697 if (BE (err != REG_NOERROR, 0))
2698 return err;
2699 buf = (const char *) re_string_get_buffer (&mctx->input);
2700 }
2701 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702 /* We don't need to search this sub expression any more. */
2703 break;
2704 }
2705 bkref_str_off += sl_str_diff;
832b75ed 2706 sl_str += sl_str_diff;
ff28b140 2707 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
832b75ed 2708 bkref_str_idx);
ff28b140
TL
2709
2710 /* Reload buf, since the preceding call might have reallocated
2711 the buffer. */
2712 buf = (const char *) re_string_get_buffer (&mctx->input);
2713
832b75ed
GG
2714 if (err == REG_NOMATCH)
2715 continue;
2716 if (BE (err != REG_NOERROR, 0))
2717 return err;
2718 }
ff28b140 2719
832b75ed
GG
2720 if (sub_last_idx < sub_top->nlasts)
2721 continue;
2722 if (sub_last_idx > 0)
2723 ++sl_str;
2724 /* Then, search for the other last nodes of the sub expression. */
2725 for (; sl_str <= bkref_str_idx; ++sl_str)
2726 {
ff28b140
TL
2727 Idx cls_node;
2728 regoff_t sl_str_off;
2729 const re_node_set *nodes;
832b75ed
GG
2730 sl_str_off = sl_str - sub_top->str_idx;
2731 /* The matched string by the sub expression match with the substring
2732 at the back reference? */
ff28b140
TL
2733 if (sl_str_off > 0)
2734 {
2735 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736 {
2737 /* If we are at the end of the input, we cannot match. */
2738 if (bkref_str_off >= mctx->input.len)
2739 break;
2740
2741 err = extend_buffers (mctx, bkref_str_off + 1);
2742 if (BE (err != REG_NOERROR, 0))
2743 return err;
2744
2745 buf = (const char *) re_string_get_buffer (&mctx->input);
2746 }
2747 if (buf [bkref_str_off++] != buf[sl_str - 1])
2748 break; /* We don't need to search this sub expression
2749 any more. */
2750 }
832b75ed
GG
2751 if (mctx->state_log[sl_str] == NULL)
2752 continue;
2753 /* Does this state have a ')' of the sub expression? */
2754 nodes = &mctx->state_log[sl_str]->nodes;
ff28b140
TL
2755 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2756 OP_CLOSE_SUBEXP);
832b75ed
GG
2757 if (cls_node == -1)
2758 continue; /* No. */
2759 if (sub_top->path == NULL)
2760 {
2761 sub_top->path = calloc (sizeof (state_array_t),
2762 sl_str - sub_top->str_idx + 1);
2763 if (sub_top->path == NULL)
2764 return REG_ESPACE;
2765 }
2766 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2767 in the current context? */
ff28b140
TL
2768 err = check_arrival (mctx, sub_top->path, sub_top->node,
2769 sub_top->str_idx, cls_node, sl_str,
2770 OP_CLOSE_SUBEXP);
832b75ed
GG
2771 if (err == REG_NOMATCH)
2772 continue;
2773 if (BE (err != REG_NOERROR, 0))
2774 return err;
2775 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2776 if (BE (sub_last == NULL, 0))
2777 return REG_ESPACE;
ff28b140 2778 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
832b75ed
GG
2779 bkref_str_idx);
2780 if (err == REG_NOMATCH)
2781 continue;
2782 }
2783 }
2784 return REG_NOERROR;
2785}
2786
2787/* Helper functions for get_subexp(). */
2788
2789/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2790 If it can arrive, register the sub expression expressed with SUB_TOP
2791 and SUB_LAST. */
2792
2793static reg_errcode_t
ff28b140
TL
2794get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2795 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
832b75ed
GG
2796{
2797 reg_errcode_t err;
ff28b140 2798 Idx to_idx;
832b75ed 2799 /* Can the subexpression arrive the back reference? */
ff28b140
TL
2800 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2801 sub_last->str_idx, bkref_node, bkref_str,
2802 OP_OPEN_SUBEXP);
832b75ed
GG
2803 if (err != REG_NOERROR)
2804 return err;
2805 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806 sub_last->str_idx);
2807 if (BE (err != REG_NOERROR, 0))
2808 return err;
2809 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
ff28b140 2810 return clean_state_log_if_needed (mctx, to_idx);
832b75ed
GG
2811}
2812
2813/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814 Search '(' if FL_OPEN, or search ')' otherwise.
2815 TODO: This function isn't efficient...
2816 Because there might be more than one nodes whose types are
2817 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818 nodes.
2819 E.g. RE: (a){2} */
2820
ff28b140
TL
2821static Idx
2822find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2823 Idx subexp_idx, int type)
832b75ed 2824{
ff28b140 2825 Idx cls_idx;
832b75ed
GG
2826 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827 {
ff28b140
TL
2828 Idx cls_node = nodes->elems[cls_idx];
2829 const re_token_t *node = dfa->nodes + cls_node;
2830 if (node->type == type
832b75ed
GG
2831 && node->opr.idx == subexp_idx)
2832 return cls_node;
2833 }
2834 return -1;
2835}
2836
2837/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2839 heavily reused.
2840 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2841
2842static reg_errcode_t
ff28b140
TL
2843__attribute_warn_unused_result__
2844check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845 Idx top_str, Idx last_node, Idx last_str, int type)
2846{
2847 const re_dfa_t *const dfa = mctx->dfa;
2848 reg_errcode_t err = REG_NOERROR;
2849 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
832b75ed
GG
2850 re_dfastate_t *cur_state = NULL;
2851 re_node_set *cur_nodes, next_nodes;
2852 re_dfastate_t **backup_state_log;
2853 unsigned int context;
2854
2855 subexp_num = dfa->nodes[top_node].opr.idx;
2856 /* Extend the buffer if we need. */
ff28b140 2857 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
832b75ed
GG
2858 {
2859 re_dfastate_t **new_array;
ff28b140
TL
2860 Idx old_alloc = path->alloc;
2861 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2862 Idx new_alloc;
2863 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2864 return REG_ESPACE;
2865 new_alloc = old_alloc + incr_alloc;
2866 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2867 return REG_ESPACE;
2868 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2869 if (BE (new_array == NULL, 0))
832b75ed
GG
2870 return REG_ESPACE;
2871 path->array = new_array;
ff28b140 2872 path->alloc = new_alloc;
832b75ed
GG
2873 memset (new_array + old_alloc, '\0',
2874 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875 }
2876
ff28b140 2877 str_idx = path->next_idx ? path->next_idx : top_str;
832b75ed
GG
2878
2879 /* Temporary modify MCTX. */
2880 backup_state_log = mctx->state_log;
ff28b140 2881 backup_cur_idx = mctx->input.cur_idx;
832b75ed 2882 mctx->state_log = path->array;
ff28b140 2883 mctx->input.cur_idx = str_idx;
832b75ed
GG
2884
2885 /* Setup initial node set. */
ff28b140 2886 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
832b75ed
GG
2887 if (str_idx == top_str)
2888 {
2889 err = re_node_set_init_1 (&next_nodes, top_node);
2890 if (BE (err != REG_NOERROR, 0))
2891 return err;
ff28b140 2892 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
832b75ed
GG
2893 if (BE (err != REG_NOERROR, 0))
2894 {
2895 re_node_set_free (&next_nodes);
2896 return err;
2897 }
2898 }
2899 else
2900 {
2901 cur_state = mctx->state_log[str_idx];
2902 if (cur_state && cur_state->has_backref)
2903 {
2904 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
ff28b140 2905 if (BE (err != REG_NOERROR, 0))
832b75ed
GG
2906 return err;
2907 }
2908 else
2909 re_node_set_init_empty (&next_nodes);
2910 }
2911 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912 {
2913 if (next_nodes.nelem)
2914 {
ff28b140
TL
2915 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916 subexp_num, type);
2917 if (BE (err != REG_NOERROR, 0))
832b75ed
GG
2918 {
2919 re_node_set_free (&next_nodes);
2920 return err;
2921 }
2922 }
2923 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925 {
2926 re_node_set_free (&next_nodes);
2927 return err;
2928 }
2929 mctx->state_log[str_idx] = cur_state;
2930 }
2931
2932 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933 {
2934 re_node_set_empty (&next_nodes);
2935 if (mctx->state_log[str_idx + 1])
2936 {
2937 err = re_node_set_merge (&next_nodes,
2938 &mctx->state_log[str_idx + 1]->nodes);
2939 if (BE (err != REG_NOERROR, 0))
2940 {
2941 re_node_set_free (&next_nodes);
2942 return err;
2943 }
2944 }
2945 if (cur_state)
2946 {
ff28b140
TL
2947 err = check_arrival_add_next_nodes (mctx, str_idx,
2948 &cur_state->non_eps_nodes,
2949 &next_nodes);
832b75ed
GG
2950 if (BE (err != REG_NOERROR, 0))
2951 {
2952 re_node_set_free (&next_nodes);
2953 return err;
2954 }
2955 }
2956 ++str_idx;
2957 if (next_nodes.nelem)
2958 {
ff28b140 2959 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
832b75ed
GG
2960 if (BE (err != REG_NOERROR, 0))
2961 {
2962 re_node_set_free (&next_nodes);
2963 return err;
2964 }
ff28b140
TL
2965 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2966 subexp_num, type);
2967 if (BE (err != REG_NOERROR, 0))
832b75ed
GG
2968 {
2969 re_node_set_free (&next_nodes);
2970 return err;
2971 }
2972 }
ff28b140 2973 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
832b75ed
GG
2974 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976 {
2977 re_node_set_free (&next_nodes);
2978 return err;
2979 }
2980 mctx->state_log[str_idx] = cur_state;
2981 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982 }
2983 re_node_set_free (&next_nodes);
2984 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2985 : &mctx->state_log[last_str]->nodes);
2986 path->next_idx = str_idx;
2987
2988 /* Fix MCTX. */
2989 mctx->state_log = backup_state_log;
ff28b140 2990 mctx->input.cur_idx = backup_cur_idx;
832b75ed 2991
832b75ed 2992 /* Then check the current node set has the node LAST_NODE. */
ff28b140
TL
2993 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2994 return REG_NOERROR;
2995
2996 return REG_NOMATCH;
832b75ed
GG
2997}
2998
2999/* Helper functions for check_arrival. */
3000
3001/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3002 to NEXT_NODES.
3003 TODO: This function is similar to the functions transit_state*(),
3004 however this function has many additional works.
3005 Can't we unify them? */
3006
3007static reg_errcode_t
ff28b140
TL
3008__attribute_warn_unused_result__
3009check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3010 re_node_set *cur_nodes, re_node_set *next_nodes)
3011{
3012 const re_dfa_t *const dfa = mctx->dfa;
3013 bool ok;
3014 Idx cur_idx;
3015#ifdef RE_ENABLE_I18N
3016 reg_errcode_t err = REG_NOERROR;
3017#endif
832b75ed
GG
3018 re_node_set union_set;
3019 re_node_set_init_empty (&union_set);
3020 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3021 {
3022 int naccepted = 0;
ff28b140
TL
3023 Idx cur_node = cur_nodes->elems[cur_idx];
3024#ifdef DEBUG
832b75ed 3025 re_token_type_t type = dfa->nodes[cur_node].type;
ff28b140
TL
3026 assert (!IS_EPSILON_NODE (type));
3027#endif
832b75ed 3028#ifdef RE_ENABLE_I18N
ff28b140
TL
3029 /* If the node may accept "multi byte". */
3030 if (dfa->nodes[cur_node].accept_mb)
832b75ed 3031 {
ff28b140 3032 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
832b75ed
GG
3033 str_idx);
3034 if (naccepted > 1)
3035 {
3036 re_dfastate_t *dest_state;
ff28b140
TL
3037 Idx next_node = dfa->nexts[cur_node];
3038 Idx next_idx = str_idx + naccepted;
832b75ed
GG
3039 dest_state = mctx->state_log[next_idx];
3040 re_node_set_empty (&union_set);
3041 if (dest_state)
3042 {
3043 err = re_node_set_merge (&union_set, &dest_state->nodes);
3044 if (BE (err != REG_NOERROR, 0))
3045 {
3046 re_node_set_free (&union_set);
3047 return err;
3048 }
832b75ed 3049 }
ff28b140
TL
3050 ok = re_node_set_insert (&union_set, next_node);
3051 if (BE (! ok, 0))
832b75ed 3052 {
ff28b140
TL
3053 re_node_set_free (&union_set);
3054 return REG_ESPACE;
832b75ed
GG
3055 }
3056 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057 &union_set);
3058 if (BE (mctx->state_log[next_idx] == NULL
3059 && err != REG_NOERROR, 0))
3060 {
3061 re_node_set_free (&union_set);
3062 return err;
3063 }
3064 }
3065 }
3066#endif /* RE_ENABLE_I18N */
3067 if (naccepted
ff28b140 3068 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
832b75ed 3069 {
ff28b140
TL
3070 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071 if (BE (! ok, 0))
832b75ed
GG
3072 {
3073 re_node_set_free (&union_set);
3074 return REG_ESPACE;
3075 }
3076 }
3077 }
3078 re_node_set_free (&union_set);
3079 return REG_NOERROR;
3080}
3081
3082/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083 CUR_NODES, however exclude the nodes which are:
3084 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086*/
3087
3088static reg_errcode_t
ff28b140
TL
3089check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3090 Idx ex_subexp, int type)
832b75ed
GG
3091{
3092 reg_errcode_t err;
ff28b140 3093 Idx idx, outside_node;
832b75ed
GG
3094 re_node_set new_nodes;
3095#ifdef DEBUG
3096 assert (cur_nodes->nelem);
3097#endif
3098 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3099 if (BE (err != REG_NOERROR, 0))
3100 return err;
3101 /* Create a new node set NEW_NODES with the nodes which are epsilon
3102 closures of the node in CUR_NODES. */
3103
3104 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3105 {
ff28b140
TL
3106 Idx cur_node = cur_nodes->elems[idx];
3107 const re_node_set *eclosure = dfa->eclosures + cur_node;
3108 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
832b75ed
GG
3109 if (outside_node == -1)
3110 {
3111 /* There are no problematic nodes, just merge them. */
3112 err = re_node_set_merge (&new_nodes, eclosure);
3113 if (BE (err != REG_NOERROR, 0))
3114 {
3115 re_node_set_free (&new_nodes);
3116 return err;
3117 }
3118 }
3119 else
3120 {
3121 /* There are problematic nodes, re-calculate incrementally. */
3122 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
ff28b140 3123 ex_subexp, type);
832b75ed
GG
3124 if (BE (err != REG_NOERROR, 0))
3125 {
3126 re_node_set_free (&new_nodes);
3127 return err;
3128 }
3129 }
3130 }
3131 re_node_set_free (cur_nodes);
3132 *cur_nodes = new_nodes;
3133 return REG_NOERROR;
3134}
3135
3136/* Helper function for check_arrival_expand_ecl.
3137 Check incrementally the epsilon closure of TARGET, and if it isn't
3138 problematic append it to DST_NODES. */
3139
3140static reg_errcode_t
ff28b140
TL
3141__attribute_warn_unused_result__
3142check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3143 Idx target, Idx ex_subexp, int type)
832b75ed 3144{
ff28b140 3145 Idx cur_node;
832b75ed
GG
3146 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3147 {
ff28b140 3148 bool ok;
832b75ed 3149
ff28b140 3150 if (dfa->nodes[cur_node].type == type
832b75ed
GG
3151 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3152 {
ff28b140 3153 if (type == OP_CLOSE_SUBEXP)
832b75ed 3154 {
ff28b140
TL
3155 ok = re_node_set_insert (dst_nodes, cur_node);
3156 if (BE (! ok, 0))
832b75ed
GG
3157 return REG_ESPACE;
3158 }
3159 break;
3160 }
ff28b140
TL
3161 ok = re_node_set_insert (dst_nodes, cur_node);
3162 if (BE (! ok, 0))
832b75ed
GG
3163 return REG_ESPACE;
3164 if (dfa->edests[cur_node].nelem == 0)
3165 break;
3166 if (dfa->edests[cur_node].nelem == 2)
3167 {
ff28b140 3168 reg_errcode_t err;
832b75ed
GG
3169 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3170 dfa->edests[cur_node].elems[1],
ff28b140 3171 ex_subexp, type);
832b75ed
GG
3172 if (BE (err != REG_NOERROR, 0))
3173 return err;
3174 }
3175 cur_node = dfa->edests[cur_node].elems[0];
3176 }
3177 return REG_NOERROR;
3178}
3179
3180
3181/* For all the back references in the current state, calculate the
3182 destination of the back references by the appropriate entry
3183 in MCTX->BKREF_ENTS. */
3184
3185static reg_errcode_t
ff28b140
TL
3186__attribute_warn_unused_result__
3187expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3188 Idx cur_str, Idx subexp_num, int type)
832b75ed 3189{
ff28b140 3190 const re_dfa_t *const dfa = mctx->dfa;
832b75ed 3191 reg_errcode_t err;
ff28b140
TL
3192 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3193 struct re_backref_cache_entry *ent;
3194
3195 if (cache_idx_start == -1)
3196 return REG_NOERROR;
832b75ed 3197
ff28b140
TL
3198 restart:
3199 ent = mctx->bkref_ents + cache_idx_start;
3200 do
832b75ed 3201 {
ff28b140
TL
3202 Idx to_idx, next_node;
3203
832b75ed
GG
3204 /* Is this entry ENT is appropriate? */
3205 if (!re_node_set_contains (cur_nodes, ent->node))
3206 continue; /* No. */
3207
3208 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3209 /* Calculate the destination of the back reference, and append it
3210 to MCTX->STATE_LOG. */
3211 if (to_idx == cur_str)
3212 {
3213 /* The backreference did epsilon transit, we must re-check all the
3214 node in the current state. */
3215 re_node_set new_dests;
3216 reg_errcode_t err2, err3;
3217 next_node = dfa->edests[ent->node].elems[0];
3218 if (re_node_set_contains (cur_nodes, next_node))
3219 continue;
3220 err = re_node_set_init_1 (&new_dests, next_node);
ff28b140 3221 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
832b75ed
GG
3222 err3 = re_node_set_merge (cur_nodes, &new_dests);
3223 re_node_set_free (&new_dests);
3224 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3225 || err3 != REG_NOERROR, 0))
3226 {
3227 err = (err != REG_NOERROR ? err
3228 : (err2 != REG_NOERROR ? err2 : err3));
3229 return err;
3230 }
3231 /* TODO: It is still inefficient... */
ff28b140 3232 goto restart;
832b75ed
GG
3233 }
3234 else
3235 {
3236 re_node_set union_set;
3237 next_node = dfa->nexts[ent->node];
3238 if (mctx->state_log[to_idx])
3239 {
ff28b140 3240 bool ok;
832b75ed
GG
3241 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3242 next_node))
3243 continue;
3244 err = re_node_set_init_copy (&union_set,
3245 &mctx->state_log[to_idx]->nodes);
ff28b140
TL
3246 ok = re_node_set_insert (&union_set, next_node);
3247 if (BE (err != REG_NOERROR || ! ok, 0))
832b75ed
GG
3248 {
3249 re_node_set_free (&union_set);
3250 err = err != REG_NOERROR ? err : REG_ESPACE;
3251 return err;
3252 }
3253 }
3254 else
3255 {
3256 err = re_node_set_init_1 (&union_set, next_node);
3257 if (BE (err != REG_NOERROR, 0))
3258 return err;
3259 }
3260 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3261 re_node_set_free (&union_set);
3262 if (BE (mctx->state_log[to_idx] == NULL
3263 && err != REG_NOERROR, 0))
3264 return err;
3265 }
3266 }
ff28b140 3267 while (ent++->more);
832b75ed
GG
3268 return REG_NOERROR;
3269}
3270
3271/* Build transition table for the state.
ff28b140 3272 Return true if successful. */
832b75ed 3273
ff28b140
TL
3274static bool
3275build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
832b75ed
GG
3276{
3277 reg_errcode_t err;
ff28b140
TL
3278 Idx i, j;
3279 int ch;
3280 bool need_word_trtable = false;
3281 bitset_word_t elem, mask;
3282 bool dests_node_malloced = false;
3283 bool dest_states_malloced = false;
3284 Idx ndests; /* Number of the destination states from 'state'. */
832b75ed
GG
3285 re_dfastate_t **trtable;
3286 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3287 re_node_set follows, *dests_node;
ff28b140
TL
3288 bitset_t *dests_ch;
3289 bitset_t acceptable;
3290
3291 struct dests_alloc
3292 {
3293 re_node_set dests_node[SBC_MAX];
3294 bitset_t dests_ch[SBC_MAX];
3295 } *dests_alloc;
832b75ed
GG
3296
3297 /* We build DFA states which corresponds to the destination nodes
ff28b140
TL
3298 from 'state'. 'dests_node[i]' represents the nodes which i-th
3299 destination state contains, and 'dests_ch[i]' represents the
832b75ed 3300 characters which i-th destination state accepts. */
ff28b140
TL
3301 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3302 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
832b75ed 3303 else
832b75ed 3304 {
ff28b140
TL
3305 dests_alloc = re_malloc (struct dests_alloc, 1);
3306 if (BE (dests_alloc == NULL, 0))
3307 return false;
3308 dests_node_malloced = true;
832b75ed 3309 }
ff28b140
TL
3310 dests_node = dests_alloc->dests_node;
3311 dests_ch = dests_alloc->dests_ch;
832b75ed 3312
ff28b140
TL
3313 /* Initialize transition table. */
3314 state->word_trtable = state->trtable = NULL;
832b75ed 3315
ff28b140 3316 /* At first, group all nodes belonging to 'state' into several
832b75ed 3317 destinations. */
ff28b140 3318 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
832b75ed
GG
3319 if (BE (ndests <= 0, 0))
3320 {
3321 if (dests_node_malloced)
ff28b140
TL
3322 re_free (dests_alloc);
3323 /* Return false in case of an error, true otherwise. */
832b75ed 3324 if (ndests == 0)
ff28b140
TL
3325 {
3326 state->trtable = (re_dfastate_t **)
3327 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3328 if (BE (state->trtable == NULL, 0))
3329 return false;
3330 return true;
3331 }
3332 return false;
832b75ed
GG
3333 }
3334
3335 err = re_node_set_alloc (&follows, ndests + 1);
3336 if (BE (err != REG_NOERROR, 0))
3337 goto out_free;
3338
ff28b140
TL
3339 /* Avoid arithmetic overflow in size calculation. */
3340 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3341 / (3 * sizeof (re_dfastate_t *)))
3342 < ndests),
3343 0))
3344 goto out_free;
3345
3346 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
832b75ed
GG
3347 + ndests * 3 * sizeof (re_dfastate_t *)))
3348 dest_states = (re_dfastate_t **)
ff28b140 3349 alloca (ndests * 3 * sizeof (re_dfastate_t *));
832b75ed 3350 else
832b75ed 3351 {
ff28b140 3352 dest_states = re_malloc (re_dfastate_t *, ndests * 3);
832b75ed
GG
3353 if (BE (dest_states == NULL, 0))
3354 {
3355out_free:
3356 if (dest_states_malloced)
ff28b140 3357 re_free (dest_states);
832b75ed
GG
3358 re_node_set_free (&follows);
3359 for (i = 0; i < ndests; ++i)
3360 re_node_set_free (dests_node + i);
832b75ed 3361 if (dests_node_malloced)
ff28b140
TL
3362 re_free (dests_alloc);
3363 return false;
832b75ed 3364 }
ff28b140 3365 dest_states_malloced = true;
832b75ed
GG
3366 }
3367 dest_states_word = dest_states + ndests;
3368 dest_states_nl = dest_states_word + ndests;
3369 bitset_empty (acceptable);
3370
3371 /* Then build the states for all destinations. */
3372 for (i = 0; i < ndests; ++i)
3373 {
ff28b140 3374 Idx next_node;
832b75ed
GG
3375 re_node_set_empty (&follows);
3376 /* Merge the follows of this destination states. */
3377 for (j = 0; j < dests_node[i].nelem; ++j)
3378 {
3379 next_node = dfa->nexts[dests_node[i].elems[j]];
3380 if (next_node != -1)
3381 {
3382 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3383 if (BE (err != REG_NOERROR, 0))
3384 goto out_free;
3385 }
3386 }
832b75ed
GG
3387 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3388 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3389 goto out_free;
3390 /* If the new state has context constraint,
3391 build appropriate states for these contexts. */
3392 if (dest_states[i]->has_constraint)
3393 {
3394 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3395 CONTEXT_WORD);
3396 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3397 goto out_free;
ff28b140
TL
3398
3399 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3400 need_word_trtable = true;
3401
832b75ed
GG
3402 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3403 CONTEXT_NEWLINE);
3404 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3405 goto out_free;
3406 }
3407 else
3408 {
3409 dest_states_word[i] = dest_states[i];
3410 dest_states_nl[i] = dest_states[i];
3411 }
3412 bitset_merge (acceptable, dests_ch[i]);
3413 }
3414
ff28b140
TL
3415 if (!BE (need_word_trtable, 0))
3416 {
3417 /* We don't care about whether the following character is a word
3418 character, or we are in a single-byte character set so we can
3419 discern by looking at the character code: allocate a
3420 256-entry transition table. */
3421 trtable = state->trtable =
3422 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3423 if (BE (trtable == NULL, 0))
3424 goto out_free;
3425
3426 /* For all characters ch...: */
3427 for (i = 0; i < BITSET_WORDS; ++i)
3428 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3429 elem;
3430 mask <<= 1, elem >>= 1, ++ch)
3431 if (BE (elem & 1, 0))
832b75ed 3432 {
ff28b140
TL
3433 /* There must be exactly one destination which accepts
3434 character ch. See group_nodes_into_DFAstates. */
3435 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3436 ;
3437
3438 /* j-th destination accepts the word character ch. */
3439 if (dfa->word_char[i] & mask)
3440 trtable[ch] = dest_states_word[j];
3441 else
3442 trtable[ch] = dest_states[j];
832b75ed 3443 }
ff28b140
TL
3444 }
3445 else
3446 {
3447 /* We care about whether the following character is a word
3448 character, and we are in a multi-byte character set: discern
3449 by looking at the character code: build two 256-entry
3450 transition tables, one starting at trtable[0] and one
3451 starting at trtable[SBC_MAX]. */
3452 trtable = state->word_trtable =
3453 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3454 if (BE (trtable == NULL, 0))
3455 goto out_free;
3456
3457 /* For all characters ch...: */
3458 for (i = 0; i < BITSET_WORDS; ++i)
3459 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3460 elem;
3461 mask <<= 1, elem >>= 1, ++ch)
3462 if (BE (elem & 1, 0))
832b75ed 3463 {
ff28b140
TL
3464 /* There must be exactly one destination which accepts
3465 character ch. See group_nodes_into_DFAstates. */
3466 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3467 ;
3468
3469 /* j-th destination accepts the word character ch. */
3470 trtable[ch] = dest_states[j];
3471 trtable[ch + SBC_MAX] = dest_states_word[j];
832b75ed 3472 }
ff28b140
TL
3473 }
3474
832b75ed
GG
3475 /* new line */
3476 if (bitset_contain (acceptable, NEWLINE_CHAR))
3477 {
3478 /* The current state accepts newline character. */
ff28b140
TL
3479 for (j = 0; j < ndests; ++j)
3480 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
832b75ed
GG
3481 {
3482 /* k-th destination accepts newline character. */
ff28b140
TL
3483 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3484 if (need_word_trtable)
3485 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
832b75ed
GG
3486 /* There must be only one destination which accepts
3487 newline. See group_nodes_into_DFAstates. */
3488 break;
3489 }
3490 }
3491
3492 if (dest_states_malloced)
ff28b140 3493 re_free (dest_states);
832b75ed
GG
3494
3495 re_node_set_free (&follows);
3496 for (i = 0; i < ndests; ++i)
3497 re_node_set_free (dests_node + i);
3498
3499 if (dests_node_malloced)
ff28b140 3500 re_free (dests_alloc);
832b75ed 3501
ff28b140 3502 return true;
832b75ed
GG
3503}
3504
3505/* Group all nodes belonging to STATE into several destinations.
3506 Then for all destinations, set the nodes belonging to the destination
3507 to DESTS_NODE[i] and set the characters accepted by the destination
3508 to DEST_CH[i]. This function return the number of destinations. */
3509
ff28b140
TL
3510static Idx
3511group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3512 re_node_set *dests_node, bitset_t *dests_ch)
832b75ed
GG
3513{
3514 reg_errcode_t err;
ff28b140
TL
3515 bool ok;
3516 Idx i, j, k;
3517 Idx ndests; /* Number of the destinations from 'state'. */
3518 bitset_t accepts; /* Characters a node can accept. */
832b75ed
GG
3519 const re_node_set *cur_nodes = &state->nodes;
3520 bitset_empty (accepts);
3521 ndests = 0;
3522
ff28b140 3523 /* For all the nodes belonging to 'state', */
832b75ed
GG
3524 for (i = 0; i < cur_nodes->nelem; ++i)
3525 {
3526 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527 re_token_type_t type = node->type;
3528 unsigned int constraint = node->constraint;
3529
3530 /* Enumerate all single byte character this node can accept. */
3531 if (type == CHARACTER)
3532 bitset_set (accepts, node->opr.c);
3533 else if (type == SIMPLE_BRACKET)
3534 {
3535 bitset_merge (accepts, node->opr.sbcset);
3536 }
3537 else if (type == OP_PERIOD)
3538 {
ff28b140
TL
3539#ifdef RE_ENABLE_I18N
3540 if (dfa->mb_cur_max > 1)
3541 bitset_merge (accepts, dfa->sb_char);
3542 else
3543#endif
3544 bitset_set_all (accepts);
3545 if (!(dfa->syntax & RE_DOT_NEWLINE))
3546 bitset_clear (accepts, '\n');
3547 if (dfa->syntax & RE_DOT_NOT_NULL)
3548 bitset_clear (accepts, '\0');
3549 }
3550#ifdef RE_ENABLE_I18N
3551 else if (type == OP_UTF8_PERIOD)
3552 {
3553 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3554 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3555 else
3556 bitset_merge (accepts, utf8_sb_map);
3557 if (!(dfa->syntax & RE_DOT_NEWLINE))
832b75ed 3558 bitset_clear (accepts, '\n');
ff28b140 3559 if (dfa->syntax & RE_DOT_NOT_NULL)
832b75ed
GG
3560 bitset_clear (accepts, '\0');
3561 }
ff28b140 3562#endif
832b75ed
GG
3563 else
3564 continue;
3565
ff28b140 3566 /* Check the 'accepts' and sift the characters which are not
832b75ed
GG
3567 match it the context. */
3568 if (constraint)
3569 {
832b75ed
GG
3570 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3571 {
ff28b140 3572 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
832b75ed
GG
3573 bitset_empty (accepts);
3574 if (accepts_newline)
3575 bitset_set (accepts, NEWLINE_CHAR);
3576 else
3577 continue;
3578 }
ff28b140
TL
3579 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3580 {
3581 bitset_empty (accepts);
3582 continue;
3583 }
3584
3585 if (constraint & NEXT_WORD_CONSTRAINT)
3586 {
3587 bitset_word_t any_set = 0;
3588 if (type == CHARACTER && !node->word_char)
3589 {
3590 bitset_empty (accepts);
3591 continue;
3592 }
3593#ifdef RE_ENABLE_I18N
3594 if (dfa->mb_cur_max > 1)
3595 for (j = 0; j < BITSET_WORDS; ++j)
3596 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3597 else
3598#endif
3599 for (j = 0; j < BITSET_WORDS; ++j)
3600 any_set |= (accepts[j] &= dfa->word_char[j]);
3601 if (!any_set)
3602 continue;
3603 }
3604 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3605 {
3606 bitset_word_t any_set = 0;
3607 if (type == CHARACTER && node->word_char)
3608 {
3609 bitset_empty (accepts);
3610 continue;
3611 }
3612#ifdef RE_ENABLE_I18N
3613 if (dfa->mb_cur_max > 1)
3614 for (j = 0; j < BITSET_WORDS; ++j)
3615 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3616 else
3617#endif
3618 for (j = 0; j < BITSET_WORDS; ++j)
3619 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3620 if (!any_set)
3621 continue;
3622 }
832b75ed
GG
3623 }
3624
ff28b140
TL
3625 /* Then divide 'accepts' into DFA states, or create a new
3626 state. Above, we make sure that accepts is not empty. */
832b75ed
GG
3627 for (j = 0; j < ndests; ++j)
3628 {
ff28b140
TL
3629 bitset_t intersec; /* Intersection sets, see below. */
3630 bitset_t remains;
832b75ed 3631 /* Flags, see below. */
ff28b140 3632 bitset_word_t has_intersec, not_subset, not_consumed;
832b75ed
GG
3633
3634 /* Optimization, skip if this state doesn't accept the character. */
3635 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3636 continue;
3637
ff28b140 3638 /* Enumerate the intersection set of this state and 'accepts'. */
832b75ed 3639 has_intersec = 0;
ff28b140 3640 for (k = 0; k < BITSET_WORDS; ++k)
832b75ed
GG
3641 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3642 /* And skip if the intersection set is empty. */
3643 if (!has_intersec)
3644 continue;
3645
ff28b140 3646 /* Then check if this state is a subset of 'accepts'. */
832b75ed 3647 not_subset = not_consumed = 0;
ff28b140 3648 for (k = 0; k < BITSET_WORDS; ++k)
832b75ed
GG
3649 {
3650 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3651 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3652 }
3653
ff28b140
TL
3654 /* If this state isn't a subset of 'accepts', create a
3655 new group state, which has the 'remains'. */
832b75ed
GG
3656 if (not_subset)
3657 {
3658 bitset_copy (dests_ch[ndests], remains);
3659 bitset_copy (dests_ch[j], intersec);
3660 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3661 if (BE (err != REG_NOERROR, 0))
3662 goto error_return;
3663 ++ndests;
3664 }
3665
3666 /* Put the position in the current group. */
ff28b140
TL
3667 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3668 if (BE (! ok, 0))
832b75ed
GG
3669 goto error_return;
3670
3671 /* If all characters are consumed, go to next node. */
3672 if (!not_consumed)
3673 break;
3674 }
3675 /* Some characters remain, create a new group. */
3676 if (j == ndests)
3677 {
3678 bitset_copy (dests_ch[ndests], accepts);
3679 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3680 if (BE (err != REG_NOERROR, 0))
3681 goto error_return;
3682 ++ndests;
3683 bitset_empty (accepts);
3684 }
3685 }
3686 return ndests;
3687 error_return:
3688 for (j = 0; j < ndests; ++j)
3689 re_node_set_free (dests_node + j);
3690 return -1;
3691}
3692
3693#ifdef RE_ENABLE_I18N
ff28b140 3694/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
832b75ed
GG
3695 Return the number of the bytes the node accepts.
3696 STR_IDX is the current index of the input string.
3697
3698 This function handles the nodes which can accept one character, or
3699 one collating element like '.', '[a-z]', opposite to the other nodes
3700 can only accept one byte. */
3701
ff28b140
TL
3702# ifdef _LIBC
3703# include <locale/weight.h>
3704# endif
3705
832b75ed 3706static int
ff28b140
TL
3707check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3708 const re_string_t *input, Idx str_idx)
832b75ed 3709{
832b75ed 3710 const re_token_t *node = dfa->nodes + node_idx;
ff28b140
TL
3711 int char_len, elem_len;
3712 Idx i;
3713
3714 if (BE (node->type == OP_UTF8_PERIOD, 0))
3715 {
3716 unsigned char c = re_string_byte_at (input, str_idx), d;
3717 if (BE (c < 0xc2, 1))
3718 return 0;
3719
3720 if (str_idx + 2 > input->len)
3721 return 0;
3722
3723 d = re_string_byte_at (input, str_idx + 1);
3724 if (c < 0xe0)
3725 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3726 else if (c < 0xf0)
3727 {
3728 char_len = 3;
3729 if (c == 0xe0 && d < 0xa0)
3730 return 0;
3731 }
3732 else if (c < 0xf8)
3733 {
3734 char_len = 4;
3735 if (c == 0xf0 && d < 0x90)
3736 return 0;
3737 }
3738 else if (c < 0xfc)
3739 {
3740 char_len = 5;
3741 if (c == 0xf8 && d < 0x88)
3742 return 0;
3743 }
3744 else if (c < 0xfe)
3745 {
3746 char_len = 6;
3747 if (c == 0xfc && d < 0x84)
3748 return 0;
3749 }
3750 else
3751 return 0;
3752
3753 if (str_idx + char_len > input->len)
3754 return 0;
3755
3756 for (i = 1; i < char_len; ++i)
3757 {
3758 d = re_string_byte_at (input, str_idx + i);
3759 if (d < 0x80 || d > 0xbf)
3760 return 0;
3761 }
3762 return char_len;
3763 }
3764
3765 char_len = re_string_char_size_at (input, str_idx);
832b75ed
GG
3766 if (node->type == OP_PERIOD)
3767 {
ff28b140
TL
3768 if (char_len <= 1)
3769 return 0;
3770 /* FIXME: I don't think this if is needed, as both '\n'
3771 and '\0' are char_len == 1. */
832b75ed 3772 /* '.' accepts any one character except the following two cases. */
ff28b140 3773 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
832b75ed 3774 re_string_byte_at (input, str_idx) == '\n') ||
ff28b140 3775 ((dfa->syntax & RE_DOT_NOT_NULL) &&
832b75ed
GG
3776 re_string_byte_at (input, str_idx) == '\0'))
3777 return 0;
3778 return char_len;
3779 }
ff28b140
TL
3780
3781 elem_len = re_string_elem_size_at (input, str_idx);
3782 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783 return 0;
3784
3785 if (node->type == COMPLEX_BRACKET)
832b75ed
GG
3786 {
3787 const re_charset_t *cset = node->opr.mbcset;
3788# ifdef _LIBC
ff28b140
TL
3789 const unsigned char *pin
3790 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3791 Idx j;
3792 uint32_t nrules;
832b75ed
GG
3793# endif /* _LIBC */
3794 int match_len = 0;
3795 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3796 ? re_string_wchar_at (input, str_idx) : 0);
3797
3798 /* match with multibyte character? */
3799 for (i = 0; i < cset->nmbchars; ++i)
3800 if (wc == cset->mbchars[i])
3801 {
3802 match_len = char_len;
3803 goto check_node_accept_bytes_match;
3804 }
3805 /* match with character_class? */
3806 for (i = 0; i < cset->nchar_classes; ++i)
3807 {
3808 wctype_t wt = cset->char_classes[i];
3809 if (__iswctype (wc, wt))
3810 {
3811 match_len = char_len;
3812 goto check_node_accept_bytes_match;
3813 }
3814 }
3815
3816# ifdef _LIBC
ff28b140 3817 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
832b75ed
GG
3818 if (nrules != 0)
3819 {
3820 unsigned int in_collseq = 0;
3821 const int32_t *table, *indirect;
3822 const unsigned char *weights, *extra;
3823 const char *collseqwc;
832b75ed
GG
3824
3825 /* match with collating_symbol? */
3826 if (cset->ncoll_syms)
3827 extra = (const unsigned char *)
3828 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3829 for (i = 0; i < cset->ncoll_syms; ++i)
3830 {
3831 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3832 /* Compare the length of input collating element and
3833 the length of current collating element. */
3834 if (*coll_sym != elem_len)
3835 continue;
3836 /* Compare each bytes. */
3837 for (j = 0; j < *coll_sym; j++)
3838 if (pin[j] != coll_sym[1 + j])
3839 break;
3840 if (j == *coll_sym)
3841 {
3842 /* Match if every bytes is equal. */
3843 match_len = j;
3844 goto check_node_accept_bytes_match;
3845 }
3846 }
3847
3848 if (cset->nranges)
3849 {
3850 if (elem_len <= char_len)
3851 {
3852 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
ff28b140 3853 in_collseq = __collseq_table_lookup (collseqwc, wc);
832b75ed
GG
3854 }
3855 else
3856 in_collseq = find_collation_sequence_value (pin, elem_len);
3857 }
3858 /* match with range expression? */
ff28b140 3859 /* FIXME: Implement rational ranges here, too. */
832b75ed
GG
3860 for (i = 0; i < cset->nranges; ++i)
3861 if (cset->range_starts[i] <= in_collseq
3862 && in_collseq <= cset->range_ends[i])
3863 {
3864 match_len = elem_len;
3865 goto check_node_accept_bytes_match;
3866 }
3867
3868 /* match with equivalence_class? */
3869 if (cset->nequiv_classes)
3870 {
3871 const unsigned char *cp = pin;
3872 table = (const int32_t *)
3873 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874 weights = (const unsigned char *)
3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876 extra = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878 indirect = (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
ff28b140
TL
3880 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3881 int32_t rule = idx >> 24;
3882 idx &= 0xffffff;
832b75ed 3883 if (idx > 0)
ff28b140
TL
3884 {
3885 size_t weight_len = weights[idx];
3886 for (i = 0; i < cset->nequiv_classes; ++i)
3887 {
3888 int32_t equiv_class_idx = cset->equiv_classes[i];
3889 int32_t equiv_class_rule = equiv_class_idx >> 24;
3890 equiv_class_idx &= 0xffffff;
3891 if (weights[equiv_class_idx] == weight_len
3892 && equiv_class_rule == rule
3893 && memcmp (weights + idx + 1,
3894 weights + equiv_class_idx + 1,
3895 weight_len) == 0)
3896 {
3897 match_len = elem_len;
3898 goto check_node_accept_bytes_match;
3899 }
3900 }
3901 }
832b75ed
GG
3902 }
3903 }
3904 else
3905# endif /* _LIBC */
3906 {
3907 /* match with range expression? */
832b75ed
GG
3908 for (i = 0; i < cset->nranges; ++i)
3909 {
ff28b140 3910 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
832b75ed
GG
3911 {
3912 match_len = char_len;
3913 goto check_node_accept_bytes_match;
3914 }
3915 }
3916 }
3917 check_node_accept_bytes_match:
3918 if (!cset->non_match)
3919 return match_len;
3920 else
3921 {
3922 if (match_len > 0)
3923 return 0;
3924 else
3925 return (elem_len > char_len) ? elem_len : char_len;
3926 }
3927 }
3928 return 0;
3929}
3930
3931# ifdef _LIBC
3932static unsigned int
ff28b140 3933find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
832b75ed
GG
3934{
3935 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3936 if (nrules == 0)
3937 {
3938 if (mbs_len == 1)
3939 {
3940 /* No valid character. Match it as a single byte character. */
3941 const unsigned char *collseq = (const unsigned char *)
3942 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3943 return collseq[mbs[0]];
3944 }
3945 return UINT_MAX;
3946 }
3947 else
3948 {
3949 int32_t idx;
3950 const unsigned char *extra = (const unsigned char *)
3951 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
ff28b140
TL
3952 int32_t extrasize = (const unsigned char *)
3953 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
832b75ed 3954
ff28b140 3955 for (idx = 0; idx < extrasize;)
832b75ed 3956 {
ff28b140
TL
3957 int mbs_cnt;
3958 bool found = false;
832b75ed
GG
3959 int32_t elem_mbs_len;
3960 /* Skip the name of collating element name. */
3961 idx = idx + extra[idx] + 1;
3962 elem_mbs_len = extra[idx++];
3963 if (mbs_len == elem_mbs_len)
3964 {
3965 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3966 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3967 break;
3968 if (mbs_cnt == elem_mbs_len)
3969 /* Found the entry. */
ff28b140 3970 found = true;
832b75ed
GG
3971 }
3972 /* Skip the byte sequence of the collating element. */
3973 idx += elem_mbs_len;
3974 /* Adjust for the alignment. */
3975 idx = (idx + 3) & ~3;
3976 /* Skip the collation sequence value. */
3977 idx += sizeof (uint32_t);
3978 /* Skip the wide char sequence of the collating element. */
ff28b140 3979 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
832b75ed
GG
3980 /* If we found the entry, return the sequence value. */
3981 if (found)
3982 return *(uint32_t *) (extra + idx);
3983 /* Skip the collation sequence value. */
3984 idx += sizeof (uint32_t);
3985 }
ff28b140 3986 return UINT_MAX;
832b75ed
GG
3987 }
3988}
3989# endif /* _LIBC */
3990#endif /* RE_ENABLE_I18N */
3991
3992/* Check whether the node accepts the byte which is IDX-th
3993 byte of the INPUT. */
3994
ff28b140
TL
3995static bool
3996check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3997 Idx idx)
832b75ed
GG
3998{
3999 unsigned char ch;
ff28b140
TL
4000 ch = re_string_byte_at (&mctx->input, idx);
4001 switch (node->type)
4002 {
4003 case CHARACTER:
4004 if (node->opr.c != ch)
4005 return false;
4006 break;
4007
4008 case SIMPLE_BRACKET:
4009 if (!bitset_contain (node->opr.sbcset, ch))
4010 return false;
4011 break;
4012
4013#ifdef RE_ENABLE_I18N
4014 case OP_UTF8_PERIOD:
4015 if (ch >= ASCII_CHARS)
4016 return false;
4017 FALLTHROUGH;
4018#endif
4019 case OP_PERIOD:
4020 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4021 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4022 return false;
4023 break;
4024
4025 default:
4026 return false;
4027 }
4028
832b75ed
GG
4029 if (node->constraint)
4030 {
4031 /* The node has constraints. Check whether the current context
4032 satisfies the constraints. */
ff28b140
TL
4033 unsigned int context = re_string_context_at (&mctx->input, idx,
4034 mctx->eflags);
832b75ed 4035 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
ff28b140 4036 return false;
832b75ed 4037 }
ff28b140
TL
4038
4039 return true;
832b75ed
GG
4040}
4041
4042/* Extend the buffers, if the buffers have run out. */
4043
4044static reg_errcode_t
ff28b140
TL
4045__attribute_warn_unused_result__
4046extend_buffers (re_match_context_t *mctx, int min_len)
832b75ed
GG
4047{
4048 reg_errcode_t ret;
ff28b140
TL
4049 re_string_t *pstr = &mctx->input;
4050
4051 /* Avoid overflow. */
4052 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4053 <= pstr->bufs_len, 0))
4054 return REG_ESPACE;
832b75ed 4055
ff28b140
TL
4056 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4057 ret = re_string_realloc_buffers (pstr,
4058 MAX (min_len,
4059 MIN (pstr->len, pstr->bufs_len * 2)));
832b75ed
GG
4060 if (BE (ret != REG_NOERROR, 0))
4061 return ret;
4062
4063 if (mctx->state_log != NULL)
4064 {
4065 /* And double the length of state_log. */
ff28b140
TL
4066 /* XXX We have no indication of the size of this buffer. If this
4067 allocation fail we have no indication that the state_log array
4068 does not have the right size. */
4069 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070 pstr->bufs_len + 1);
832b75ed
GG
4071 if (BE (new_array == NULL, 0))
4072 return REG_ESPACE;
4073 mctx->state_log = new_array;
4074 }
4075
4076 /* Then reconstruct the buffers. */
4077 if (pstr->icase)
4078 {
4079#ifdef RE_ENABLE_I18N
ff28b140
TL
4080 if (pstr->mb_cur_max > 1)
4081 {
4082 ret = build_wcs_upper_buffer (pstr);
4083 if (BE (ret != REG_NOERROR, 0))
4084 return ret;
4085 }
832b75ed
GG
4086 else
4087#endif /* RE_ENABLE_I18N */
4088 build_upper_buffer (pstr);
4089 }
4090 else
4091 {
4092#ifdef RE_ENABLE_I18N
ff28b140 4093 if (pstr->mb_cur_max > 1)
832b75ed
GG
4094 build_wcs_buffer (pstr);
4095 else
4096#endif /* RE_ENABLE_I18N */
4097 {
4098 if (pstr->trans != NULL)
4099 re_string_translate_buffer (pstr);
832b75ed
GG
4100 }
4101 }
4102 return REG_NOERROR;
4103}
4104
4105\f
4106/* Functions for matching context. */
4107
4108/* Initialize MCTX. */
4109
4110static reg_errcode_t
ff28b140
TL
4111__attribute_warn_unused_result__
4112match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
832b75ed
GG
4113{
4114 mctx->eflags = eflags;
832b75ed
GG
4115 mctx->match_last = -1;
4116 if (n > 0)
4117 {
ff28b140
TL
4118 /* Avoid overflow. */
4119 size_t max_object_size =
4120 MAX (sizeof (struct re_backref_cache_entry),
4121 sizeof (re_sub_match_top_t *));
4122 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4123 return REG_ESPACE;
4124
832b75ed
GG
4125 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128 return REG_ESPACE;
4129 }
ff28b140
TL
4130 /* Already zero-ed by the caller.
4131 else
4132 mctx->bkref_ents = NULL;
4133 mctx->nbkref_ents = 0;
4134 mctx->nsub_tops = 0; */
832b75ed
GG
4135 mctx->abkref_ents = n;
4136 mctx->max_mb_elem_len = 1;
832b75ed
GG
4137 mctx->asub_tops = n;
4138 return REG_NOERROR;
4139}
4140
4141/* Clean the entries which depend on the current input in MCTX.
4142 This function must be invoked when the matcher changes the start index
4143 of the input, or changes the input string. */
4144
4145static void
ff28b140 4146match_ctx_clean (re_match_context_t *mctx)
832b75ed 4147{
ff28b140 4148 Idx st_idx;
832b75ed
GG
4149 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4150 {
ff28b140 4151 Idx sl_idx;
832b75ed
GG
4152 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4153 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4154 {
4155 re_sub_match_last_t *last = top->lasts[sl_idx];
4156 re_free (last->path.array);
4157 re_free (last);
4158 }
4159 re_free (top->lasts);
4160 if (top->path)
4161 {
4162 re_free (top->path->array);
4163 re_free (top->path);
4164 }
ff28b140 4165 re_free (top);
832b75ed 4166 }
ff28b140
TL
4167
4168 mctx->nsub_tops = 0;
4169 mctx->nbkref_ents = 0;
4170}
4171
4172/* Free all the memory associated with MCTX. */
4173
4174static void
4175match_ctx_free (re_match_context_t *mctx)
4176{
4177 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4178 match_ctx_clean (mctx);
4179 re_free (mctx->sub_tops);
4180 re_free (mctx->bkref_ents);
832b75ed
GG
4181}
4182
4183/* Add a new backreference entry to MCTX.
4184 Note that we assume that caller never call this function with duplicate
4185 entry, and call with STR_IDX which isn't smaller than any existing entry.
4186*/
4187
4188static reg_errcode_t
ff28b140
TL
4189__attribute_warn_unused_result__
4190match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4191 Idx to)
832b75ed
GG
4192{
4193 if (mctx->nbkref_ents >= mctx->abkref_ents)
4194 {
4195 struct re_backref_cache_entry* new_entry;
4196 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4197 mctx->abkref_ents * 2);
4198 if (BE (new_entry == NULL, 0))
4199 {
4200 re_free (mctx->bkref_ents);
4201 return REG_ESPACE;
4202 }
4203 mctx->bkref_ents = new_entry;
4204 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4205 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4206 mctx->abkref_ents *= 2;
4207 }
ff28b140
TL
4208 if (mctx->nbkref_ents > 0
4209 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4210 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4211
832b75ed
GG
4212 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4213 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4214 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4215 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
ff28b140
TL
4216
4217 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4218 If bit N is clear, means that this entry won't epsilon-transition to
4219 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4220 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4221 such node.
4222
4223 A backreference does not epsilon-transition unless it is empty, so set
4224 to all zeros if FROM != TO. */
4225 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4226 = (from == to ? -1 : 0);
4227
4228 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
832b75ed
GG
4229 if (mctx->max_mb_elem_len < to - from)
4230 mctx->max_mb_elem_len = to - from;
4231 return REG_NOERROR;
4232}
4233
ff28b140
TL
4234/* Return the first entry with the same str_idx, or -1 if none is
4235 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
832b75ed 4236
ff28b140
TL
4237static Idx
4238search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
832b75ed 4239{
ff28b140
TL
4240 Idx left, right, mid, last;
4241 last = right = mctx->nbkref_ents;
832b75ed
GG
4242 for (left = 0; left < right;)
4243 {
4244 mid = (left + right) / 2;
4245 if (mctx->bkref_ents[mid].str_idx < str_idx)
4246 left = mid + 1;
4247 else
4248 right = mid;
4249 }
ff28b140
TL
4250 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4251 return left;
4252 else
4253 return -1;
832b75ed
GG
4254}
4255
4256/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257 at STR_IDX. */
4258
4259static reg_errcode_t
ff28b140
TL
4260__attribute_warn_unused_result__
4261match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
832b75ed
GG
4262{
4263#ifdef DEBUG
4264 assert (mctx->sub_tops != NULL);
4265 assert (mctx->asub_tops > 0);
4266#endif
ff28b140 4267 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
832b75ed 4268 {
ff28b140
TL
4269 Idx new_asub_tops = mctx->asub_tops * 2;
4270 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4271 re_sub_match_top_t *,
4272 new_asub_tops);
832b75ed
GG
4273 if (BE (new_array == NULL, 0))
4274 return REG_ESPACE;
4275 mctx->sub_tops = new_array;
ff28b140 4276 mctx->asub_tops = new_asub_tops;
832b75ed
GG
4277 }
4278 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
ff28b140 4279 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
832b75ed
GG
4280 return REG_ESPACE;
4281 mctx->sub_tops[mctx->nsub_tops]->node = node;
4282 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283 return REG_NOERROR;
4284}
4285
4286/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4287 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4288
4289static re_sub_match_last_t *
ff28b140 4290match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
832b75ed
GG
4291{
4292 re_sub_match_last_t *new_entry;
ff28b140 4293 if (BE (subtop->nlasts == subtop->alasts, 0))
832b75ed 4294 {
ff28b140
TL
4295 Idx new_alasts = 2 * subtop->alasts + 1;
4296 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4297 re_sub_match_last_t *,
4298 new_alasts);
832b75ed
GG
4299 if (BE (new_array == NULL, 0))
4300 return NULL;
4301 subtop->lasts = new_array;
ff28b140 4302 subtop->alasts = new_alasts;
832b75ed
GG
4303 }
4304 new_entry = calloc (1, sizeof (re_sub_match_last_t));
ff28b140
TL
4305 if (BE (new_entry != NULL, 1))
4306 {
4307 subtop->lasts[subtop->nlasts] = new_entry;
4308 new_entry->node = node;
4309 new_entry->str_idx = str_idx;
4310 ++subtop->nlasts;
4311 }
832b75ed
GG
4312 return new_entry;
4313}
4314
4315static void
ff28b140
TL
4316sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4317 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
832b75ed
GG
4318{
4319 sctx->sifted_states = sifted_sts;
4320 sctx->limited_states = limited_sts;
4321 sctx->last_node = last_node;
4322 sctx->last_str_idx = last_str_idx;
832b75ed
GG
4323 re_node_set_init_empty (&sctx->limits);
4324}