]> git.proxmox.com Git - ceph.git/blob - ceph/src/civetweb/src/third_party/duktape-1.3.0/src-separate/duk_regexp_executor.c
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / civetweb / src / third_party / duktape-1.3.0 / src-separate / duk_regexp_executor.c
1 /*
2 * Regexp executor.
3 *
4 * Safety: the Ecmascript executor should prevent user from reading and
5 * replacing regexp bytecode. Even so, the executor must validate all
6 * memory accesses etc. When an invalid access is detected (e.g. a 'save'
7 * opcode to invalid, unallocated index) it should fail with an internal
8 * error but not cause a segmentation fault.
9 *
10 * Notes:
11 *
12 * - Backtrack counts are limited to unsigned 32 bits but should
13 * technically be duk_size_t for strings longer than 4G chars.
14 * This also requires a regexp bytecode change.
15 */
16
17 #include "duk_internal.h"
18
19 #ifdef DUK_USE_REGEXP_SUPPORT
20
21 /*
22 * Helpers for UTF-8 handling
23 *
24 * For bytecode readers the duk_uint32_t and duk_int32_t types are correct
25 * because they're used for more than just codepoints.
26 */
27
28 DUK_LOCAL duk_uint32_t duk__bc_get_u32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
29 return (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
30 }
31
32 DUK_LOCAL duk_int32_t duk__bc_get_i32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
33 duk_uint32_t t;
34
35 /* signed integer encoding needed to work with UTF-8 */
36 t = (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
37 if (t & 1) {
38 return -((duk_int32_t) (t >> 1));
39 } else {
40 return (duk_int32_t) (t >> 1);
41 }
42 }
43
44 DUK_LOCAL const duk_uint8_t *duk__utf8_backtrack(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
45 const duk_uint8_t *p;
46
47 /* Note: allow backtracking from p == ptr_end */
48 p = *ptr;
49 if (p < ptr_start || p > ptr_end) {
50 goto fail;
51 }
52
53 while (count > 0) {
54 for (;;) {
55 p--;
56 if (p < ptr_start) {
57 goto fail;
58 }
59 if ((*p & 0xc0) != 0x80) {
60 /* utf-8 continuation bytes have the form 10xx xxxx */
61 break;
62 }
63 }
64 count--;
65 }
66 *ptr = p;
67 return p;
68
69 fail:
70 DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_BACKTRACK_FAILED);
71 return NULL; /* never here */
72 }
73
74 DUK_LOCAL const duk_uint8_t *duk__utf8_advance(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
75 const duk_uint8_t *p;
76
77 p = *ptr;
78 if (p < ptr_start || p >= ptr_end) {
79 goto fail;
80 }
81
82 while (count > 0) {
83 for (;;) {
84 p++;
85
86 /* Note: if encoding ends by hitting end of input, we don't check that
87 * the encoding is valid, we just assume it is.
88 */
89 if (p >= ptr_end || ((*p & 0xc0) != 0x80)) {
90 /* utf-8 continuation bytes have the form 10xx xxxx */
91 break;
92 }
93 }
94 count--;
95 }
96
97 *ptr = p;
98 return p;
99
100 fail:
101 DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_ADVANCE_FAILED);
102 return NULL; /* never here */
103 }
104
105 /*
106 * Helpers for dealing with the input string
107 */
108
109 /* Get a (possibly canonicalized) input character from current sp. The input
110 * itself is never modified, and captures always record non-canonicalized
111 * characters even in case-insensitive matching.
112 */
113 DUK_LOCAL duk_codepoint_t duk__inp_get_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp) {
114 duk_codepoint_t res = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
115 if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
116 res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
117 }
118 return res;
119 }
120
121 DUK_LOCAL const duk_uint8_t *duk__inp_backtrack(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp, duk_uint_fast32_t count) {
122 return duk__utf8_backtrack(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end, count);
123 }
124
125 /* Backtrack utf-8 input and return a (possibly canonicalized) input character. */
126 DUK_LOCAL duk_codepoint_t duk__inp_get_prev_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *sp) {
127 /* note: caller 'sp' is intentionally not updated here */
128 (void) duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) 1);
129 return duk__inp_get_cp(re_ctx, &sp);
130 }
131
132 /*
133 * Regexp recursive matching function.
134 *
135 * Returns 'sp' on successful match (points to character after last matched one),
136 * NULL otherwise.
137 *
138 * The C recursion depth limit check is only performed in this function, this
139 * suffices because the function is present in all true recursion required by
140 * regexp execution.
141 */
142
143 DUK_LOCAL const duk_uint8_t *duk__match_regexp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *pc, const duk_uint8_t *sp) {
144 if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
145 DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, DUK_STR_REGEXP_EXECUTOR_RECURSION_LIMIT);
146 }
147 re_ctx->recursion_depth++;
148
149 for (;;) {
150 duk_small_int_t op;
151
152 if (re_ctx->steps_count >= re_ctx->steps_limit) {
153 DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, DUK_STR_REGEXP_EXECUTOR_STEP_LIMIT);
154 }
155 re_ctx->steps_count++;
156
157 op = (duk_small_int_t) duk__bc_get_u32(re_ctx, &pc);
158
159 DUK_DDD(DUK_DDDPRINT("match: rec=%ld, steps=%ld, pc (after op)=%ld, sp=%ld, op=%ld",
160 (long) re_ctx->recursion_depth,
161 (long) re_ctx->steps_count,
162 (long) (pc - re_ctx->bytecode),
163 (long) (sp - re_ctx->input),
164 (long) op));
165
166 switch (op) {
167 case DUK_REOP_MATCH: {
168 goto match;
169 }
170 case DUK_REOP_CHAR: {
171 /*
172 * Byte-based matching would be possible for case-sensitive
173 * matching but not for case-insensitive matching. So, we
174 * match by decoding the input and bytecode character normally.
175 *
176 * Bytecode characters are assumed to be already canonicalized.
177 * Input characters are canonicalized automatically by
178 * duk__inp_get_cp() if necessary.
179 *
180 * There is no opcode for matching multiple characters. The
181 * regexp compiler has trouble joining strings efficiently
182 * during compilation. See doc/regexp.rst for more discussion.
183 */
184 duk_codepoint_t c1, c2;
185
186 c1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
187 DUK_ASSERT(!(re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) ||
188 c1 == duk_unicode_re_canonicalize_char(re_ctx->thr, c1)); /* canonicalized by compiler */
189 if (sp >= re_ctx->input_end) {
190 goto fail;
191 }
192 c2 = duk__inp_get_cp(re_ctx, &sp);
193 DUK_DDD(DUK_DDDPRINT("char match, c1=%ld, c2=%ld", (long) c1, (long) c2));
194 if (c1 != c2) {
195 goto fail;
196 }
197 break;
198 }
199 case DUK_REOP_PERIOD: {
200 duk_codepoint_t c;
201
202 if (sp >= re_ctx->input_end) {
203 goto fail;
204 }
205 c = duk__inp_get_cp(re_ctx, &sp);
206 if (duk_unicode_is_line_terminator(c)) {
207 /* E5 Sections 15.10.2.8, 7.3 */
208 goto fail;
209 }
210 break;
211 }
212 case DUK_REOP_RANGES:
213 case DUK_REOP_INVRANGES: {
214 duk_uint32_t n;
215 duk_codepoint_t c;
216 duk_small_int_t match;
217
218 n = duk__bc_get_u32(re_ctx, &pc);
219 if (sp >= re_ctx->input_end) {
220 goto fail;
221 }
222 c = duk__inp_get_cp(re_ctx, &sp);
223
224 match = 0;
225 while (n) {
226 duk_codepoint_t r1, r2;
227 r1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
228 r2 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
229 DUK_DDD(DUK_DDDPRINT("matching ranges/invranges, n=%ld, r1=%ld, r2=%ld, c=%ld",
230 (long) n, (long) r1, (long) r2, (long) c));
231 if (c >= r1 && c <= r2) {
232 /* Note: don't bail out early, we must read all the ranges from
233 * bytecode. Another option is to skip them efficiently after
234 * breaking out of here. Prefer smallest code.
235 */
236 match = 1;
237 }
238 n--;
239 }
240
241 if (op == DUK_REOP_RANGES) {
242 if (!match) {
243 goto fail;
244 }
245 } else {
246 DUK_ASSERT(op == DUK_REOP_INVRANGES);
247 if (match) {
248 goto fail;
249 }
250 }
251 break;
252 }
253 case DUK_REOP_ASSERT_START: {
254 duk_codepoint_t c;
255
256 if (sp <= re_ctx->input) {
257 break;
258 }
259 if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
260 goto fail;
261 }
262 c = duk__inp_get_prev_cp(re_ctx, sp);
263 if (duk_unicode_is_line_terminator(c)) {
264 /* E5 Sections 15.10.2.8, 7.3 */
265 break;
266 }
267 goto fail;
268 }
269 case DUK_REOP_ASSERT_END: {
270 duk_codepoint_t c;
271 const duk_uint8_t *tmp_sp;
272
273 if (sp >= re_ctx->input_end) {
274 break;
275 }
276 if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
277 goto fail;
278 }
279 tmp_sp = sp;
280 c = duk__inp_get_cp(re_ctx, &tmp_sp);
281 if (duk_unicode_is_line_terminator(c)) {
282 /* E5 Sections 15.10.2.8, 7.3 */
283 break;
284 }
285 goto fail;
286 }
287 case DUK_REOP_ASSERT_WORD_BOUNDARY:
288 case DUK_REOP_ASSERT_NOT_WORD_BOUNDARY: {
289 /*
290 * E5 Section 15.10.2.6. The previous and current character
291 * should -not- be canonicalized as they are now. However,
292 * canonicalization does not affect the result of IsWordChar()
293 * (which depends on Unicode characters never canonicalizing
294 * into ASCII characters) so this does not matter.
295 */
296 duk_small_int_t w1, w2;
297
298 if (sp <= re_ctx->input) {
299 w1 = 0; /* not a wordchar */
300 } else {
301 duk_codepoint_t c;
302 c = duk__inp_get_prev_cp(re_ctx, sp);
303 w1 = duk_unicode_re_is_wordchar(c);
304 }
305 if (sp >= re_ctx->input_end) {
306 w2 = 0; /* not a wordchar */
307 } else {
308 const duk_uint8_t *tmp_sp = sp; /* dummy so sp won't get updated */
309 duk_codepoint_t c;
310 c = duk__inp_get_cp(re_ctx, &tmp_sp);
311 w2 = duk_unicode_re_is_wordchar(c);
312 }
313
314 if (op == DUK_REOP_ASSERT_WORD_BOUNDARY) {
315 if (w1 == w2) {
316 goto fail;
317 }
318 } else {
319 DUK_ASSERT(op == DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
320 if (w1 != w2) {
321 goto fail;
322 }
323 }
324 break;
325 }
326 case DUK_REOP_JUMP: {
327 duk_int32_t skip;
328
329 skip = duk__bc_get_i32(re_ctx, &pc);
330 pc += skip;
331 break;
332 }
333 case DUK_REOP_SPLIT1: {
334 /* split1: prefer direct execution (no jump) */
335 const duk_uint8_t *sub_sp;
336 duk_int32_t skip;
337
338 skip = duk__bc_get_i32(re_ctx, &pc);
339 sub_sp = duk__match_regexp(re_ctx, pc, sp);
340 if (sub_sp) {
341 sp = sub_sp;
342 goto match;
343 }
344 pc += skip;
345 break;
346 }
347 case DUK_REOP_SPLIT2: {
348 /* split2: prefer jump execution (not direct) */
349 const duk_uint8_t *sub_sp;
350 duk_int32_t skip;
351
352 skip = duk__bc_get_i32(re_ctx, &pc);
353 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
354 if (sub_sp) {
355 sp = sub_sp;
356 goto match;
357 }
358 break;
359 }
360 case DUK_REOP_SQMINIMAL: {
361 duk_uint32_t q, qmin, qmax;
362 duk_int32_t skip;
363 const duk_uint8_t *sub_sp;
364
365 qmin = duk__bc_get_u32(re_ctx, &pc);
366 qmax = duk__bc_get_u32(re_ctx, &pc);
367 skip = duk__bc_get_i32(re_ctx, &pc);
368 DUK_DDD(DUK_DDDPRINT("minimal quantifier, qmin=%lu, qmax=%lu, skip=%ld",
369 (unsigned long) qmin, (unsigned long) qmax, (long) skip));
370
371 q = 0;
372 while (q <= qmax) {
373 if (q >= qmin) {
374 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
375 if (sub_sp) {
376 sp = sub_sp;
377 goto match;
378 }
379 }
380 sub_sp = duk__match_regexp(re_ctx, pc, sp);
381 if (!sub_sp) {
382 break;
383 }
384 sp = sub_sp;
385 q++;
386 }
387 goto fail;
388 }
389 case DUK_REOP_SQGREEDY: {
390 duk_uint32_t q, qmin, qmax, atomlen;
391 duk_int32_t skip;
392 const duk_uint8_t *sub_sp;
393
394 qmin = duk__bc_get_u32(re_ctx, &pc);
395 qmax = duk__bc_get_u32(re_ctx, &pc);
396 atomlen = duk__bc_get_u32(re_ctx, &pc);
397 skip = duk__bc_get_i32(re_ctx, &pc);
398 DUK_DDD(DUK_DDDPRINT("greedy quantifier, qmin=%lu, qmax=%lu, atomlen=%lu, skip=%ld",
399 (unsigned long) qmin, (unsigned long) qmax, (unsigned long) atomlen, (long) skip));
400
401 q = 0;
402 while (q < qmax) {
403 sub_sp = duk__match_regexp(re_ctx, pc, sp);
404 if (!sub_sp) {
405 break;
406 }
407 sp = sub_sp;
408 q++;
409 }
410 while (q >= qmin) {
411 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
412 if (sub_sp) {
413 sp = sub_sp;
414 goto match;
415 }
416 if (q == qmin) {
417 break;
418 }
419
420 /* Note: if atom were to contain e.g. captures, we would need to
421 * re-match the atom to get correct captures. Simply quantifiers
422 * do not allow captures in their atom now, so this is not an issue.
423 */
424
425 DUK_DDD(DUK_DDDPRINT("greedy quantifier, backtrack %ld characters (atomlen)",
426 (long) atomlen));
427 sp = duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) atomlen);
428 q--;
429 }
430 goto fail;
431 }
432 case DUK_REOP_SAVE: {
433 duk_uint32_t idx;
434 const duk_uint8_t *old;
435 const duk_uint8_t *sub_sp;
436
437 idx = duk__bc_get_u32(re_ctx, &pc);
438 if (idx >= re_ctx->nsaved) {
439 /* idx is unsigned, < 0 check is not necessary */
440 DUK_D(DUK_DPRINT("internal error, regexp save index insane: idx=%ld", (long) idx));
441 goto internal_error;
442 }
443 old = re_ctx->saved[idx];
444 re_ctx->saved[idx] = sp;
445 sub_sp = duk__match_regexp(re_ctx, pc, sp);
446 if (sub_sp) {
447 sp = sub_sp;
448 goto match;
449 }
450 re_ctx->saved[idx] = old;
451 goto fail;
452 }
453 case DUK_REOP_WIPERANGE: {
454 /* Wipe capture range and save old values for backtracking.
455 *
456 * XXX: this typically happens with a relatively small idx_count.
457 * It might be useful to handle cases where the count is small
458 * (say <= 8) by saving the values in stack instead. This would
459 * reduce memory churn and improve performance, at the cost of a
460 * slightly higher code footprint.
461 */
462 duk_uint32_t idx_start, idx_count;
463 #ifdef DUK_USE_EXPLICIT_NULL_INIT
464 duk_uint32_t idx_end, idx;
465 #endif
466 duk_uint8_t **range_save;
467 const duk_uint8_t *sub_sp;
468
469 idx_start = duk__bc_get_u32(re_ctx, &pc);
470 idx_count = duk__bc_get_u32(re_ctx, &pc);
471 DUK_DDD(DUK_DDDPRINT("wipe saved range: start=%ld, count=%ld -> [%ld,%ld] (captures [%ld,%ld])",
472 (long) idx_start, (long) idx_count,
473 (long) idx_start, (long) (idx_start + idx_count - 1),
474 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
475 if (idx_start + idx_count > re_ctx->nsaved || idx_count == 0) {
476 /* idx is unsigned, < 0 check is not necessary */
477 DUK_D(DUK_DPRINT("internal error, regexp wipe indices insane: idx_start=%ld, idx_count=%ld",
478 (long) idx_start, (long) idx_count));
479 goto internal_error;
480 }
481 DUK_ASSERT(idx_count > 0);
482
483 duk_require_stack((duk_context *) re_ctx->thr, 1);
484 range_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
485 sizeof(duk_uint8_t *) * idx_count);
486 DUK_ASSERT(range_save != NULL);
487 DUK_MEMCPY(range_save, re_ctx->saved + idx_start, sizeof(duk_uint8_t *) * idx_count);
488 #ifdef DUK_USE_EXPLICIT_NULL_INIT
489 idx_end = idx_start + idx_count;
490 for (idx = idx_start; idx < idx_end; idx++) {
491 re_ctx->saved[idx] = NULL;
492 }
493 #else
494 DUK_MEMZERO((void *) (re_ctx->saved + idx_start), sizeof(duk_uint8_t *) * idx_count);
495 #endif
496
497 sub_sp = duk__match_regexp(re_ctx, pc, sp);
498 if (sub_sp) {
499 /* match: keep wiped/resaved values */
500 DUK_DDD(DUK_DDDPRINT("match: keep wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
501 (long) idx_start, (long) (idx_start + idx_count - 1),
502 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
503 duk_pop((duk_context *) re_ctx->thr);
504 sp = sub_sp;
505 goto match;
506 }
507
508 /* fail: restore saves */
509 DUK_DDD(DUK_DDDPRINT("fail: restore wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
510 (long) idx_start, (long) (idx_start + idx_count - 1),
511 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
512 DUK_MEMCPY((void *) (re_ctx->saved + idx_start),
513 (const void *) range_save,
514 sizeof(duk_uint8_t *) * idx_count);
515 duk_pop((duk_context *) re_ctx->thr);
516 goto fail;
517 }
518 case DUK_REOP_LOOKPOS:
519 case DUK_REOP_LOOKNEG: {
520 /*
521 * Needs a save of multiple saved[] entries depending on what range
522 * may be overwritten. Because the regexp parser does no such analysis,
523 * we currently save the entire saved array here. Lookaheads are thus
524 * a bit expensive. Note that the saved array is not needed for just
525 * the lookahead sub-match, but for the matching of the entire sequel.
526 *
527 * The temporary save buffer is pushed on to the valstack to handle
528 * errors correctly. Each lookahead causes a C recursion and pushes
529 * more stuff on the value stack. If the C recursion limit is less
530 * than the value stack spare, there is no need to check the stack.
531 * We do so regardless, just in case.
532 */
533
534 duk_int32_t skip;
535 duk_uint8_t **full_save;
536 const duk_uint8_t *sub_sp;
537
538 DUK_ASSERT(re_ctx->nsaved > 0);
539
540 duk_require_stack((duk_context *) re_ctx->thr, 1);
541 full_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
542 sizeof(duk_uint8_t *) * re_ctx->nsaved);
543 DUK_ASSERT(full_save != NULL);
544 DUK_MEMCPY(full_save, re_ctx->saved, sizeof(duk_uint8_t *) * re_ctx->nsaved);
545
546 skip = duk__bc_get_i32(re_ctx, &pc);
547 sub_sp = duk__match_regexp(re_ctx, pc, sp);
548 if (op == DUK_REOP_LOOKPOS) {
549 if (!sub_sp) {
550 goto lookahead_fail;
551 }
552 } else {
553 if (sub_sp) {
554 goto lookahead_fail;
555 }
556 }
557 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
558 if (sub_sp) {
559 /* match: keep saves */
560 duk_pop((duk_context *) re_ctx->thr);
561 sp = sub_sp;
562 goto match;
563 }
564
565 /* fall through */
566
567 lookahead_fail:
568 /* fail: restore saves */
569 DUK_MEMCPY((void *) re_ctx->saved,
570 (const void *) full_save,
571 sizeof(duk_uint8_t *) * re_ctx->nsaved);
572 duk_pop((duk_context *) re_ctx->thr);
573 goto fail;
574 }
575 case DUK_REOP_BACKREFERENCE: {
576 /*
577 * Byte matching for back-references would be OK in case-
578 * sensitive matching. In case-insensitive matching we need
579 * to canonicalize characters, so back-reference matching needs
580 * to be done with codepoints instead. So, we just decode
581 * everything normally here, too.
582 *
583 * Note: back-reference index which is 0 or higher than
584 * NCapturingParens (= number of capturing parens in the
585 * -entire- regexp) is a compile time error. However, a
586 * backreference referring to a valid capture which has
587 * not matched anything always succeeds! See E5 Section
588 * 15.10.2.9, step 5, sub-step 3.
589 */
590 duk_uint32_t idx;
591 const duk_uint8_t *p;
592
593 idx = duk__bc_get_u32(re_ctx, &pc);
594 idx = idx << 1; /* backref n -> saved indices [n*2, n*2+1] */
595 if (idx < 2 || idx + 1 >= re_ctx->nsaved) {
596 /* regexp compiler should catch these */
597 DUK_D(DUK_DPRINT("internal error, backreference index insane"));
598 goto internal_error;
599 }
600 if (!re_ctx->saved[idx] || !re_ctx->saved[idx+1]) {
601 /* capture is 'undefined', always matches! */
602 DUK_DDD(DUK_DDDPRINT("backreference: saved[%ld,%ld] not complete, always match",
603 (long) idx, (long) (idx + 1)));
604 break;
605 }
606 DUK_DDD(DUK_DDDPRINT("backreference: match saved[%ld,%ld]", (long) idx, (long) (idx + 1)));
607
608 p = re_ctx->saved[idx];
609 while (p < re_ctx->saved[idx+1]) {
610 duk_codepoint_t c1, c2;
611
612 /* Note: not necessary to check p against re_ctx->input_end:
613 * the memory access is checked by duk__inp_get_cp(), while
614 * valid compiled regexps cannot write a saved[] entry
615 * which points to outside the string.
616 */
617 if (sp >= re_ctx->input_end) {
618 goto fail;
619 }
620 c1 = duk__inp_get_cp(re_ctx, &p);
621 c2 = duk__inp_get_cp(re_ctx, &sp);
622 if (c1 != c2) {
623 goto fail;
624 }
625 }
626 break;
627 }
628 default: {
629 DUK_D(DUK_DPRINT("internal error, regexp opcode error: %ld", (long) op));
630 goto internal_error;
631 }
632 }
633 }
634
635 match:
636 re_ctx->recursion_depth--;
637 return sp;
638
639 fail:
640 re_ctx->recursion_depth--;
641 return NULL;
642
643 internal_error:
644 DUK_ERROR(re_ctx->thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_INTERNAL_ERROR);
645 return NULL; /* never here */
646 }
647
648 /*
649 * Exposed matcher function which provides the semantics of RegExp.prototype.exec().
650 *
651 * RegExp.prototype.test() has the same semantics as exec() but does not return the
652 * result object (which contains the matching string and capture groups). Currently
653 * there is no separate test() helper, so a temporary result object is created and
654 * discarded if test() is needed. This is intentional, to save code space.
655 *
656 * Input stack: [ ... re_obj input ]
657 * Output stack: [ ... result ]
658 */
659
660 DUK_LOCAL void duk__regexp_match_helper(duk_hthread *thr, duk_small_int_t force_global) {
661 duk_context *ctx = (duk_context *) thr;
662 duk_re_matcher_ctx re_ctx;
663 duk_hobject *h_regexp;
664 duk_hstring *h_bytecode;
665 duk_hstring *h_input;
666 const duk_uint8_t *pc;
667 const duk_uint8_t *sp;
668 duk_small_int_t match = 0;
669 duk_small_int_t global;
670 duk_uint_fast32_t i;
671 double d;
672 duk_uint32_t char_offset;
673
674 DUK_ASSERT(thr != NULL);
675 DUK_ASSERT(ctx != NULL);
676
677 DUK_DD(DUK_DDPRINT("regexp match: regexp=%!T, input=%!T",
678 (duk_tval *) duk_get_tval(ctx, -2),
679 (duk_tval *) duk_get_tval(ctx, -1)));
680
681 /*
682 * Regexp instance check, bytecode check, input coercion.
683 *
684 * See E5 Section 15.10.6.
685 */
686
687 /* TypeError if wrong; class check, see E5 Section 15.10.6 */
688 h_regexp = duk_require_hobject_with_class(ctx, -2, DUK_HOBJECT_CLASS_REGEXP);
689 DUK_ASSERT(h_regexp != NULL);
690 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_regexp) == DUK_HOBJECT_CLASS_REGEXP);
691 DUK_UNREF(h_regexp);
692
693 duk_to_string(ctx, -1);
694 h_input = duk_get_hstring(ctx, -1);
695 DUK_ASSERT(h_input != NULL);
696
697 duk_get_prop_stridx(ctx, -2, DUK_STRIDX_INT_BYTECODE); /* [ ... re_obj input ] -> [ ... re_obj input bc ] */
698 h_bytecode = duk_require_hstring(ctx, -1); /* no regexp instance should exist without a non-configurable bytecode property */
699 DUK_ASSERT(h_bytecode != NULL);
700
701 /*
702 * Basic context initialization.
703 *
704 * Some init values are read from the bytecode header
705 * whose format is (UTF-8 codepoints):
706 *
707 * uint flags
708 * uint nsaved (even, 2n+2 where n = num captures)
709 */
710
711 /* [ ... re_obj input bc ] */
712
713 DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
714
715 re_ctx.thr = thr;
716 re_ctx.input = (duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
717 re_ctx.input_end = re_ctx.input + DUK_HSTRING_GET_BYTELEN(h_input);
718 re_ctx.bytecode = (duk_uint8_t *) DUK_HSTRING_GET_DATA(h_bytecode);
719 re_ctx.bytecode_end = re_ctx.bytecode + DUK_HSTRING_GET_BYTELEN(h_bytecode);
720 re_ctx.saved = NULL;
721 re_ctx.recursion_limit = DUK_USE_REGEXP_EXECUTOR_RECLIMIT;
722 re_ctx.steps_limit = DUK_RE_EXECUTE_STEPS_LIMIT;
723
724 /* read header */
725 pc = re_ctx.bytecode;
726 re_ctx.re_flags = duk__bc_get_u32(&re_ctx, &pc);
727 re_ctx.nsaved = duk__bc_get_u32(&re_ctx, &pc);
728 re_ctx.bytecode = pc;
729
730 DUK_ASSERT(DUK_RE_FLAG_GLOBAL < 0x10000UL); /* must fit into duk_small_int_t */
731 global = (duk_small_int_t) (force_global | (re_ctx.re_flags & DUK_RE_FLAG_GLOBAL));
732
733 DUK_ASSERT(re_ctx.nsaved >= 2);
734 DUK_ASSERT((re_ctx.nsaved % 2) == 0);
735
736 duk_push_fixed_buffer(ctx, sizeof(duk_uint8_t *) * re_ctx.nsaved);
737 re_ctx.saved = (const duk_uint8_t **) duk_get_buffer(ctx, -1, NULL);
738 DUK_ASSERT(re_ctx.saved != NULL);
739
740 /* [ ... re_obj input bc saved_buf ] */
741
742 /* buffer is automatically zeroed */
743 #ifdef DUK_USE_EXPLICIT_NULL_INIT
744 for (i = 0; i < re_ctx.nsaved; i++) {
745 re_ctx.saved[i] = (duk_uint8_t *) NULL;
746 }
747 #endif
748
749 DUK_DDD(DUK_DDDPRINT("regexp ctx initialized, flags=0x%08lx, nsaved=%ld, recursion_limit=%ld, steps_limit=%ld",
750 (unsigned long) re_ctx.re_flags, (long) re_ctx.nsaved, (long) re_ctx.recursion_limit,
751 (long) re_ctx.steps_limit));
752
753 /*
754 * Get starting character offset for match, and initialize 'sp' based on it.
755 *
756 * Note: lastIndex is non-configurable so it must be present (we check the
757 * internal class of the object above, so we know it is). User code can set
758 * its value to an arbitrary (garbage) value though; E5 requires that lastIndex
759 * be coerced to a number before using. The code below works even if the
760 * property is missing: the value will then be coerced to zero.
761 *
762 * Note: lastIndex may be outside Uint32 range even after ToInteger() coercion.
763 * For instance, ToInteger(+Infinity) = +Infinity. We track the match offset
764 * as an integer, but pre-check it to be inside the 32-bit range before the loop.
765 * If not, the check in E5 Section 15.10.6.2, step 9.a applies.
766 */
767
768 /* XXX: lastIndex handling produces a lot of asm */
769
770 /* [ ... re_obj input bc saved_buf ] */
771
772 duk_get_prop_stridx(ctx, -4, DUK_STRIDX_LAST_INDEX); /* -> [ ... re_obj input bc saved_buf lastIndex ] */
773 (void) duk_to_int(ctx, -1); /* ToInteger(lastIndex) */
774 d = duk_get_number(ctx, -1); /* integer, but may be +/- Infinite, +/- zero (not NaN, though) */
775 duk_pop(ctx);
776
777 if (global) {
778 if (d < 0.0 || d > (double) DUK_HSTRING_GET_CHARLEN(h_input)) {
779 /* match fail */
780 char_offset = 0; /* not really necessary */
781 DUK_ASSERT(match == 0);
782 goto match_over;
783 }
784 char_offset = (duk_uint32_t) d;
785 } else {
786 /* lastIndex must be ignored for non-global regexps, but get the
787 * value for (theoretical) side effects. No side effects can
788 * really occur, because lastIndex is a normal property and is
789 * always non-configurable for RegExp instances.
790 */
791 char_offset = (duk_uint32_t) 0;
792 }
793
794 sp = re_ctx.input + duk_heap_strcache_offset_char2byte(thr, h_input, char_offset);
795
796 /*
797 * Match loop.
798 *
799 * Try matching at different offsets until match found or input exhausted.
800 */
801
802 /* [ ... re_obj input bc saved_buf ] */
803
804 DUK_ASSERT(match == 0);
805
806 for (;;) {
807 /* char offset in [0, h_input->clen] (both ends inclusive), checked before entry */
808 DUK_ASSERT_DISABLE(char_offset >= 0);
809 DUK_ASSERT(char_offset <= DUK_HSTRING_GET_CHARLEN(h_input));
810
811 /* Note: ctx.steps is intentionally not reset, it applies to the entire unanchored match */
812 DUK_ASSERT(re_ctx.recursion_depth == 0);
813
814 DUK_DDD(DUK_DDDPRINT("attempt match at char offset %ld; %p [%p,%p]",
815 (long) char_offset, (void *) sp, (void *) re_ctx.input,
816 (void *) re_ctx.input_end));
817
818 /*
819 * Note:
820 *
821 * - duk__match_regexp() is required not to longjmp() in ordinary "non-match"
822 * conditions; a longjmp() will terminate the entire matching process.
823 *
824 * - Clearing saved[] is not necessary because backtracking does it
825 *
826 * - Backtracking also rewinds ctx.recursion back to zero, unless an
827 * internal/limit error occurs (which causes a longjmp())
828 *
829 * - If we supported anchored matches, we would break out here
830 * unconditionally; however, Ecmascript regexps don't have anchored
831 * matches. It might make sense to implement a fast bail-out if
832 * the regexp begins with '^' and sp is not 0: currently we'll just
833 * run through the entire input string, trivially failing the match
834 * at every non-zero offset.
835 */
836
837 if (duk__match_regexp(&re_ctx, re_ctx.bytecode, sp) != NULL) {
838 DUK_DDD(DUK_DDDPRINT("match at offset %ld", (long) char_offset));
839 match = 1;
840 break;
841 }
842
843 /* advance by one character (code point) and one char_offset */
844 char_offset++;
845 if (char_offset > DUK_HSTRING_GET_CHARLEN(h_input)) {
846 /*
847 * Note:
848 *
849 * - Intentionally attempt (empty) match at char_offset == k_input->clen
850 *
851 * - Negative char_offsets have been eliminated and char_offset is duk_uint32_t
852 * -> no need or use for a negative check
853 */
854
855 DUK_DDD(DUK_DDDPRINT("no match after trying all sp offsets"));
856 break;
857 }
858
859 /* avoid calling at end of input, will DUK_ERROR (above check suffices to avoid this) */
860 (void) duk__utf8_advance(thr, &sp, re_ctx.input, re_ctx.input_end, (duk_uint_fast32_t) 1);
861 }
862
863 match_over:
864
865 /*
866 * Matching complete, create result array or return a 'null'. Update lastIndex
867 * if necessary. See E5 Section 15.10.6.2.
868 *
869 * Because lastIndex is a character (not byte) offset, we need the character
870 * length of the match which we conveniently get as a side effect of interning
871 * the matching substring (0th index of result array).
872 *
873 * saved[0] start pointer (~ byte offset) of current match
874 * saved[1] end pointer (~ byte offset) of current match (exclusive)
875 * char_offset start character offset of current match (-> .index of result)
876 * char_end_offset end character offset (computed below)
877 */
878
879 /* [ ... re_obj input bc saved_buf ] */
880
881 if (match) {
882 #ifdef DUK_USE_ASSERTIONS
883 duk_hobject *h_res;
884 #endif
885 duk_uint32_t char_end_offset = 0;
886
887 DUK_DDD(DUK_DDDPRINT("regexp matches at char_offset %ld", (long) char_offset));
888
889 DUK_ASSERT(re_ctx.nsaved >= 2); /* must have start and end */
890 DUK_ASSERT((re_ctx.nsaved % 2) == 0); /* and even number */
891
892 /* XXX: Array size is known before and (2 * re_ctx.nsaved) but not taken
893 * advantage of now. The array is not compacted either, as regexp match
894 * objects are usually short lived.
895 */
896
897 duk_push_array(ctx);
898
899 #ifdef DUK_USE_ASSERTIONS
900 h_res = duk_require_hobject(ctx, -1);
901 DUK_ASSERT(DUK_HOBJECT_HAS_EXTENSIBLE(h_res));
902 DUK_ASSERT(DUK_HOBJECT_HAS_EXOTIC_ARRAY(h_res));
903 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_res) == DUK_HOBJECT_CLASS_ARRAY);
904 #endif
905
906 /* [ ... re_obj input bc saved_buf res_obj ] */
907
908 duk_push_u32(ctx, char_offset);
909 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INDEX);
910
911 duk_dup(ctx, -4);
912 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INPUT);
913
914 for (i = 0; i < re_ctx.nsaved; i += 2) {
915 /* Captures which are undefined have NULL pointers and are returned
916 * as 'undefined'. The same is done when saved[] pointers are insane
917 * (this should, of course, never happen in practice).
918 */
919 if (re_ctx.saved[i] && re_ctx.saved[i+1] && re_ctx.saved[i+1] >= re_ctx.saved[i]) {
920 duk_hstring *h_saved;
921
922 duk_push_lstring(ctx,
923 (char *) re_ctx.saved[i],
924 (duk_size_t) (re_ctx.saved[i+1] - re_ctx.saved[i]));
925 h_saved = duk_get_hstring(ctx, -1);
926 DUK_ASSERT(h_saved != NULL);
927
928 if (i == 0) {
929 /* Assumes that saved[0] and saved[1] are always
930 * set by regexp bytecode (if not, char_end_offset
931 * will be zero). Also assumes clen reflects the
932 * correct char length.
933 */
934 char_end_offset = char_offset + DUK_HSTRING_GET_CHARLEN(h_saved);
935 }
936 } else {
937 duk_push_undefined(ctx);
938 }
939
940 /* [ ... re_obj input bc saved_buf res_obj val ] */
941 duk_put_prop_index(ctx, -2, i / 2);
942 }
943
944 /* [ ... re_obj input bc saved_buf res_obj ] */
945
946 /* NB: 'length' property is automatically updated by the array setup loop */
947
948 if (global) {
949 /* global regexp: lastIndex updated on match */
950 duk_push_u32(ctx, char_end_offset);
951 duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
952 } else {
953 /* non-global regexp: lastIndex never updated on match */
954 ;
955 }
956 } else {
957 /*
958 * No match, E5 Section 15.10.6.2, step 9.a.i - 9.a.ii apply, regardless
959 * of 'global' flag of the RegExp. In particular, if lastIndex is invalid
960 * initially, it is reset to zero.
961 */
962
963 DUK_DDD(DUK_DDDPRINT("regexp does not match"));
964
965 duk_push_null(ctx);
966
967 /* [ ... re_obj input bc saved_buf res_obj ] */
968
969 duk_push_int(ctx, 0);
970 duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
971 }
972
973 /* [ ... re_obj input bc saved_buf res_obj ] */
974
975 duk_insert(ctx, -5);
976
977 /* [ ... res_obj re_obj input bc saved_buf ] */
978
979 duk_pop_n(ctx, 4);
980
981 /* [ ... res_obj ] */
982
983 /* XXX: these last tricks are unnecessary if the function is made
984 * a genuine native function.
985 */
986 }
987
988 DUK_INTERNAL void duk_regexp_match(duk_hthread *thr) {
989 duk__regexp_match_helper(thr, 0 /*force_global*/);
990 }
991
992 /* This variant is needed by String.prototype.split(); it needs to perform
993 * global-style matching on a cloned RegExp which is potentially non-global.
994 */
995 DUK_INTERNAL void duk_regexp_match_force_global(duk_hthread *thr) {
996 duk__regexp_match_helper(thr, 1 /*force_global*/);
997 }
998
999 #else /* DUK_USE_REGEXP_SUPPORT */
1000
1001 /* regexp support disabled */
1002
1003 #endif /* DUK_USE_REGEXP_SUPPORT */