4 * Safety: the Ecmascript executor should prevent user from reading and
5 * replacing regexp bytecode. Even so, the executor must validate all
6 * memory accesses etc. When an invalid access is detected (e.g. a 'save'
7 * opcode to invalid, unallocated index) it should fail with an internal
8 * error but not cause a segmentation fault.
12 * - Backtrack counts are limited to unsigned 32 bits but should
13 * technically be duk_size_t for strings longer than 4G chars.
14 * This also requires a regexp bytecode change.
17 #include "duk_internal.h"
19 #ifdef DUK_USE_REGEXP_SUPPORT
22 * Helpers for UTF-8 handling
24 * For bytecode readers the duk_uint32_t and duk_int32_t types are correct
25 * because they're used for more than just codepoints.
28 DUK_LOCAL duk_uint32_t
duk__bc_get_u32(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
**pc
) {
29 return (duk_uint32_t
) duk_unicode_decode_xutf8_checked(re_ctx
->thr
, pc
, re_ctx
->bytecode
, re_ctx
->bytecode_end
);
32 DUK_LOCAL duk_int32_t
duk__bc_get_i32(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
**pc
) {
35 /* signed integer encoding needed to work with UTF-8 */
36 t
= (duk_uint32_t
) duk_unicode_decode_xutf8_checked(re_ctx
->thr
, pc
, re_ctx
->bytecode
, re_ctx
->bytecode_end
);
38 return -((duk_int32_t
) (t
>> 1));
40 return (duk_int32_t
) (t
>> 1);
44 DUK_LOCAL
const duk_uint8_t
*duk__utf8_backtrack(duk_hthread
*thr
, const duk_uint8_t
**ptr
, const duk_uint8_t
*ptr_start
, const duk_uint8_t
*ptr_end
, duk_uint_fast32_t count
) {
47 /* Note: allow backtracking from p == ptr_end */
49 if (p
< ptr_start
|| p
> ptr_end
) {
59 if ((*p
& 0xc0) != 0x80) {
60 /* utf-8 continuation bytes have the form 10xx xxxx */
70 DUK_ERROR(thr
, DUK_ERR_INTERNAL_ERROR
, DUK_STR_REGEXP_BACKTRACK_FAILED
);
71 return NULL
; /* never here */
74 DUK_LOCAL
const duk_uint8_t
*duk__utf8_advance(duk_hthread
*thr
, const duk_uint8_t
**ptr
, const duk_uint8_t
*ptr_start
, const duk_uint8_t
*ptr_end
, duk_uint_fast32_t count
) {
78 if (p
< ptr_start
|| p
>= ptr_end
) {
86 /* Note: if encoding ends by hitting end of input, we don't check that
87 * the encoding is valid, we just assume it is.
89 if (p
>= ptr_end
|| ((*p
& 0xc0) != 0x80)) {
90 /* utf-8 continuation bytes have the form 10xx xxxx */
101 DUK_ERROR(thr
, DUK_ERR_INTERNAL_ERROR
, DUK_STR_REGEXP_ADVANCE_FAILED
);
102 return NULL
; /* never here */
106 * Helpers for dealing with the input string
109 /* Get a (possibly canonicalized) input character from current sp. The input
110 * itself is never modified, and captures always record non-canonicalized
111 * characters even in case-insensitive matching.
113 DUK_LOCAL duk_codepoint_t
duk__inp_get_cp(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
**sp
) {
114 duk_codepoint_t res
= (duk_codepoint_t
) duk_unicode_decode_xutf8_checked(re_ctx
->thr
, sp
, re_ctx
->input
, re_ctx
->input_end
);
115 if (re_ctx
->re_flags
& DUK_RE_FLAG_IGNORE_CASE
) {
116 res
= duk_unicode_re_canonicalize_char(re_ctx
->thr
, res
);
121 DUK_LOCAL
const duk_uint8_t
*duk__inp_backtrack(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
**sp
, duk_uint_fast32_t count
) {
122 return duk__utf8_backtrack(re_ctx
->thr
, sp
, re_ctx
->input
, re_ctx
->input_end
, count
);
125 /* Backtrack utf-8 input and return a (possibly canonicalized) input character. */
126 DUK_LOCAL duk_codepoint_t
duk__inp_get_prev_cp(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
*sp
) {
127 /* note: caller 'sp' is intentionally not updated here */
128 (void) duk__inp_backtrack(re_ctx
, &sp
, (duk_uint_fast32_t
) 1);
129 return duk__inp_get_cp(re_ctx
, &sp
);
133 * Regexp recursive matching function.
135 * Returns 'sp' on successful match (points to character after last matched one),
138 * The C recursion depth limit check is only performed in this function, this
139 * suffices because the function is present in all true recursion required by
143 DUK_LOCAL
const duk_uint8_t
*duk__match_regexp(duk_re_matcher_ctx
*re_ctx
, const duk_uint8_t
*pc
, const duk_uint8_t
*sp
) {
144 if (re_ctx
->recursion_depth
>= re_ctx
->recursion_limit
) {
145 DUK_ERROR(re_ctx
->thr
, DUK_ERR_RANGE_ERROR
, DUK_STR_REGEXP_EXECUTOR_RECURSION_LIMIT
);
147 re_ctx
->recursion_depth
++;
152 if (re_ctx
->steps_count
>= re_ctx
->steps_limit
) {
153 DUK_ERROR(re_ctx
->thr
, DUK_ERR_RANGE_ERROR
, DUK_STR_REGEXP_EXECUTOR_STEP_LIMIT
);
155 re_ctx
->steps_count
++;
157 op
= (duk_small_int_t
) duk__bc_get_u32(re_ctx
, &pc
);
159 DUK_DDD(DUK_DDDPRINT("match: rec=%ld, steps=%ld, pc (after op)=%ld, sp=%ld, op=%ld",
160 (long) re_ctx
->recursion_depth
,
161 (long) re_ctx
->steps_count
,
162 (long) (pc
- re_ctx
->bytecode
),
163 (long) (sp
- re_ctx
->input
),
167 case DUK_REOP_MATCH
: {
170 case DUK_REOP_CHAR
: {
172 * Byte-based matching would be possible for case-sensitive
173 * matching but not for case-insensitive matching. So, we
174 * match by decoding the input and bytecode character normally.
176 * Bytecode characters are assumed to be already canonicalized.
177 * Input characters are canonicalized automatically by
178 * duk__inp_get_cp() if necessary.
180 * There is no opcode for matching multiple characters. The
181 * regexp compiler has trouble joining strings efficiently
182 * during compilation. See doc/regexp.rst for more discussion.
184 duk_codepoint_t c1
, c2
;
186 c1
= (duk_codepoint_t
) duk__bc_get_u32(re_ctx
, &pc
);
187 DUK_ASSERT(!(re_ctx
->re_flags
& DUK_RE_FLAG_IGNORE_CASE
) ||
188 c1
== duk_unicode_re_canonicalize_char(re_ctx
->thr
, c1
)); /* canonicalized by compiler */
189 if (sp
>= re_ctx
->input_end
) {
192 c2
= duk__inp_get_cp(re_ctx
, &sp
);
193 DUK_DDD(DUK_DDDPRINT("char match, c1=%ld, c2=%ld", (long) c1
, (long) c2
));
199 case DUK_REOP_PERIOD
: {
202 if (sp
>= re_ctx
->input_end
) {
205 c
= duk__inp_get_cp(re_ctx
, &sp
);
206 if (duk_unicode_is_line_terminator(c
)) {
207 /* E5 Sections 15.10.2.8, 7.3 */
212 case DUK_REOP_RANGES
:
213 case DUK_REOP_INVRANGES
: {
216 duk_small_int_t match
;
218 n
= duk__bc_get_u32(re_ctx
, &pc
);
219 if (sp
>= re_ctx
->input_end
) {
222 c
= duk__inp_get_cp(re_ctx
, &sp
);
226 duk_codepoint_t r1
, r2
;
227 r1
= (duk_codepoint_t
) duk__bc_get_u32(re_ctx
, &pc
);
228 r2
= (duk_codepoint_t
) duk__bc_get_u32(re_ctx
, &pc
);
229 DUK_DDD(DUK_DDDPRINT("matching ranges/invranges, n=%ld, r1=%ld, r2=%ld, c=%ld",
230 (long) n
, (long) r1
, (long) r2
, (long) c
));
231 if (c
>= r1
&& c
<= r2
) {
232 /* Note: don't bail out early, we must read all the ranges from
233 * bytecode. Another option is to skip them efficiently after
234 * breaking out of here. Prefer smallest code.
241 if (op
== DUK_REOP_RANGES
) {
246 DUK_ASSERT(op
== DUK_REOP_INVRANGES
);
253 case DUK_REOP_ASSERT_START
: {
256 if (sp
<= re_ctx
->input
) {
259 if (!(re_ctx
->re_flags
& DUK_RE_FLAG_MULTILINE
)) {
262 c
= duk__inp_get_prev_cp(re_ctx
, sp
);
263 if (duk_unicode_is_line_terminator(c
)) {
264 /* E5 Sections 15.10.2.8, 7.3 */
269 case DUK_REOP_ASSERT_END
: {
271 const duk_uint8_t
*tmp_sp
;
273 if (sp
>= re_ctx
->input_end
) {
276 if (!(re_ctx
->re_flags
& DUK_RE_FLAG_MULTILINE
)) {
280 c
= duk__inp_get_cp(re_ctx
, &tmp_sp
);
281 if (duk_unicode_is_line_terminator(c
)) {
282 /* E5 Sections 15.10.2.8, 7.3 */
287 case DUK_REOP_ASSERT_WORD_BOUNDARY
:
288 case DUK_REOP_ASSERT_NOT_WORD_BOUNDARY
: {
290 * E5 Section 15.10.2.6. The previous and current character
291 * should -not- be canonicalized as they are now. However,
292 * canonicalization does not affect the result of IsWordChar()
293 * (which depends on Unicode characters never canonicalizing
294 * into ASCII characters) so this does not matter.
296 duk_small_int_t w1
, w2
;
298 if (sp
<= re_ctx
->input
) {
299 w1
= 0; /* not a wordchar */
302 c
= duk__inp_get_prev_cp(re_ctx
, sp
);
303 w1
= duk_unicode_re_is_wordchar(c
);
305 if (sp
>= re_ctx
->input_end
) {
306 w2
= 0; /* not a wordchar */
308 const duk_uint8_t
*tmp_sp
= sp
; /* dummy so sp won't get updated */
310 c
= duk__inp_get_cp(re_ctx
, &tmp_sp
);
311 w2
= duk_unicode_re_is_wordchar(c
);
314 if (op
== DUK_REOP_ASSERT_WORD_BOUNDARY
) {
319 DUK_ASSERT(op
== DUK_REOP_ASSERT_NOT_WORD_BOUNDARY
);
326 case DUK_REOP_JUMP
: {
329 skip
= duk__bc_get_i32(re_ctx
, &pc
);
333 case DUK_REOP_SPLIT1
: {
334 /* split1: prefer direct execution (no jump) */
335 const duk_uint8_t
*sub_sp
;
338 skip
= duk__bc_get_i32(re_ctx
, &pc
);
339 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
347 case DUK_REOP_SPLIT2
: {
348 /* split2: prefer jump execution (not direct) */
349 const duk_uint8_t
*sub_sp
;
352 skip
= duk__bc_get_i32(re_ctx
, &pc
);
353 sub_sp
= duk__match_regexp(re_ctx
, pc
+ skip
, sp
);
360 case DUK_REOP_SQMINIMAL
: {
361 duk_uint32_t q
, qmin
, qmax
;
363 const duk_uint8_t
*sub_sp
;
365 qmin
= duk__bc_get_u32(re_ctx
, &pc
);
366 qmax
= duk__bc_get_u32(re_ctx
, &pc
);
367 skip
= duk__bc_get_i32(re_ctx
, &pc
);
368 DUK_DDD(DUK_DDDPRINT("minimal quantifier, qmin=%lu, qmax=%lu, skip=%ld",
369 (unsigned long) qmin
, (unsigned long) qmax
, (long) skip
));
374 sub_sp
= duk__match_regexp(re_ctx
, pc
+ skip
, sp
);
380 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
389 case DUK_REOP_SQGREEDY
: {
390 duk_uint32_t q
, qmin
, qmax
, atomlen
;
392 const duk_uint8_t
*sub_sp
;
394 qmin
= duk__bc_get_u32(re_ctx
, &pc
);
395 qmax
= duk__bc_get_u32(re_ctx
, &pc
);
396 atomlen
= duk__bc_get_u32(re_ctx
, &pc
);
397 skip
= duk__bc_get_i32(re_ctx
, &pc
);
398 DUK_DDD(DUK_DDDPRINT("greedy quantifier, qmin=%lu, qmax=%lu, atomlen=%lu, skip=%ld",
399 (unsigned long) qmin
, (unsigned long) qmax
, (unsigned long) atomlen
, (long) skip
));
403 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
411 sub_sp
= duk__match_regexp(re_ctx
, pc
+ skip
, sp
);
420 /* Note: if atom were to contain e.g. captures, we would need to
421 * re-match the atom to get correct captures. Simply quantifiers
422 * do not allow captures in their atom now, so this is not an issue.
425 DUK_DDD(DUK_DDDPRINT("greedy quantifier, backtrack %ld characters (atomlen)",
427 sp
= duk__inp_backtrack(re_ctx
, &sp
, (duk_uint_fast32_t
) atomlen
);
432 case DUK_REOP_SAVE
: {
434 const duk_uint8_t
*old
;
435 const duk_uint8_t
*sub_sp
;
437 idx
= duk__bc_get_u32(re_ctx
, &pc
);
438 if (idx
>= re_ctx
->nsaved
) {
439 /* idx is unsigned, < 0 check is not necessary */
440 DUK_D(DUK_DPRINT("internal error, regexp save index insane: idx=%ld", (long) idx
));
443 old
= re_ctx
->saved
[idx
];
444 re_ctx
->saved
[idx
] = sp
;
445 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
450 re_ctx
->saved
[idx
] = old
;
453 case DUK_REOP_WIPERANGE
: {
454 /* Wipe capture range and save old values for backtracking.
456 * XXX: this typically happens with a relatively small idx_count.
457 * It might be useful to handle cases where the count is small
458 * (say <= 8) by saving the values in stack instead. This would
459 * reduce memory churn and improve performance, at the cost of a
460 * slightly higher code footprint.
462 duk_uint32_t idx_start
, idx_count
;
463 #ifdef DUK_USE_EXPLICIT_NULL_INIT
464 duk_uint32_t idx_end
, idx
;
466 duk_uint8_t
**range_save
;
467 const duk_uint8_t
*sub_sp
;
469 idx_start
= duk__bc_get_u32(re_ctx
, &pc
);
470 idx_count
= duk__bc_get_u32(re_ctx
, &pc
);
471 DUK_DDD(DUK_DDDPRINT("wipe saved range: start=%ld, count=%ld -> [%ld,%ld] (captures [%ld,%ld])",
472 (long) idx_start
, (long) idx_count
,
473 (long) idx_start
, (long) (idx_start
+ idx_count
- 1),
474 (long) (idx_start
/ 2), (long) ((idx_start
+ idx_count
- 1) / 2)));
475 if (idx_start
+ idx_count
> re_ctx
->nsaved
|| idx_count
== 0) {
476 /* idx is unsigned, < 0 check is not necessary */
477 DUK_D(DUK_DPRINT("internal error, regexp wipe indices insane: idx_start=%ld, idx_count=%ld",
478 (long) idx_start
, (long) idx_count
));
481 DUK_ASSERT(idx_count
> 0);
483 duk_require_stack((duk_context
*) re_ctx
->thr
, 1);
484 range_save
= (duk_uint8_t
**) duk_push_fixed_buffer((duk_context
*) re_ctx
->thr
,
485 sizeof(duk_uint8_t
*) * idx_count
);
486 DUK_ASSERT(range_save
!= NULL
);
487 DUK_MEMCPY(range_save
, re_ctx
->saved
+ idx_start
, sizeof(duk_uint8_t
*) * idx_count
);
488 #ifdef DUK_USE_EXPLICIT_NULL_INIT
489 idx_end
= idx_start
+ idx_count
;
490 for (idx
= idx_start
; idx
< idx_end
; idx
++) {
491 re_ctx
->saved
[idx
] = NULL
;
494 DUK_MEMZERO((void *) (re_ctx
->saved
+ idx_start
), sizeof(duk_uint8_t
*) * idx_count
);
497 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
499 /* match: keep wiped/resaved values */
500 DUK_DDD(DUK_DDDPRINT("match: keep wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
501 (long) idx_start
, (long) (idx_start
+ idx_count
- 1),
502 (long) (idx_start
/ 2), (long) ((idx_start
+ idx_count
- 1) / 2)));
503 duk_pop((duk_context
*) re_ctx
->thr
);
508 /* fail: restore saves */
509 DUK_DDD(DUK_DDDPRINT("fail: restore wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
510 (long) idx_start
, (long) (idx_start
+ idx_count
- 1),
511 (long) (idx_start
/ 2), (long) ((idx_start
+ idx_count
- 1) / 2)));
512 DUK_MEMCPY((void *) (re_ctx
->saved
+ idx_start
),
513 (const void *) range_save
,
514 sizeof(duk_uint8_t
*) * idx_count
);
515 duk_pop((duk_context
*) re_ctx
->thr
);
518 case DUK_REOP_LOOKPOS
:
519 case DUK_REOP_LOOKNEG
: {
521 * Needs a save of multiple saved[] entries depending on what range
522 * may be overwritten. Because the regexp parser does no such analysis,
523 * we currently save the entire saved array here. Lookaheads are thus
524 * a bit expensive. Note that the saved array is not needed for just
525 * the lookahead sub-match, but for the matching of the entire sequel.
527 * The temporary save buffer is pushed on to the valstack to handle
528 * errors correctly. Each lookahead causes a C recursion and pushes
529 * more stuff on the value stack. If the C recursion limit is less
530 * than the value stack spare, there is no need to check the stack.
531 * We do so regardless, just in case.
535 duk_uint8_t
**full_save
;
536 const duk_uint8_t
*sub_sp
;
538 DUK_ASSERT(re_ctx
->nsaved
> 0);
540 duk_require_stack((duk_context
*) re_ctx
->thr
, 1);
541 full_save
= (duk_uint8_t
**) duk_push_fixed_buffer((duk_context
*) re_ctx
->thr
,
542 sizeof(duk_uint8_t
*) * re_ctx
->nsaved
);
543 DUK_ASSERT(full_save
!= NULL
);
544 DUK_MEMCPY(full_save
, re_ctx
->saved
, sizeof(duk_uint8_t
*) * re_ctx
->nsaved
);
546 skip
= duk__bc_get_i32(re_ctx
, &pc
);
547 sub_sp
= duk__match_regexp(re_ctx
, pc
, sp
);
548 if (op
== DUK_REOP_LOOKPOS
) {
557 sub_sp
= duk__match_regexp(re_ctx
, pc
+ skip
, sp
);
559 /* match: keep saves */
560 duk_pop((duk_context
*) re_ctx
->thr
);
568 /* fail: restore saves */
569 DUK_MEMCPY((void *) re_ctx
->saved
,
570 (const void *) full_save
,
571 sizeof(duk_uint8_t
*) * re_ctx
->nsaved
);
572 duk_pop((duk_context
*) re_ctx
->thr
);
575 case DUK_REOP_BACKREFERENCE
: {
577 * Byte matching for back-references would be OK in case-
578 * sensitive matching. In case-insensitive matching we need
579 * to canonicalize characters, so back-reference matching needs
580 * to be done with codepoints instead. So, we just decode
581 * everything normally here, too.
583 * Note: back-reference index which is 0 or higher than
584 * NCapturingParens (= number of capturing parens in the
585 * -entire- regexp) is a compile time error. However, a
586 * backreference referring to a valid capture which has
587 * not matched anything always succeeds! See E5 Section
588 * 15.10.2.9, step 5, sub-step 3.
591 const duk_uint8_t
*p
;
593 idx
= duk__bc_get_u32(re_ctx
, &pc
);
594 idx
= idx
<< 1; /* backref n -> saved indices [n*2, n*2+1] */
595 if (idx
< 2 || idx
+ 1 >= re_ctx
->nsaved
) {
596 /* regexp compiler should catch these */
597 DUK_D(DUK_DPRINT("internal error, backreference index insane"));
600 if (!re_ctx
->saved
[idx
] || !re_ctx
->saved
[idx
+1]) {
601 /* capture is 'undefined', always matches! */
602 DUK_DDD(DUK_DDDPRINT("backreference: saved[%ld,%ld] not complete, always match",
603 (long) idx
, (long) (idx
+ 1)));
606 DUK_DDD(DUK_DDDPRINT("backreference: match saved[%ld,%ld]", (long) idx
, (long) (idx
+ 1)));
608 p
= re_ctx
->saved
[idx
];
609 while (p
< re_ctx
->saved
[idx
+1]) {
610 duk_codepoint_t c1
, c2
;
612 /* Note: not necessary to check p against re_ctx->input_end:
613 * the memory access is checked by duk__inp_get_cp(), while
614 * valid compiled regexps cannot write a saved[] entry
615 * which points to outside the string.
617 if (sp
>= re_ctx
->input_end
) {
620 c1
= duk__inp_get_cp(re_ctx
, &p
);
621 c2
= duk__inp_get_cp(re_ctx
, &sp
);
629 DUK_D(DUK_DPRINT("internal error, regexp opcode error: %ld", (long) op
));
636 re_ctx
->recursion_depth
--;
640 re_ctx
->recursion_depth
--;
644 DUK_ERROR(re_ctx
->thr
, DUK_ERR_INTERNAL_ERROR
, DUK_STR_REGEXP_INTERNAL_ERROR
);
645 return NULL
; /* never here */
649 * Exposed matcher function which provides the semantics of RegExp.prototype.exec().
651 * RegExp.prototype.test() has the same semantics as exec() but does not return the
652 * result object (which contains the matching string and capture groups). Currently
653 * there is no separate test() helper, so a temporary result object is created and
654 * discarded if test() is needed. This is intentional, to save code space.
656 * Input stack: [ ... re_obj input ]
657 * Output stack: [ ... result ]
660 DUK_LOCAL
void duk__regexp_match_helper(duk_hthread
*thr
, duk_small_int_t force_global
) {
661 duk_context
*ctx
= (duk_context
*) thr
;
662 duk_re_matcher_ctx re_ctx
;
663 duk_hobject
*h_regexp
;
664 duk_hstring
*h_bytecode
;
665 duk_hstring
*h_input
;
666 const duk_uint8_t
*pc
;
667 const duk_uint8_t
*sp
;
668 duk_small_int_t match
= 0;
669 duk_small_int_t global
;
672 duk_uint32_t char_offset
;
674 DUK_ASSERT(thr
!= NULL
);
675 DUK_ASSERT(ctx
!= NULL
);
677 DUK_DD(DUK_DDPRINT("regexp match: regexp=%!T, input=%!T",
678 (duk_tval
*) duk_get_tval(ctx
, -2),
679 (duk_tval
*) duk_get_tval(ctx
, -1)));
682 * Regexp instance check, bytecode check, input coercion.
684 * See E5 Section 15.10.6.
687 /* TypeError if wrong; class check, see E5 Section 15.10.6 */
688 h_regexp
= duk_require_hobject_with_class(ctx
, -2, DUK_HOBJECT_CLASS_REGEXP
);
689 DUK_ASSERT(h_regexp
!= NULL
);
690 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_regexp
) == DUK_HOBJECT_CLASS_REGEXP
);
693 duk_to_string(ctx
, -1);
694 h_input
= duk_get_hstring(ctx
, -1);
695 DUK_ASSERT(h_input
!= NULL
);
697 duk_get_prop_stridx(ctx
, -2, DUK_STRIDX_INT_BYTECODE
); /* [ ... re_obj input ] -> [ ... re_obj input bc ] */
698 h_bytecode
= duk_require_hstring(ctx
, -1); /* no regexp instance should exist without a non-configurable bytecode property */
699 DUK_ASSERT(h_bytecode
!= NULL
);
702 * Basic context initialization.
704 * Some init values are read from the bytecode header
705 * whose format is (UTF-8 codepoints):
708 * uint nsaved (even, 2n+2 where n = num captures)
711 /* [ ... re_obj input bc ] */
713 DUK_MEMZERO(&re_ctx
, sizeof(re_ctx
));
716 re_ctx
.input
= (duk_uint8_t
*) DUK_HSTRING_GET_DATA(h_input
);
717 re_ctx
.input_end
= re_ctx
.input
+ DUK_HSTRING_GET_BYTELEN(h_input
);
718 re_ctx
.bytecode
= (duk_uint8_t
*) DUK_HSTRING_GET_DATA(h_bytecode
);
719 re_ctx
.bytecode_end
= re_ctx
.bytecode
+ DUK_HSTRING_GET_BYTELEN(h_bytecode
);
721 re_ctx
.recursion_limit
= DUK_USE_REGEXP_EXECUTOR_RECLIMIT
;
722 re_ctx
.steps_limit
= DUK_RE_EXECUTE_STEPS_LIMIT
;
725 pc
= re_ctx
.bytecode
;
726 re_ctx
.re_flags
= duk__bc_get_u32(&re_ctx
, &pc
);
727 re_ctx
.nsaved
= duk__bc_get_u32(&re_ctx
, &pc
);
728 re_ctx
.bytecode
= pc
;
730 DUK_ASSERT(DUK_RE_FLAG_GLOBAL
< 0x10000UL
); /* must fit into duk_small_int_t */
731 global
= (duk_small_int_t
) (force_global
| (re_ctx
.re_flags
& DUK_RE_FLAG_GLOBAL
));
733 DUK_ASSERT(re_ctx
.nsaved
>= 2);
734 DUK_ASSERT((re_ctx
.nsaved
% 2) == 0);
736 duk_push_fixed_buffer(ctx
, sizeof(duk_uint8_t
*) * re_ctx
.nsaved
);
737 re_ctx
.saved
= (const duk_uint8_t
**) duk_get_buffer(ctx
, -1, NULL
);
738 DUK_ASSERT(re_ctx
.saved
!= NULL
);
740 /* [ ... re_obj input bc saved_buf ] */
742 /* buffer is automatically zeroed */
743 #ifdef DUK_USE_EXPLICIT_NULL_INIT
744 for (i
= 0; i
< re_ctx
.nsaved
; i
++) {
745 re_ctx
.saved
[i
] = (duk_uint8_t
*) NULL
;
749 DUK_DDD(DUK_DDDPRINT("regexp ctx initialized, flags=0x%08lx, nsaved=%ld, recursion_limit=%ld, steps_limit=%ld",
750 (unsigned long) re_ctx
.re_flags
, (long) re_ctx
.nsaved
, (long) re_ctx
.recursion_limit
,
751 (long) re_ctx
.steps_limit
));
754 * Get starting character offset for match, and initialize 'sp' based on it.
756 * Note: lastIndex is non-configurable so it must be present (we check the
757 * internal class of the object above, so we know it is). User code can set
758 * its value to an arbitrary (garbage) value though; E5 requires that lastIndex
759 * be coerced to a number before using. The code below works even if the
760 * property is missing: the value will then be coerced to zero.
762 * Note: lastIndex may be outside Uint32 range even after ToInteger() coercion.
763 * For instance, ToInteger(+Infinity) = +Infinity. We track the match offset
764 * as an integer, but pre-check it to be inside the 32-bit range before the loop.
765 * If not, the check in E5 Section 15.10.6.2, step 9.a applies.
768 /* XXX: lastIndex handling produces a lot of asm */
770 /* [ ... re_obj input bc saved_buf ] */
772 duk_get_prop_stridx(ctx
, -4, DUK_STRIDX_LAST_INDEX
); /* -> [ ... re_obj input bc saved_buf lastIndex ] */
773 (void) duk_to_int(ctx
, -1); /* ToInteger(lastIndex) */
774 d
= duk_get_number(ctx
, -1); /* integer, but may be +/- Infinite, +/- zero (not NaN, though) */
778 if (d
< 0.0 || d
> (double) DUK_HSTRING_GET_CHARLEN(h_input
)) {
780 char_offset
= 0; /* not really necessary */
781 DUK_ASSERT(match
== 0);
784 char_offset
= (duk_uint32_t
) d
;
786 /* lastIndex must be ignored for non-global regexps, but get the
787 * value for (theoretical) side effects. No side effects can
788 * really occur, because lastIndex is a normal property and is
789 * always non-configurable for RegExp instances.
791 char_offset
= (duk_uint32_t
) 0;
794 sp
= re_ctx
.input
+ duk_heap_strcache_offset_char2byte(thr
, h_input
, char_offset
);
799 * Try matching at different offsets until match found or input exhausted.
802 /* [ ... re_obj input bc saved_buf ] */
804 DUK_ASSERT(match
== 0);
807 /* char offset in [0, h_input->clen] (both ends inclusive), checked before entry */
808 DUK_ASSERT_DISABLE(char_offset
>= 0);
809 DUK_ASSERT(char_offset
<= DUK_HSTRING_GET_CHARLEN(h_input
));
811 /* Note: ctx.steps is intentionally not reset, it applies to the entire unanchored match */
812 DUK_ASSERT(re_ctx
.recursion_depth
== 0);
814 DUK_DDD(DUK_DDDPRINT("attempt match at char offset %ld; %p [%p,%p]",
815 (long) char_offset
, (void *) sp
, (void *) re_ctx
.input
,
816 (void *) re_ctx
.input_end
));
821 * - duk__match_regexp() is required not to longjmp() in ordinary "non-match"
822 * conditions; a longjmp() will terminate the entire matching process.
824 * - Clearing saved[] is not necessary because backtracking does it
826 * - Backtracking also rewinds ctx.recursion back to zero, unless an
827 * internal/limit error occurs (which causes a longjmp())
829 * - If we supported anchored matches, we would break out here
830 * unconditionally; however, Ecmascript regexps don't have anchored
831 * matches. It might make sense to implement a fast bail-out if
832 * the regexp begins with '^' and sp is not 0: currently we'll just
833 * run through the entire input string, trivially failing the match
834 * at every non-zero offset.
837 if (duk__match_regexp(&re_ctx
, re_ctx
.bytecode
, sp
) != NULL
) {
838 DUK_DDD(DUK_DDDPRINT("match at offset %ld", (long) char_offset
));
843 /* advance by one character (code point) and one char_offset */
845 if (char_offset
> DUK_HSTRING_GET_CHARLEN(h_input
)) {
849 * - Intentionally attempt (empty) match at char_offset == k_input->clen
851 * - Negative char_offsets have been eliminated and char_offset is duk_uint32_t
852 * -> no need or use for a negative check
855 DUK_DDD(DUK_DDDPRINT("no match after trying all sp offsets"));
859 /* avoid calling at end of input, will DUK_ERROR (above check suffices to avoid this) */
860 (void) duk__utf8_advance(thr
, &sp
, re_ctx
.input
, re_ctx
.input_end
, (duk_uint_fast32_t
) 1);
866 * Matching complete, create result array or return a 'null'. Update lastIndex
867 * if necessary. See E5 Section 15.10.6.2.
869 * Because lastIndex is a character (not byte) offset, we need the character
870 * length of the match which we conveniently get as a side effect of interning
871 * the matching substring (0th index of result array).
873 * saved[0] start pointer (~ byte offset) of current match
874 * saved[1] end pointer (~ byte offset) of current match (exclusive)
875 * char_offset start character offset of current match (-> .index of result)
876 * char_end_offset end character offset (computed below)
879 /* [ ... re_obj input bc saved_buf ] */
882 #ifdef DUK_USE_ASSERTIONS
885 duk_uint32_t char_end_offset
= 0;
887 DUK_DDD(DUK_DDDPRINT("regexp matches at char_offset %ld", (long) char_offset
));
889 DUK_ASSERT(re_ctx
.nsaved
>= 2); /* must have start and end */
890 DUK_ASSERT((re_ctx
.nsaved
% 2) == 0); /* and even number */
892 /* XXX: Array size is known before and (2 * re_ctx.nsaved) but not taken
893 * advantage of now. The array is not compacted either, as regexp match
894 * objects are usually short lived.
899 #ifdef DUK_USE_ASSERTIONS
900 h_res
= duk_require_hobject(ctx
, -1);
901 DUK_ASSERT(DUK_HOBJECT_HAS_EXTENSIBLE(h_res
));
902 DUK_ASSERT(DUK_HOBJECT_HAS_EXOTIC_ARRAY(h_res
));
903 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_res
) == DUK_HOBJECT_CLASS_ARRAY
);
906 /* [ ... re_obj input bc saved_buf res_obj ] */
908 duk_push_u32(ctx
, char_offset
);
909 duk_xdef_prop_stridx_wec(ctx
, -2, DUK_STRIDX_INDEX
);
912 duk_xdef_prop_stridx_wec(ctx
, -2, DUK_STRIDX_INPUT
);
914 for (i
= 0; i
< re_ctx
.nsaved
; i
+= 2) {
915 /* Captures which are undefined have NULL pointers and are returned
916 * as 'undefined'. The same is done when saved[] pointers are insane
917 * (this should, of course, never happen in practice).
919 if (re_ctx
.saved
[i
] && re_ctx
.saved
[i
+1] && re_ctx
.saved
[i
+1] >= re_ctx
.saved
[i
]) {
920 duk_hstring
*h_saved
;
922 duk_push_lstring(ctx
,
923 (char *) re_ctx
.saved
[i
],
924 (duk_size_t
) (re_ctx
.saved
[i
+1] - re_ctx
.saved
[i
]));
925 h_saved
= duk_get_hstring(ctx
, -1);
926 DUK_ASSERT(h_saved
!= NULL
);
929 /* Assumes that saved[0] and saved[1] are always
930 * set by regexp bytecode (if not, char_end_offset
931 * will be zero). Also assumes clen reflects the
932 * correct char length.
934 char_end_offset
= char_offset
+ DUK_HSTRING_GET_CHARLEN(h_saved
);
937 duk_push_undefined(ctx
);
940 /* [ ... re_obj input bc saved_buf res_obj val ] */
941 duk_put_prop_index(ctx
, -2, i
/ 2);
944 /* [ ... re_obj input bc saved_buf res_obj ] */
946 /* NB: 'length' property is automatically updated by the array setup loop */
949 /* global regexp: lastIndex updated on match */
950 duk_push_u32(ctx
, char_end_offset
);
951 duk_put_prop_stridx(ctx
, -6, DUK_STRIDX_LAST_INDEX
);
953 /* non-global regexp: lastIndex never updated on match */
958 * No match, E5 Section 15.10.6.2, step 9.a.i - 9.a.ii apply, regardless
959 * of 'global' flag of the RegExp. In particular, if lastIndex is invalid
960 * initially, it is reset to zero.
963 DUK_DDD(DUK_DDDPRINT("regexp does not match"));
967 /* [ ... re_obj input bc saved_buf res_obj ] */
969 duk_push_int(ctx
, 0);
970 duk_put_prop_stridx(ctx
, -6, DUK_STRIDX_LAST_INDEX
);
973 /* [ ... re_obj input bc saved_buf res_obj ] */
977 /* [ ... res_obj re_obj input bc saved_buf ] */
981 /* [ ... res_obj ] */
983 /* XXX: these last tricks are unnecessary if the function is made
984 * a genuine native function.
988 DUK_INTERNAL
void duk_regexp_match(duk_hthread
*thr
) {
989 duk__regexp_match_helper(thr
, 0 /*force_global*/);
992 /* This variant is needed by String.prototype.split(); it needs to perform
993 * global-style matching on a cloned RegExp which is potentially non-global.
995 DUK_INTERNAL
void duk_regexp_match_force_global(duk_hthread
*thr
) {
996 duk__regexp_match_helper(thr
, 1 /*force_global*/);
999 #else /* DUK_USE_REGEXP_SUPPORT */
1001 /* regexp support disabled */
1003 #endif /* DUK_USE_REGEXP_SUPPORT */