]> git.proxmox.com Git - ceph.git/blame - ceph/src/civetweb/src/third_party/duktape-1.8.0/src-separate/duk_regexp_executor.c
buildsys: switch source download to quincy
[ceph.git] / ceph / src / civetweb / src / third_party / duktape-1.8.0 / src-separate / duk_regexp_executor.c
CommitLineData
7c673cae
FG
1/*
2 * Regexp executor.
3 *
4 * Safety: the Ecmascript executor should prevent user from reading and
5 * replacing regexp bytecode. Even so, the executor must validate all
6 * memory accesses etc. When an invalid access is detected (e.g. a 'save'
7 * opcode to invalid, unallocated index) it should fail with an internal
8 * error but not cause a segmentation fault.
9 *
10 * Notes:
11 *
12 * - Backtrack counts are limited to unsigned 32 bits but should
13 * technically be duk_size_t for strings longer than 4G chars.
14 * This also requires a regexp bytecode change.
15 */
16
17#include "duk_internal.h"
18
19#ifdef DUK_USE_REGEXP_SUPPORT
20
21/*
22 * Helpers for UTF-8 handling
23 *
24 * For bytecode readers the duk_uint32_t and duk_int32_t types are correct
25 * because they're used for more than just codepoints.
26 */
27
28DUK_LOCAL duk_uint32_t duk__bc_get_u32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
29 return (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
30}
31
32DUK_LOCAL duk_int32_t duk__bc_get_i32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
33 duk_uint32_t t;
34
35 /* signed integer encoding needed to work with UTF-8 */
36 t = (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
37 if (t & 1) {
38 return -((duk_int32_t) (t >> 1));
39 } else {
40 return (duk_int32_t) (t >> 1);
41 }
42}
43
44DUK_LOCAL const duk_uint8_t *duk__utf8_backtrack(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
45 const duk_uint8_t *p;
46
47 /* Note: allow backtracking from p == ptr_end */
48 p = *ptr;
49 if (p < ptr_start || p > ptr_end) {
50 goto fail;
51 }
52
53 while (count > 0) {
54 for (;;) {
55 p--;
56 if (p < ptr_start) {
57 goto fail;
58 }
59 if ((*p & 0xc0) != 0x80) {
60 /* utf-8 continuation bytes have the form 10xx xxxx */
61 break;
62 }
63 }
64 count--;
65 }
66 *ptr = p;
67 return p;
68
69 fail:
11fdf7f2 70 DUK_ERROR_INTERNAL_DEFMSG(thr);
7c673cae
FG
71 return NULL; /* never here */
72}
73
74DUK_LOCAL const duk_uint8_t *duk__utf8_advance(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
75 const duk_uint8_t *p;
76
77 p = *ptr;
78 if (p < ptr_start || p >= ptr_end) {
79 goto fail;
80 }
81
82 while (count > 0) {
83 for (;;) {
84 p++;
85
86 /* Note: if encoding ends by hitting end of input, we don't check that
87 * the encoding is valid, we just assume it is.
88 */
89 if (p >= ptr_end || ((*p & 0xc0) != 0x80)) {
90 /* utf-8 continuation bytes have the form 10xx xxxx */
91 break;
92 }
93 }
94 count--;
95 }
96
97 *ptr = p;
98 return p;
99
100 fail:
11fdf7f2 101 DUK_ERROR_INTERNAL_DEFMSG(thr);
7c673cae
FG
102 return NULL; /* never here */
103}
104
105/*
106 * Helpers for dealing with the input string
107 */
108
109/* Get a (possibly canonicalized) input character from current sp. The input
110 * itself is never modified, and captures always record non-canonicalized
111 * characters even in case-insensitive matching.
112 */
113DUK_LOCAL duk_codepoint_t duk__inp_get_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp) {
114 duk_codepoint_t res = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
115 if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
116 res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
117 }
118 return res;
119}
120
121DUK_LOCAL const duk_uint8_t *duk__inp_backtrack(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp, duk_uint_fast32_t count) {
122 return duk__utf8_backtrack(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end, count);
123}
124
125/* Backtrack utf-8 input and return a (possibly canonicalized) input character. */
126DUK_LOCAL duk_codepoint_t duk__inp_get_prev_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *sp) {
127 /* note: caller 'sp' is intentionally not updated here */
128 (void) duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) 1);
129 return duk__inp_get_cp(re_ctx, &sp);
130}
131
132/*
133 * Regexp recursive matching function.
134 *
135 * Returns 'sp' on successful match (points to character after last matched one),
136 * NULL otherwise.
137 *
138 * The C recursion depth limit check is only performed in this function, this
139 * suffices because the function is present in all true recursion required by
140 * regexp execution.
141 */
142
143DUK_LOCAL const duk_uint8_t *duk__match_regexp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *pc, const duk_uint8_t *sp) {
144 if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
11fdf7f2 145 DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_EXECUTOR_RECURSION_LIMIT);
7c673cae
FG
146 }
147 re_ctx->recursion_depth++;
148
149 for (;;) {
150 duk_small_int_t op;
151
152 if (re_ctx->steps_count >= re_ctx->steps_limit) {
11fdf7f2 153 DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_EXECUTOR_STEP_LIMIT);
7c673cae
FG
154 }
155 re_ctx->steps_count++;
156
157 op = (duk_small_int_t) duk__bc_get_u32(re_ctx, &pc);
158
159 DUK_DDD(DUK_DDDPRINT("match: rec=%ld, steps=%ld, pc (after op)=%ld, sp=%ld, op=%ld",
160 (long) re_ctx->recursion_depth,
161 (long) re_ctx->steps_count,
162 (long) (pc - re_ctx->bytecode),
163 (long) (sp - re_ctx->input),
164 (long) op));
165
166 switch (op) {
167 case DUK_REOP_MATCH: {
168 goto match;
169 }
170 case DUK_REOP_CHAR: {
171 /*
172 * Byte-based matching would be possible for case-sensitive
173 * matching but not for case-insensitive matching. So, we
174 * match by decoding the input and bytecode character normally.
175 *
176 * Bytecode characters are assumed to be already canonicalized.
177 * Input characters are canonicalized automatically by
178 * duk__inp_get_cp() if necessary.
179 *
180 * There is no opcode for matching multiple characters. The
181 * regexp compiler has trouble joining strings efficiently
182 * during compilation. See doc/regexp.rst for more discussion.
183 */
184 duk_codepoint_t c1, c2;
185
186 c1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
187 DUK_ASSERT(!(re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) ||
188 c1 == duk_unicode_re_canonicalize_char(re_ctx->thr, c1)); /* canonicalized by compiler */
189 if (sp >= re_ctx->input_end) {
190 goto fail;
191 }
192 c2 = duk__inp_get_cp(re_ctx, &sp);
193 DUK_DDD(DUK_DDDPRINT("char match, c1=%ld, c2=%ld", (long) c1, (long) c2));
194 if (c1 != c2) {
195 goto fail;
196 }
197 break;
198 }
199 case DUK_REOP_PERIOD: {
200 duk_codepoint_t c;
201
202 if (sp >= re_ctx->input_end) {
203 goto fail;
204 }
205 c = duk__inp_get_cp(re_ctx, &sp);
206 if (duk_unicode_is_line_terminator(c)) {
207 /* E5 Sections 15.10.2.8, 7.3 */
208 goto fail;
209 }
210 break;
211 }
212 case DUK_REOP_RANGES:
213 case DUK_REOP_INVRANGES: {
214 duk_uint32_t n;
215 duk_codepoint_t c;
216 duk_small_int_t match;
217
218 n = duk__bc_get_u32(re_ctx, &pc);
219 if (sp >= re_ctx->input_end) {
220 goto fail;
221 }
222 c = duk__inp_get_cp(re_ctx, &sp);
223
224 match = 0;
225 while (n) {
226 duk_codepoint_t r1, r2;
227 r1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
228 r2 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
229 DUK_DDD(DUK_DDDPRINT("matching ranges/invranges, n=%ld, r1=%ld, r2=%ld, c=%ld",
230 (long) n, (long) r1, (long) r2, (long) c));
231 if (c >= r1 && c <= r2) {
232 /* Note: don't bail out early, we must read all the ranges from
233 * bytecode. Another option is to skip them efficiently after
234 * breaking out of here. Prefer smallest code.
235 */
236 match = 1;
237 }
238 n--;
239 }
240
241 if (op == DUK_REOP_RANGES) {
242 if (!match) {
243 goto fail;
244 }
245 } else {
246 DUK_ASSERT(op == DUK_REOP_INVRANGES);
247 if (match) {
248 goto fail;
249 }
250 }
251 break;
252 }
253 case DUK_REOP_ASSERT_START: {
254 duk_codepoint_t c;
255
256 if (sp <= re_ctx->input) {
257 break;
258 }
259 if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
260 goto fail;
261 }
262 c = duk__inp_get_prev_cp(re_ctx, sp);
263 if (duk_unicode_is_line_terminator(c)) {
264 /* E5 Sections 15.10.2.8, 7.3 */
265 break;
266 }
267 goto fail;
268 }
269 case DUK_REOP_ASSERT_END: {
270 duk_codepoint_t c;
271 const duk_uint8_t *tmp_sp;
272
273 if (sp >= re_ctx->input_end) {
274 break;
275 }
276 if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
277 goto fail;
278 }
279 tmp_sp = sp;
280 c = duk__inp_get_cp(re_ctx, &tmp_sp);
281 if (duk_unicode_is_line_terminator(c)) {
282 /* E5 Sections 15.10.2.8, 7.3 */
283 break;
284 }
285 goto fail;
286 }
287 case DUK_REOP_ASSERT_WORD_BOUNDARY:
288 case DUK_REOP_ASSERT_NOT_WORD_BOUNDARY: {
289 /*
290 * E5 Section 15.10.2.6. The previous and current character
291 * should -not- be canonicalized as they are now. However,
292 * canonicalization does not affect the result of IsWordChar()
293 * (which depends on Unicode characters never canonicalizing
294 * into ASCII characters) so this does not matter.
295 */
296 duk_small_int_t w1, w2;
297
298 if (sp <= re_ctx->input) {
299 w1 = 0; /* not a wordchar */
300 } else {
301 duk_codepoint_t c;
302 c = duk__inp_get_prev_cp(re_ctx, sp);
303 w1 = duk_unicode_re_is_wordchar(c);
304 }
305 if (sp >= re_ctx->input_end) {
306 w2 = 0; /* not a wordchar */
307 } else {
308 const duk_uint8_t *tmp_sp = sp; /* dummy so sp won't get updated */
309 duk_codepoint_t c;
310 c = duk__inp_get_cp(re_ctx, &tmp_sp);
311 w2 = duk_unicode_re_is_wordchar(c);
312 }
313
314 if (op == DUK_REOP_ASSERT_WORD_BOUNDARY) {
315 if (w1 == w2) {
316 goto fail;
317 }
318 } else {
319 DUK_ASSERT(op == DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
320 if (w1 != w2) {
321 goto fail;
322 }
323 }
324 break;
325 }
326 case DUK_REOP_JUMP: {
327 duk_int32_t skip;
328
329 skip = duk__bc_get_i32(re_ctx, &pc);
330 pc += skip;
331 break;
332 }
333 case DUK_REOP_SPLIT1: {
334 /* split1: prefer direct execution (no jump) */
335 const duk_uint8_t *sub_sp;
336 duk_int32_t skip;
337
338 skip = duk__bc_get_i32(re_ctx, &pc);
339 sub_sp = duk__match_regexp(re_ctx, pc, sp);
340 if (sub_sp) {
341 sp = sub_sp;
342 goto match;
343 }
344 pc += skip;
345 break;
346 }
347 case DUK_REOP_SPLIT2: {
348 /* split2: prefer jump execution (not direct) */
349 const duk_uint8_t *sub_sp;
350 duk_int32_t skip;
351
352 skip = duk__bc_get_i32(re_ctx, &pc);
353 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
354 if (sub_sp) {
355 sp = sub_sp;
356 goto match;
357 }
358 break;
359 }
360 case DUK_REOP_SQMINIMAL: {
361 duk_uint32_t q, qmin, qmax;
362 duk_int32_t skip;
363 const duk_uint8_t *sub_sp;
364
365 qmin = duk__bc_get_u32(re_ctx, &pc);
366 qmax = duk__bc_get_u32(re_ctx, &pc);
367 skip = duk__bc_get_i32(re_ctx, &pc);
368 DUK_DDD(DUK_DDDPRINT("minimal quantifier, qmin=%lu, qmax=%lu, skip=%ld",
369 (unsigned long) qmin, (unsigned long) qmax, (long) skip));
370
371 q = 0;
372 while (q <= qmax) {
373 if (q >= qmin) {
374 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
375 if (sub_sp) {
376 sp = sub_sp;
377 goto match;
378 }
379 }
380 sub_sp = duk__match_regexp(re_ctx, pc, sp);
381 if (!sub_sp) {
382 break;
383 }
384 sp = sub_sp;
385 q++;
386 }
387 goto fail;
388 }
389 case DUK_REOP_SQGREEDY: {
390 duk_uint32_t q, qmin, qmax, atomlen;
391 duk_int32_t skip;
392 const duk_uint8_t *sub_sp;
393
394 qmin = duk__bc_get_u32(re_ctx, &pc);
395 qmax = duk__bc_get_u32(re_ctx, &pc);
396 atomlen = duk__bc_get_u32(re_ctx, &pc);
397 skip = duk__bc_get_i32(re_ctx, &pc);
398 DUK_DDD(DUK_DDDPRINT("greedy quantifier, qmin=%lu, qmax=%lu, atomlen=%lu, skip=%ld",
399 (unsigned long) qmin, (unsigned long) qmax, (unsigned long) atomlen, (long) skip));
400
401 q = 0;
402 while (q < qmax) {
403 sub_sp = duk__match_regexp(re_ctx, pc, sp);
404 if (!sub_sp) {
405 break;
406 }
407 sp = sub_sp;
408 q++;
409 }
410 while (q >= qmin) {
411 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
412 if (sub_sp) {
413 sp = sub_sp;
414 goto match;
415 }
416 if (q == qmin) {
417 break;
418 }
419
420 /* Note: if atom were to contain e.g. captures, we would need to
421 * re-match the atom to get correct captures. Simply quantifiers
422 * do not allow captures in their atom now, so this is not an issue.
423 */
424
425 DUK_DDD(DUK_DDDPRINT("greedy quantifier, backtrack %ld characters (atomlen)",
426 (long) atomlen));
427 sp = duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) atomlen);
428 q--;
429 }
430 goto fail;
431 }
432 case DUK_REOP_SAVE: {
433 duk_uint32_t idx;
434 const duk_uint8_t *old;
435 const duk_uint8_t *sub_sp;
436
437 idx = duk__bc_get_u32(re_ctx, &pc);
438 if (idx >= re_ctx->nsaved) {
439 /* idx is unsigned, < 0 check is not necessary */
440 DUK_D(DUK_DPRINT("internal error, regexp save index insane: idx=%ld", (long) idx));
441 goto internal_error;
442 }
443 old = re_ctx->saved[idx];
444 re_ctx->saved[idx] = sp;
445 sub_sp = duk__match_regexp(re_ctx, pc, sp);
446 if (sub_sp) {
447 sp = sub_sp;
448 goto match;
449 }
450 re_ctx->saved[idx] = old;
451 goto fail;
452 }
453 case DUK_REOP_WIPERANGE: {
454 /* Wipe capture range and save old values for backtracking.
455 *
456 * XXX: this typically happens with a relatively small idx_count.
457 * It might be useful to handle cases where the count is small
458 * (say <= 8) by saving the values in stack instead. This would
459 * reduce memory churn and improve performance, at the cost of a
460 * slightly higher code footprint.
461 */
462 duk_uint32_t idx_start, idx_count;
463#ifdef DUK_USE_EXPLICIT_NULL_INIT
464 duk_uint32_t idx_end, idx;
465#endif
466 duk_uint8_t **range_save;
467 const duk_uint8_t *sub_sp;
468
469 idx_start = duk__bc_get_u32(re_ctx, &pc);
470 idx_count = duk__bc_get_u32(re_ctx, &pc);
471 DUK_DDD(DUK_DDDPRINT("wipe saved range: start=%ld, count=%ld -> [%ld,%ld] (captures [%ld,%ld])",
472 (long) idx_start, (long) idx_count,
473 (long) idx_start, (long) (idx_start + idx_count - 1),
474 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
475 if (idx_start + idx_count > re_ctx->nsaved || idx_count == 0) {
476 /* idx is unsigned, < 0 check is not necessary */
477 DUK_D(DUK_DPRINT("internal error, regexp wipe indices insane: idx_start=%ld, idx_count=%ld",
478 (long) idx_start, (long) idx_count));
479 goto internal_error;
480 }
481 DUK_ASSERT(idx_count > 0);
482
483 duk_require_stack((duk_context *) re_ctx->thr, 1);
484 range_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
485 sizeof(duk_uint8_t *) * idx_count);
486 DUK_ASSERT(range_save != NULL);
487 DUK_MEMCPY(range_save, re_ctx->saved + idx_start, sizeof(duk_uint8_t *) * idx_count);
488#ifdef DUK_USE_EXPLICIT_NULL_INIT
489 idx_end = idx_start + idx_count;
490 for (idx = idx_start; idx < idx_end; idx++) {
491 re_ctx->saved[idx] = NULL;
492 }
493#else
494 DUK_MEMZERO((void *) (re_ctx->saved + idx_start), sizeof(duk_uint8_t *) * idx_count);
495#endif
496
497 sub_sp = duk__match_regexp(re_ctx, pc, sp);
498 if (sub_sp) {
499 /* match: keep wiped/resaved values */
500 DUK_DDD(DUK_DDDPRINT("match: keep wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
501 (long) idx_start, (long) (idx_start + idx_count - 1),
502 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
503 duk_pop((duk_context *) re_ctx->thr);
504 sp = sub_sp;
505 goto match;
506 }
507
508 /* fail: restore saves */
509 DUK_DDD(DUK_DDDPRINT("fail: restore wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
510 (long) idx_start, (long) (idx_start + idx_count - 1),
511 (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
512 DUK_MEMCPY((void *) (re_ctx->saved + idx_start),
513 (const void *) range_save,
514 sizeof(duk_uint8_t *) * idx_count);
515 duk_pop((duk_context *) re_ctx->thr);
516 goto fail;
517 }
518 case DUK_REOP_LOOKPOS:
519 case DUK_REOP_LOOKNEG: {
520 /*
521 * Needs a save of multiple saved[] entries depending on what range
522 * may be overwritten. Because the regexp parser does no such analysis,
523 * we currently save the entire saved array here. Lookaheads are thus
524 * a bit expensive. Note that the saved array is not needed for just
525 * the lookahead sub-match, but for the matching of the entire sequel.
526 *
527 * The temporary save buffer is pushed on to the valstack to handle
528 * errors correctly. Each lookahead causes a C recursion and pushes
529 * more stuff on the value stack. If the C recursion limit is less
530 * than the value stack spare, there is no need to check the stack.
531 * We do so regardless, just in case.
532 */
533
534 duk_int32_t skip;
535 duk_uint8_t **full_save;
536 const duk_uint8_t *sub_sp;
537
538 DUK_ASSERT(re_ctx->nsaved > 0);
539
540 duk_require_stack((duk_context *) re_ctx->thr, 1);
541 full_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
542 sizeof(duk_uint8_t *) * re_ctx->nsaved);
543 DUK_ASSERT(full_save != NULL);
544 DUK_MEMCPY(full_save, re_ctx->saved, sizeof(duk_uint8_t *) * re_ctx->nsaved);
545
546 skip = duk__bc_get_i32(re_ctx, &pc);
547 sub_sp = duk__match_regexp(re_ctx, pc, sp);
548 if (op == DUK_REOP_LOOKPOS) {
549 if (!sub_sp) {
550 goto lookahead_fail;
551 }
552 } else {
553 if (sub_sp) {
554 goto lookahead_fail;
555 }
556 }
557 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
558 if (sub_sp) {
559 /* match: keep saves */
560 duk_pop((duk_context *) re_ctx->thr);
561 sp = sub_sp;
562 goto match;
563 }
564
565 /* fall through */
566
567 lookahead_fail:
568 /* fail: restore saves */
569 DUK_MEMCPY((void *) re_ctx->saved,
570 (const void *) full_save,
571 sizeof(duk_uint8_t *) * re_ctx->nsaved);
572 duk_pop((duk_context *) re_ctx->thr);
573 goto fail;
574 }
575 case DUK_REOP_BACKREFERENCE: {
576 /*
577 * Byte matching for back-references would be OK in case-
578 * sensitive matching. In case-insensitive matching we need
579 * to canonicalize characters, so back-reference matching needs
580 * to be done with codepoints instead. So, we just decode
581 * everything normally here, too.
582 *
583 * Note: back-reference index which is 0 or higher than
584 * NCapturingParens (= number of capturing parens in the
585 * -entire- regexp) is a compile time error. However, a
586 * backreference referring to a valid capture which has
587 * not matched anything always succeeds! See E5 Section
588 * 15.10.2.9, step 5, sub-step 3.
589 */
590 duk_uint32_t idx;
591 const duk_uint8_t *p;
592
593 idx = duk__bc_get_u32(re_ctx, &pc);
594 idx = idx << 1; /* backref n -> saved indices [n*2, n*2+1] */
595 if (idx < 2 || idx + 1 >= re_ctx->nsaved) {
596 /* regexp compiler should catch these */
597 DUK_D(DUK_DPRINT("internal error, backreference index insane"));
598 goto internal_error;
599 }
600 if (!re_ctx->saved[idx] || !re_ctx->saved[idx+1]) {
601 /* capture is 'undefined', always matches! */
602 DUK_DDD(DUK_DDDPRINT("backreference: saved[%ld,%ld] not complete, always match",
603 (long) idx, (long) (idx + 1)));
604 break;
605 }
606 DUK_DDD(DUK_DDDPRINT("backreference: match saved[%ld,%ld]", (long) idx, (long) (idx + 1)));
607
608 p = re_ctx->saved[idx];
609 while (p < re_ctx->saved[idx+1]) {
610 duk_codepoint_t c1, c2;
611
612 /* Note: not necessary to check p against re_ctx->input_end:
613 * the memory access is checked by duk__inp_get_cp(), while
614 * valid compiled regexps cannot write a saved[] entry
615 * which points to outside the string.
616 */
617 if (sp >= re_ctx->input_end) {
618 goto fail;
619 }
620 c1 = duk__inp_get_cp(re_ctx, &p);
621 c2 = duk__inp_get_cp(re_ctx, &sp);
622 if (c1 != c2) {
623 goto fail;
624 }
625 }
626 break;
627 }
628 default: {
629 DUK_D(DUK_DPRINT("internal error, regexp opcode error: %ld", (long) op));
630 goto internal_error;
631 }
632 }
633 }
634
635 match:
636 re_ctx->recursion_depth--;
637 return sp;
638
639 fail:
640 re_ctx->recursion_depth--;
641 return NULL;
642
643 internal_error:
11fdf7f2 644 DUK_ERROR_INTERNAL_DEFMSG(re_ctx->thr);
7c673cae
FG
645 return NULL; /* never here */
646}
647
648/*
649 * Exposed matcher function which provides the semantics of RegExp.prototype.exec().
650 *
651 * RegExp.prototype.test() has the same semantics as exec() but does not return the
652 * result object (which contains the matching string and capture groups). Currently
653 * there is no separate test() helper, so a temporary result object is created and
654 * discarded if test() is needed. This is intentional, to save code space.
655 *
656 * Input stack: [ ... re_obj input ]
657 * Output stack: [ ... result ]
658 */
659
660DUK_LOCAL void duk__regexp_match_helper(duk_hthread *thr, duk_small_int_t force_global) {
661 duk_context *ctx = (duk_context *) thr;
662 duk_re_matcher_ctx re_ctx;
663 duk_hobject *h_regexp;
664 duk_hstring *h_bytecode;
665 duk_hstring *h_input;
11fdf7f2 666 duk_uint8_t *p_buf;
7c673cae
FG
667 const duk_uint8_t *pc;
668 const duk_uint8_t *sp;
669 duk_small_int_t match = 0;
670 duk_small_int_t global;
671 duk_uint_fast32_t i;
672 double d;
673 duk_uint32_t char_offset;
674
675 DUK_ASSERT(thr != NULL);
676 DUK_ASSERT(ctx != NULL);
677
678 DUK_DD(DUK_DDPRINT("regexp match: regexp=%!T, input=%!T",
679 (duk_tval *) duk_get_tval(ctx, -2),
680 (duk_tval *) duk_get_tval(ctx, -1)));
681
682 /*
683 * Regexp instance check, bytecode check, input coercion.
684 *
685 * See E5 Section 15.10.6.
686 */
687
688 /* TypeError if wrong; class check, see E5 Section 15.10.6 */
689 h_regexp = duk_require_hobject_with_class(ctx, -2, DUK_HOBJECT_CLASS_REGEXP);
690 DUK_ASSERT(h_regexp != NULL);
691 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_regexp) == DUK_HOBJECT_CLASS_REGEXP);
692 DUK_UNREF(h_regexp);
693
694 duk_to_string(ctx, -1);
695 h_input = duk_get_hstring(ctx, -1);
696 DUK_ASSERT(h_input != NULL);
697
698 duk_get_prop_stridx(ctx, -2, DUK_STRIDX_INT_BYTECODE); /* [ ... re_obj input ] -> [ ... re_obj input bc ] */
699 h_bytecode = duk_require_hstring(ctx, -1); /* no regexp instance should exist without a non-configurable bytecode property */
700 DUK_ASSERT(h_bytecode != NULL);
701
702 /*
703 * Basic context initialization.
704 *
705 * Some init values are read from the bytecode header
706 * whose format is (UTF-8 codepoints):
707 *
708 * uint flags
709 * uint nsaved (even, 2n+2 where n = num captures)
710 */
711
712 /* [ ... re_obj input bc ] */
713
714 DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
715
716 re_ctx.thr = thr;
11fdf7f2 717 re_ctx.input = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
7c673cae 718 re_ctx.input_end = re_ctx.input + DUK_HSTRING_GET_BYTELEN(h_input);
11fdf7f2 719 re_ctx.bytecode = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_bytecode);
7c673cae
FG
720 re_ctx.bytecode_end = re_ctx.bytecode + DUK_HSTRING_GET_BYTELEN(h_bytecode);
721 re_ctx.saved = NULL;
722 re_ctx.recursion_limit = DUK_USE_REGEXP_EXECUTOR_RECLIMIT;
723 re_ctx.steps_limit = DUK_RE_EXECUTE_STEPS_LIMIT;
724
725 /* read header */
726 pc = re_ctx.bytecode;
727 re_ctx.re_flags = duk__bc_get_u32(&re_ctx, &pc);
728 re_ctx.nsaved = duk__bc_get_u32(&re_ctx, &pc);
729 re_ctx.bytecode = pc;
730
731 DUK_ASSERT(DUK_RE_FLAG_GLOBAL < 0x10000UL); /* must fit into duk_small_int_t */
732 global = (duk_small_int_t) (force_global | (re_ctx.re_flags & DUK_RE_FLAG_GLOBAL));
733
734 DUK_ASSERT(re_ctx.nsaved >= 2);
735 DUK_ASSERT((re_ctx.nsaved % 2) == 0);
736
11fdf7f2
TL
737 p_buf = (duk_uint8_t *) duk_push_fixed_buffer(ctx, sizeof(duk_uint8_t *) * re_ctx.nsaved);
738 DUK_UNREF(p_buf);
7c673cae
FG
739 re_ctx.saved = (const duk_uint8_t **) duk_get_buffer(ctx, -1, NULL);
740 DUK_ASSERT(re_ctx.saved != NULL);
741
742 /* [ ... re_obj input bc saved_buf ] */
743
11fdf7f2 744#if defined(DUK_USE_EXPLICIT_NULL_INIT)
7c673cae
FG
745 for (i = 0; i < re_ctx.nsaved; i++) {
746 re_ctx.saved[i] = (duk_uint8_t *) NULL;
747 }
11fdf7f2
TL
748#elif defined(DUK_USE_ZERO_BUFFER_DATA)
749 /* buffer is automatically zeroed */
750#else
751 DUK_MEMZERO((void *) p_buf, sizeof(duk_uint8_t *) * re_ctx.nsaved);
7c673cae
FG
752#endif
753
754 DUK_DDD(DUK_DDDPRINT("regexp ctx initialized, flags=0x%08lx, nsaved=%ld, recursion_limit=%ld, steps_limit=%ld",
755 (unsigned long) re_ctx.re_flags, (long) re_ctx.nsaved, (long) re_ctx.recursion_limit,
756 (long) re_ctx.steps_limit));
757
758 /*
759 * Get starting character offset for match, and initialize 'sp' based on it.
760 *
761 * Note: lastIndex is non-configurable so it must be present (we check the
762 * internal class of the object above, so we know it is). User code can set
763 * its value to an arbitrary (garbage) value though; E5 requires that lastIndex
764 * be coerced to a number before using. The code below works even if the
765 * property is missing: the value will then be coerced to zero.
766 *
767 * Note: lastIndex may be outside Uint32 range even after ToInteger() coercion.
768 * For instance, ToInteger(+Infinity) = +Infinity. We track the match offset
769 * as an integer, but pre-check it to be inside the 32-bit range before the loop.
770 * If not, the check in E5 Section 15.10.6.2, step 9.a applies.
771 */
772
773 /* XXX: lastIndex handling produces a lot of asm */
774
775 /* [ ... re_obj input bc saved_buf ] */
776
777 duk_get_prop_stridx(ctx, -4, DUK_STRIDX_LAST_INDEX); /* -> [ ... re_obj input bc saved_buf lastIndex ] */
778 (void) duk_to_int(ctx, -1); /* ToInteger(lastIndex) */
779 d = duk_get_number(ctx, -1); /* integer, but may be +/- Infinite, +/- zero (not NaN, though) */
780 duk_pop(ctx);
781
782 if (global) {
783 if (d < 0.0 || d > (double) DUK_HSTRING_GET_CHARLEN(h_input)) {
784 /* match fail */
785 char_offset = 0; /* not really necessary */
786 DUK_ASSERT(match == 0);
787 goto match_over;
788 }
789 char_offset = (duk_uint32_t) d;
790 } else {
791 /* lastIndex must be ignored for non-global regexps, but get the
792 * value for (theoretical) side effects. No side effects can
793 * really occur, because lastIndex is a normal property and is
794 * always non-configurable for RegExp instances.
795 */
796 char_offset = (duk_uint32_t) 0;
797 }
798
799 sp = re_ctx.input + duk_heap_strcache_offset_char2byte(thr, h_input, char_offset);
800
801 /*
802 * Match loop.
803 *
804 * Try matching at different offsets until match found or input exhausted.
805 */
806
807 /* [ ... re_obj input bc saved_buf ] */
808
809 DUK_ASSERT(match == 0);
810
811 for (;;) {
812 /* char offset in [0, h_input->clen] (both ends inclusive), checked before entry */
813 DUK_ASSERT_DISABLE(char_offset >= 0);
814 DUK_ASSERT(char_offset <= DUK_HSTRING_GET_CHARLEN(h_input));
815
816 /* Note: ctx.steps is intentionally not reset, it applies to the entire unanchored match */
817 DUK_ASSERT(re_ctx.recursion_depth == 0);
818
819 DUK_DDD(DUK_DDDPRINT("attempt match at char offset %ld; %p [%p,%p]",
11fdf7f2
TL
820 (long) char_offset, (const void *) sp,
821 (const void *) re_ctx.input, (const void *) re_ctx.input_end));
7c673cae
FG
822
823 /*
824 * Note:
825 *
826 * - duk__match_regexp() is required not to longjmp() in ordinary "non-match"
827 * conditions; a longjmp() will terminate the entire matching process.
828 *
829 * - Clearing saved[] is not necessary because backtracking does it
830 *
831 * - Backtracking also rewinds ctx.recursion back to zero, unless an
832 * internal/limit error occurs (which causes a longjmp())
833 *
834 * - If we supported anchored matches, we would break out here
835 * unconditionally; however, Ecmascript regexps don't have anchored
836 * matches. It might make sense to implement a fast bail-out if
837 * the regexp begins with '^' and sp is not 0: currently we'll just
838 * run through the entire input string, trivially failing the match
839 * at every non-zero offset.
840 */
841
842 if (duk__match_regexp(&re_ctx, re_ctx.bytecode, sp) != NULL) {
843 DUK_DDD(DUK_DDDPRINT("match at offset %ld", (long) char_offset));
844 match = 1;
845 break;
846 }
847
848 /* advance by one character (code point) and one char_offset */
849 char_offset++;
850 if (char_offset > DUK_HSTRING_GET_CHARLEN(h_input)) {
851 /*
852 * Note:
853 *
854 * - Intentionally attempt (empty) match at char_offset == k_input->clen
855 *
856 * - Negative char_offsets have been eliminated and char_offset is duk_uint32_t
857 * -> no need or use for a negative check
858 */
859
860 DUK_DDD(DUK_DDDPRINT("no match after trying all sp offsets"));
861 break;
862 }
863
864 /* avoid calling at end of input, will DUK_ERROR (above check suffices to avoid this) */
865 (void) duk__utf8_advance(thr, &sp, re_ctx.input, re_ctx.input_end, (duk_uint_fast32_t) 1);
866 }
867
868 match_over:
869
870 /*
871 * Matching complete, create result array or return a 'null'. Update lastIndex
872 * if necessary. See E5 Section 15.10.6.2.
873 *
874 * Because lastIndex is a character (not byte) offset, we need the character
875 * length of the match which we conveniently get as a side effect of interning
876 * the matching substring (0th index of result array).
877 *
878 * saved[0] start pointer (~ byte offset) of current match
879 * saved[1] end pointer (~ byte offset) of current match (exclusive)
880 * char_offset start character offset of current match (-> .index of result)
881 * char_end_offset end character offset (computed below)
882 */
883
884 /* [ ... re_obj input bc saved_buf ] */
885
886 if (match) {
887#ifdef DUK_USE_ASSERTIONS
888 duk_hobject *h_res;
889#endif
890 duk_uint32_t char_end_offset = 0;
891
892 DUK_DDD(DUK_DDDPRINT("regexp matches at char_offset %ld", (long) char_offset));
893
894 DUK_ASSERT(re_ctx.nsaved >= 2); /* must have start and end */
895 DUK_ASSERT((re_ctx.nsaved % 2) == 0); /* and even number */
896
897 /* XXX: Array size is known before and (2 * re_ctx.nsaved) but not taken
898 * advantage of now. The array is not compacted either, as regexp match
899 * objects are usually short lived.
900 */
901
902 duk_push_array(ctx);
903
904#ifdef DUK_USE_ASSERTIONS
905 h_res = duk_require_hobject(ctx, -1);
906 DUK_ASSERT(DUK_HOBJECT_HAS_EXTENSIBLE(h_res));
907 DUK_ASSERT(DUK_HOBJECT_HAS_EXOTIC_ARRAY(h_res));
908 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_res) == DUK_HOBJECT_CLASS_ARRAY);
909#endif
910
911 /* [ ... re_obj input bc saved_buf res_obj ] */
912
913 duk_push_u32(ctx, char_offset);
914 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INDEX);
915
916 duk_dup(ctx, -4);
917 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INPUT);
918
919 for (i = 0; i < re_ctx.nsaved; i += 2) {
920 /* Captures which are undefined have NULL pointers and are returned
921 * as 'undefined'. The same is done when saved[] pointers are insane
922 * (this should, of course, never happen in practice).
923 */
924 if (re_ctx.saved[i] && re_ctx.saved[i+1] && re_ctx.saved[i+1] >= re_ctx.saved[i]) {
925 duk_hstring *h_saved;
926
927 duk_push_lstring(ctx,
11fdf7f2 928 (const char *) re_ctx.saved[i],
7c673cae
FG
929 (duk_size_t) (re_ctx.saved[i+1] - re_ctx.saved[i]));
930 h_saved = duk_get_hstring(ctx, -1);
931 DUK_ASSERT(h_saved != NULL);
932
933 if (i == 0) {
934 /* Assumes that saved[0] and saved[1] are always
935 * set by regexp bytecode (if not, char_end_offset
936 * will be zero). Also assumes clen reflects the
937 * correct char length.
938 */
939 char_end_offset = char_offset + DUK_HSTRING_GET_CHARLEN(h_saved);
940 }
941 } else {
942 duk_push_undefined(ctx);
943 }
944
945 /* [ ... re_obj input bc saved_buf res_obj val ] */
946 duk_put_prop_index(ctx, -2, i / 2);
947 }
948
949 /* [ ... re_obj input bc saved_buf res_obj ] */
950
951 /* NB: 'length' property is automatically updated by the array setup loop */
952
953 if (global) {
954 /* global regexp: lastIndex updated on match */
955 duk_push_u32(ctx, char_end_offset);
956 duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
957 } else {
958 /* non-global regexp: lastIndex never updated on match */
959 ;
960 }
961 } else {
962 /*
963 * No match, E5 Section 15.10.6.2, step 9.a.i - 9.a.ii apply, regardless
964 * of 'global' flag of the RegExp. In particular, if lastIndex is invalid
965 * initially, it is reset to zero.
966 */
967
968 DUK_DDD(DUK_DDDPRINT("regexp does not match"));
969
970 duk_push_null(ctx);
971
972 /* [ ... re_obj input bc saved_buf res_obj ] */
973
974 duk_push_int(ctx, 0);
975 duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
976 }
977
978 /* [ ... re_obj input bc saved_buf res_obj ] */
979
980 duk_insert(ctx, -5);
981
982 /* [ ... res_obj re_obj input bc saved_buf ] */
983
984 duk_pop_n(ctx, 4);
985
986 /* [ ... res_obj ] */
987
988 /* XXX: these last tricks are unnecessary if the function is made
989 * a genuine native function.
990 */
991}
992
993DUK_INTERNAL void duk_regexp_match(duk_hthread *thr) {
994 duk__regexp_match_helper(thr, 0 /*force_global*/);
995}
996
997/* This variant is needed by String.prototype.split(); it needs to perform
998 * global-style matching on a cloned RegExp which is potentially non-global.
999 */
1000DUK_INTERNAL void duk_regexp_match_force_global(duk_hthread *thr) {
1001 duk__regexp_match_helper(thr, 1 /*force_global*/);
1002}
1003
1004#else /* DUK_USE_REGEXP_SUPPORT */
1005
1006/* regexp support disabled */
1007
1008#endif /* DUK_USE_REGEXP_SUPPORT */