4 * See doc/regexp.rst for a discussion of the compilation approach and
7 * Regexp bytecode assumes jumps can be expressed with signed 32-bit
8 * integers. Consequently the bytecode size must not exceed 0x7fffffffL.
9 * The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
10 * in many places. Although this could be changed, the bytecode format
11 * limit would still prevent regexps exceeding the signed 32-bit limit
14 * XXX: The implementation does not prevent bytecode from exceeding the
15 * maximum supported size. This could be done by limiting the maximum
16 * input string size (assuming an upper bound can be computed for number
17 * of bytecode bytes emitted per input byte) or checking buffer maximum
18 * size when emitting bytecode (slower).
21 #include "duk_internal.h"
23 #ifdef DUK_USE_REGEXP_SUPPORT
29 #define DUK__RE_INITIAL_BUFSIZE 64
32 #define DUK__RE_BUFLEN(re_ctx) \
33 DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)
36 * Disjunction struct: result of parsing a disjunction
40 /* Number of characters that the atom matches (e.g. 3 for 'abc'),
41 * -1 if atom is complex and number of matched characters either
42 * varies or is not known.
47 /* These are not needed to implement quantifier capture handling,
48 * but might be needed at some point.
51 /* re_ctx->captures at start and end of atom parsing.
52 * Since 'captures' indicates highest capture number emitted
53 * so far in a DUK_REOP_SAVE, the captures numbers saved by
54 * the atom are: ]start_captures,end_captures].
56 duk_uint32_t start_captures
;
57 duk_uint32_t end_captures
;
59 } duk__re_disjunction_info
;
64 * Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
65 * even though the buffer operations will use duk_size_t.
68 /* XXX: the insert helpers should ensure that the bytecode result is not
69 * larger than expected (or at least assert for it). Many things in the
70 * bytecode, like skip offsets, won't work correctly if the bytecode is
74 DUK_LOCAL duk_uint32_t
duk__encode_i32(duk_int32_t x
) {
76 return ((duk_uint32_t
) (-x
)) * 2 + 1;
78 return ((duk_uint32_t
) x
) * 2;
82 /* XXX: return type should probably be duk_size_t, or explicit checks are needed for
85 DUK_LOCAL duk_uint32_t
duk__insert_u32(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t offset
, duk_uint32_t x
) {
86 duk_uint8_t buf
[DUK_UNICODE_MAX_XUTF8_LENGTH
];
89 len
= duk_unicode_encode_xutf8((duk_ucodepoint_t
) x
, buf
);
90 DUK_BW_INSERT_ENSURE_BYTES(re_ctx
->thr
, &re_ctx
->bw
, offset
, buf
, len
);
91 return (duk_uint32_t
) len
;
94 DUK_LOCAL duk_uint32_t
duk__append_u32(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t x
) {
95 duk_uint8_t buf
[DUK_UNICODE_MAX_XUTF8_LENGTH
];
98 len
= duk_unicode_encode_xutf8((duk_ucodepoint_t
) x
, buf
);
99 DUK_BW_WRITE_ENSURE_BYTES(re_ctx
->thr
, &re_ctx
->bw
, buf
, len
);
100 return (duk_uint32_t
) len
;
103 DUK_LOCAL duk_uint32_t
duk__insert_i32(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t offset
, duk_int32_t x
) {
104 return duk__insert_u32(re_ctx
, offset
, duk__encode_i32(x
));
108 DUK_LOCAL duk_uint32_t
duk__append_i32(duk_re_compiler_ctx
*re_ctx
, duk_int32_t x
) {
109 return duk__append_u32(re_ctx
, duk__encode_i32(x
));
113 /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
114 DUK_LOCAL
void duk__append_u16_list(duk_re_compiler_ctx
*re_ctx
, const duk_uint16_t
*values
, duk_uint32_t count
) {
115 /* Call sites don't need the result length so it's not accumulated. */
117 (void) duk__append_u32(re_ctx
, (duk_uint32_t
) (*values
++));
122 DUK_LOCAL
void duk__insert_slice(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t offset
, duk_uint32_t data_offset
, duk_uint32_t data_length
) {
123 DUK_BW_INSERT_ENSURE_SLICE(re_ctx
->thr
, &re_ctx
->bw
, offset
, data_offset
, data_length
);
126 DUK_LOCAL
void duk__append_slice(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t data_offset
, duk_uint32_t data_length
) {
127 DUK_BW_WRITE_ENSURE_SLICE(re_ctx
->thr
, &re_ctx
->bw
, data_offset
, data_length
);
130 DUK_LOCAL
void duk__remove_slice(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t data_offset
, duk_uint32_t data_length
) {
131 DUK_BW_REMOVE_ENSURE_SLICE(re_ctx
->thr
, &re_ctx
->bw
, data_offset
, data_length
);
135 * Insert a jump offset at 'offset' to complete an instruction
136 * (the jump offset is always the last component of an instruction).
137 * The 'skip' argument must be computed relative to 'offset',
138 * -without- taking into account the skip field being inserted.
140 * ... A B C ins X Y Z ... (ins may be a JUMP, SPLIT1/SPLIT2, etc)
141 * => ... A B C ins SKIP X Y Z
143 * Computing the final (adjusted) skip value, which is relative to the
144 * first byte of the next instruction, is a bit tricky because of the
145 * variable length UTF-8 encoding. See doc/regexp.rst for discussion.
147 DUK_LOCAL duk_uint32_t
duk__insert_jump_offset(duk_re_compiler_ctx
*re_ctx
, duk_uint32_t offset
, duk_int32_t skip
) {
150 /* XXX: solve into closed form (smaller code) */
153 /* two encoding attempts suffices */
154 len
= duk_unicode_get_xutf8_length((duk_codepoint_t
) duk__encode_i32(skip
));
155 len
= duk_unicode_get_xutf8_length((duk_codepoint_t
) duk__encode_i32(skip
- (duk_int32_t
) len
));
156 DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip
- (duk_int32_t
) len
)) == len
); /* no change */
157 skip
-= (duk_int32_t
) len
;
159 return duk__insert_i32(re_ctx
, offset
, skip
);
162 DUK_LOCAL duk_uint32_t
duk__append_jump_offset(duk_re_compiler_ctx
*re_ctx
, duk_int32_t skip
) {
163 return (duk_uint32_t
) duk__insert_jump_offset(re_ctx
, (duk_uint32_t
) DUK__RE_BUFLEN(re_ctx
), skip
);
167 * duk_re_range_callback for generating character class ranges.
169 * When ignoreCase is false, the range is simply emitted as is.
170 * We don't, for instance, eliminate duplicates or overlapping
171 * ranges in a character class.
173 * When ignoreCase is true, the range needs to be normalized through
174 * canonicalization. Unfortunately a canonicalized version of a
175 * continuous range is not necessarily continuous (e.g. [x-{] is
176 * continuous but [X-{] is not). The current algorithm creates the
177 * canonicalized range(s) space efficiently at the cost of compile
178 * time execution time (see doc/regexp.rst for discussion).
180 * Note that the ctx->nranges is a context-wide temporary value
181 * (this is OK because there cannot be multiple character classes
182 * being parsed simultaneously).
185 DUK_LOCAL
void duk__generate_ranges(void *userdata
, duk_codepoint_t r1
, duk_codepoint_t r2
, duk_bool_t direct
) {
186 duk_re_compiler_ctx
*re_ctx
= (duk_re_compiler_ctx
*) userdata
;
188 DUK_DD(DUK_DDPRINT("duk__generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
189 (void *) re_ctx
, (long) r1
, (long) r2
, (long) direct
));
191 if (!direct
&& (re_ctx
->re_flags
& DUK_RE_FLAG_IGNORE_CASE
)) {
193 * Canonicalize a range, generating result ranges as necessary.
194 * Needs to exhaustively scan the entire range (at most 65536
195 * code points). If 'direct' is set, caller (lexer) has ensured
196 * that the range is already canonicalization compatible (this
197 * is used to avoid unnecessary canonicalization of built-in
198 * ranges like \W, which are not affected by canonicalization).
200 * NOTE: here is one place where we don't want to support chars
201 * outside the BMP, because the exhaustive search would be
207 duk_codepoint_t r_start
, r_end
;
209 r_start
= duk_unicode_re_canonicalize_char(re_ctx
->thr
, r1
);
211 for (i
= r1
+ 1; i
<= r2
; i
++) {
212 t
= duk_unicode_re_canonicalize_char(re_ctx
->thr
, i
);
213 if (t
== r_end
+ 1) {
216 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start
, (long) r_end
));
217 duk__append_u32(re_ctx
, (duk_uint32_t
) r_start
);
218 duk__append_u32(re_ctx
, (duk_uint32_t
) r_end
);
224 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start
, (long) r_end
));
225 duk__append_u32(re_ctx
, (duk_uint32_t
) r_start
);
226 duk__append_u32(re_ctx
, (duk_uint32_t
) r_end
);
229 DUK_DD(DUK_DDPRINT("direct, emit range: [%ld,%ld]", (long) r1
, (long) r2
));
230 duk__append_u32(re_ctx
, (duk_uint32_t
) r1
);
231 duk__append_u32(re_ctx
, (duk_uint32_t
) r2
);
237 * Parse regexp Disjunction. Most of regexp compilation happens here.
239 * Handles Disjunction, Alternative, and Term productions directly without
240 * recursion. The only constructs requiring recursion are positive/negative
241 * lookaheads, capturing parentheses, and non-capturing parentheses.
243 * The function determines whether the entire disjunction is a 'simple atom'
244 * (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
245 * returns the atom character length which is needed by the caller to keep
246 * track of its own atom character length. A disjunction with more than one
247 * alternative is never considered a simple atom (although in some cases
248 * that might be the case).
250 * Return value: simple atom character length or < 0 if not a simple atom.
251 * Appends the bytecode for the disjunction matcher to the end of the temp
254 * Regexp top level structure is:
256 * Disjunction = Term*
257 * | Term* | Disjunction
263 * An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
267 * * Tracking of the 'simple-ness' of the current atom vs. the entire
268 * disjunction are separate matters. For instance, the disjunction
269 * may be complex, but individual atoms may be simple. Furthermore,
270 * simple quantifiers are used whenever possible, even if the
271 * disjunction as a whole is complex.
273 * * The estimate of whether an atom is simple is conservative now,
274 * and it would be possible to expand it. For instance, captures
275 * cause the disjunction to be marked complex, even though captures
276 * -can- be handled by simple quantifiers with some minor modifications.
278 * * Disjunction 'tainting' as 'complex' is handled at the end of the
279 * main for loop collectively for atoms. Assertions, quantifiers,
280 * and '|' tokens need to taint the result manually if necessary.
281 * Assertions cannot add to result char length, only atoms (and
282 * quantifiers) can; currently quantifiers will taint the result
286 DUK_LOCAL
void duk__parse_disjunction(duk_re_compiler_ctx
*re_ctx
, duk_bool_t expect_eof
, duk__re_disjunction_info
*out_atom_info
) {
287 duk_int32_t atom_start_offset
= -1; /* negative -> no atom matched on previous round */
288 duk_int32_t atom_char_length
= 0; /* negative -> complex atom */
289 duk_uint32_t atom_start_captures
= re_ctx
->captures
; /* value of re_ctx->captures at start of atom */
290 duk_int32_t unpatched_disjunction_split
= -1;
291 duk_int32_t unpatched_disjunction_jump
= -1;
292 duk_uint32_t entry_offset
= (duk_uint32_t
) DUK__RE_BUFLEN(re_ctx
);
293 duk_int32_t res_charlen
= 0; /* -1 if disjunction is complex, char length if simple */
294 duk__re_disjunction_info tmp_disj
;
296 DUK_ASSERT(out_atom_info
!= NULL
);
298 if (re_ctx
->recursion_depth
>= re_ctx
->recursion_limit
) {
299 DUK_ERROR_RANGE(re_ctx
->thr
, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT
);
301 re_ctx
->recursion_depth
++;
304 out_atom_info
->start_captures
= re_ctx
->captures
;
308 /* atom_char_length, atom_start_offset, atom_start_offset reflect the
309 * atom matched on the previous loop. If a quantifier is encountered
310 * on this loop, these are needed to handle the quantifier correctly.
311 * new_atom_char_length etc are for the atom parsed on this round;
312 * they're written to atom_char_length etc at the end of the round.
314 duk_int32_t new_atom_char_length
; /* char length of the atom parsed in this loop */
315 duk_int32_t new_atom_start_offset
; /* bytecode start offset of the atom parsed in this loop
316 * (allows quantifiers to copy the atom bytecode)
318 duk_uint32_t new_atom_start_captures
; /* re_ctx->captures at the start of the atom parsed in this loop */
320 duk_lexer_parse_re_token(&re_ctx
->lex
, &re_ctx
->curr_token
);
322 DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
323 (long) re_ctx
->curr_token
.t
,
324 (long) re_ctx
->curr_token
.num
,
325 (re_ctx
->curr_token
.num
>= 0x20 && re_ctx
->curr_token
.num
<= 0x7e) ?
326 (int) re_ctx
->curr_token
.num
: (int) '?'));
328 /* set by atom case clauses */
329 new_atom_start_offset
= -1;
330 new_atom_char_length
= -1;
331 new_atom_start_captures
= re_ctx
->captures
;
333 switch (re_ctx
->curr_token
.t
) {
334 case DUK_RETOK_DISJUNCTION
: {
336 * The handling here is a bit tricky. If a previous '|' has been processed,
337 * we have a pending split1 and a pending jump (for a previous match). These
338 * need to be back-patched carefully. See docs for a detailed example.
341 /* patch pending jump and split */
342 if (unpatched_disjunction_jump
>= 0) {
345 DUK_ASSERT(unpatched_disjunction_split
>= 0);
346 offset
= unpatched_disjunction_jump
;
347 offset
+= duk__insert_jump_offset(re_ctx
,
349 (duk_int32_t
) (DUK__RE_BUFLEN(re_ctx
) - offset
));
350 /* offset is now target of the pending split (right after jump) */
351 duk__insert_jump_offset(re_ctx
,
352 unpatched_disjunction_split
,
353 offset
- unpatched_disjunction_split
);
356 /* add a new pending split to the beginning of the entire disjunction */
357 (void) duk__insert_u32(re_ctx
,
359 DUK_REOP_SPLIT1
); /* prefer direct execution */
360 unpatched_disjunction_split
= entry_offset
+ 1; /* +1 for opcode */
362 /* add a new pending match jump for latest finished alternative */
363 duk__append_u32(re_ctx
, DUK_REOP_JUMP
);
364 unpatched_disjunction_jump
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
366 /* 'taint' result as complex */
370 case DUK_RETOK_QUANTIFIER
: {
371 if (atom_start_offset
< 0) {
372 DUK_ERROR_SYNTAX(re_ctx
->thr
, DUK_STR_INVALID_QUANTIFIER_NO_ATOM
);
374 if (re_ctx
->curr_token
.qmin
> re_ctx
->curr_token
.qmax
) {
375 DUK_ERROR_SYNTAX(re_ctx
->thr
, DUK_STR_INVALID_QUANTIFIER_VALUES
);
377 if (atom_char_length
>= 0) {
381 * If atom_char_length is zero, we'll have unbounded execution time for e.g.
382 * /()*x/.exec('x'). We can't just skip the match because it might have some
383 * side effects (for instance, if we allowed captures in simple atoms, the
384 * capture needs to happen). The simple solution below is to force the
385 * quantifier to match at most once, since the additional matches have no effect.
387 * With a simple atom there can be no capture groups, so no captures need
390 duk_int32_t atom_code_length
;
392 duk_uint32_t qmin
, qmax
;
394 qmin
= re_ctx
->curr_token
.qmin
;
395 qmax
= re_ctx
->curr_token
.qmax
;
396 if (atom_char_length
== 0) {
397 /* qmin and qmax will be 0 or 1 */
406 duk__append_u32(re_ctx
, DUK_REOP_MATCH
); /* complete 'sub atom' */
407 atom_code_length
= (duk_int32_t
) (DUK__RE_BUFLEN(re_ctx
) - atom_start_offset
);
409 offset
= atom_start_offset
;
410 if (re_ctx
->curr_token
.greedy
) {
411 offset
+= duk__insert_u32(re_ctx
, offset
, DUK_REOP_SQGREEDY
);
412 offset
+= duk__insert_u32(re_ctx
, offset
, qmin
);
413 offset
+= duk__insert_u32(re_ctx
, offset
, qmax
);
414 offset
+= duk__insert_u32(re_ctx
, offset
, atom_char_length
);
415 offset
+= duk__insert_jump_offset(re_ctx
, offset
, atom_code_length
);
417 offset
+= duk__insert_u32(re_ctx
, offset
, DUK_REOP_SQMINIMAL
);
418 offset
+= duk__insert_u32(re_ctx
, offset
, qmin
);
419 offset
+= duk__insert_u32(re_ctx
, offset
, qmax
);
420 offset
+= duk__insert_jump_offset(re_ctx
, offset
, atom_code_length
);
422 DUK_UNREF(offset
); /* silence scan-build warning */
427 * The original code is used as a template, and removed at the end
428 * (this differs from the handling of simple quantifiers).
430 * NOTE: there is no current solution for empty atoms in complex
431 * quantifiers. This would need some sort of a 'progress' instruction.
433 * XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
435 duk_int32_t atom_code_length
;
436 duk_uint32_t atom_copies
;
437 duk_uint32_t tmp_qmin
, tmp_qmax
;
439 /* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
440 atom_copies
= (re_ctx
->curr_token
.qmax
== DUK_RE_QUANTIFIER_INFINITE
) ?
441 re_ctx
->curr_token
.qmin
: re_ctx
->curr_token
.qmax
;
442 if (atom_copies
> DUK_RE_MAX_ATOM_COPIES
) {
443 DUK_ERROR_RANGE(re_ctx
->thr
, DUK_STR_QUANTIFIER_TOO_MANY_COPIES
);
446 /* wipe the capture range made by the atom (if any) */
447 DUK_ASSERT(atom_start_captures
<= re_ctx
->captures
);
448 if (atom_start_captures
!= re_ctx
->captures
) {
449 DUK_ASSERT(atom_start_captures
< re_ctx
->captures
);
450 DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
451 (long) atom_start_captures
, (long) re_ctx
->captures
));
453 /* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
454 duk__insert_u32(re_ctx
, atom_start_offset
, (re_ctx
->captures
- atom_start_captures
) * 2);
455 duk__insert_u32(re_ctx
, atom_start_offset
, (atom_start_captures
+ 1) * 2);
456 duk__insert_u32(re_ctx
, atom_start_offset
, DUK_REOP_WIPERANGE
);
458 DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
459 (long) atom_start_captures
));
462 atom_code_length
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
) - atom_start_offset
;
464 /* insert the required matches (qmin) by copying the atom */
465 tmp_qmin
= re_ctx
->curr_token
.qmin
;
466 tmp_qmax
= re_ctx
->curr_token
.qmax
;
467 while (tmp_qmin
> 0) {
468 duk__append_slice(re_ctx
, atom_start_offset
, atom_code_length
);
470 if (tmp_qmax
!= DUK_RE_QUANTIFIER_INFINITE
) {
474 DUK_ASSERT(tmp_qmin
== 0);
476 /* insert code for matching the remainder - infinite or finite */
477 if (tmp_qmax
== DUK_RE_QUANTIFIER_INFINITE
) {
478 /* reuse last emitted atom for remaining 'infinite' quantifier */
480 if (re_ctx
->curr_token
.qmin
== 0) {
481 /* Special case: original qmin was zero so there is nothing
482 * to repeat. Emit an atom copy but jump over it here.
484 duk__append_u32(re_ctx
, DUK_REOP_JUMP
);
485 duk__append_jump_offset(re_ctx
, atom_code_length
);
486 duk__append_slice(re_ctx
, atom_start_offset
, atom_code_length
);
488 if (re_ctx
->curr_token
.greedy
) {
489 duk__append_u32(re_ctx
, DUK_REOP_SPLIT2
); /* prefer jump */
491 duk__append_u32(re_ctx
, DUK_REOP_SPLIT1
); /* prefer direct */
493 duk__append_jump_offset(re_ctx
, -atom_code_length
- 1); /* -1 for opcode */
496 * The remaining matches are emitted as sequence of SPLITs and atom
497 * copies; the SPLITs skip the remaining copies and match the sequel.
498 * This sequence needs to be emitted starting from the last copy
499 * because the SPLITs are variable length due to the variable length
500 * skip offset. This causes a lot of memory copying now.
502 * Example structure (greedy, match maximum # atoms):
506 * SPLIT1 LSEQ ; <- the byte length of this instruction is needed
507 * (atom) ; to encode the above SPLIT1 correctly
511 duk_uint32_t offset
= (duk_uint32_t
) DUK__RE_BUFLEN(re_ctx
);
512 while (tmp_qmax
> 0) {
513 duk__insert_slice(re_ctx
, offset
, atom_start_offset
, atom_code_length
);
514 if (re_ctx
->curr_token
.greedy
) {
515 duk__insert_u32(re_ctx
, offset
, DUK_REOP_SPLIT1
); /* prefer direct */
517 duk__insert_u32(re_ctx
, offset
, DUK_REOP_SPLIT2
); /* prefer jump */
519 duk__insert_jump_offset(re_ctx
,
520 offset
+ 1, /* +1 for opcode */
521 (duk_int32_t
) (DUK__RE_BUFLEN(re_ctx
) - (offset
+ 1)));
526 /* remove the original 'template' atom */
527 duk__remove_slice(re_ctx
, atom_start_offset
, atom_code_length
);
530 /* 'taint' result as complex */
534 case DUK_RETOK_ASSERT_START
: {
535 duk__append_u32(re_ctx
, DUK_REOP_ASSERT_START
);
538 case DUK_RETOK_ASSERT_END
: {
539 duk__append_u32(re_ctx
, DUK_REOP_ASSERT_END
);
542 case DUK_RETOK_ASSERT_WORD_BOUNDARY
: {
543 duk__append_u32(re_ctx
, DUK_REOP_ASSERT_WORD_BOUNDARY
);
546 case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY
: {
547 duk__append_u32(re_ctx
, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY
);
550 case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD
:
551 case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD
: {
553 duk_uint32_t opcode
= (re_ctx
->curr_token
.t
== DUK_RETOK_ASSERT_START_POS_LOOKAHEAD
) ?
554 DUK_REOP_LOOKPOS
: DUK_REOP_LOOKNEG
;
556 offset
= (duk_uint32_t
) DUK__RE_BUFLEN(re_ctx
);
557 duk__parse_disjunction(re_ctx
, 0, &tmp_disj
);
558 duk__append_u32(re_ctx
, DUK_REOP_MATCH
);
560 (void) duk__insert_u32(re_ctx
, offset
, opcode
);
561 (void) duk__insert_jump_offset(re_ctx
,
562 offset
+ 1, /* +1 for opcode */
563 (duk_int32_t
) (DUK__RE_BUFLEN(re_ctx
) - (offset
+ 1)));
565 /* 'taint' result as complex -- this is conservative,
566 * as lookaheads do not backtrack.
571 case DUK_RETOK_ATOM_PERIOD
: {
572 new_atom_char_length
= 1;
573 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
574 duk__append_u32(re_ctx
, DUK_REOP_PERIOD
);
577 case DUK_RETOK_ATOM_CHAR
: {
578 /* Note: successive characters could be joined into string matches
579 * but this is not trivial (consider e.g. '/xyz+/); see docs for
584 new_atom_char_length
= 1;
585 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
586 duk__append_u32(re_ctx
, DUK_REOP_CHAR
);
587 ch
= re_ctx
->curr_token
.num
;
588 if (re_ctx
->re_flags
& DUK_RE_FLAG_IGNORE_CASE
) {
589 ch
= duk_unicode_re_canonicalize_char(re_ctx
->thr
, ch
);
591 duk__append_u32(re_ctx
, ch
);
594 case DUK_RETOK_ATOM_DIGIT
:
595 case DUK_RETOK_ATOM_NOT_DIGIT
: {
596 new_atom_char_length
= 1;
597 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
598 duk__append_u32(re_ctx
,
599 (re_ctx
->curr_token
.t
== DUK_RETOK_ATOM_DIGIT
) ?
600 DUK_REOP_RANGES
: DUK_REOP_INVRANGES
);
601 duk__append_u32(re_ctx
, sizeof(duk_unicode_re_ranges_digit
) / (2 * sizeof(duk_uint16_t
)));
602 duk__append_u16_list(re_ctx
, duk_unicode_re_ranges_digit
, sizeof(duk_unicode_re_ranges_digit
) / sizeof(duk_uint16_t
));
605 case DUK_RETOK_ATOM_WHITE
:
606 case DUK_RETOK_ATOM_NOT_WHITE
: {
607 new_atom_char_length
= 1;
608 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
609 duk__append_u32(re_ctx
,
610 (re_ctx
->curr_token
.t
== DUK_RETOK_ATOM_WHITE
) ?
611 DUK_REOP_RANGES
: DUK_REOP_INVRANGES
);
612 duk__append_u32(re_ctx
, sizeof(duk_unicode_re_ranges_white
) / (2 * sizeof(duk_uint16_t
)));
613 duk__append_u16_list(re_ctx
, duk_unicode_re_ranges_white
, sizeof(duk_unicode_re_ranges_white
) / sizeof(duk_uint16_t
));
616 case DUK_RETOK_ATOM_WORD_CHAR
:
617 case DUK_RETOK_ATOM_NOT_WORD_CHAR
: {
618 new_atom_char_length
= 1;
619 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
620 duk__append_u32(re_ctx
,
621 (re_ctx
->curr_token
.t
== DUK_RETOK_ATOM_WORD_CHAR
) ?
622 DUK_REOP_RANGES
: DUK_REOP_INVRANGES
);
623 duk__append_u32(re_ctx
, sizeof(duk_unicode_re_ranges_wordchar
) / (2 * sizeof(duk_uint16_t
)));
624 duk__append_u16_list(re_ctx
, duk_unicode_re_ranges_wordchar
, sizeof(duk_unicode_re_ranges_wordchar
) / sizeof(duk_uint16_t
));
627 case DUK_RETOK_ATOM_BACKREFERENCE
: {
628 duk_uint32_t backref
= (duk_uint32_t
) re_ctx
->curr_token
.num
;
629 if (backref
> re_ctx
->highest_backref
) {
630 re_ctx
->highest_backref
= backref
;
632 new_atom_char_length
= -1; /* mark as complex */
633 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
634 duk__append_u32(re_ctx
, DUK_REOP_BACKREFERENCE
);
635 duk__append_u32(re_ctx
, backref
);
638 case DUK_RETOK_ATOM_START_CAPTURE_GROUP
: {
641 new_atom_char_length
= -1; /* mark as complex (capture handling) */
642 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
643 cap
= ++re_ctx
->captures
;
644 duk__append_u32(re_ctx
, DUK_REOP_SAVE
);
645 duk__append_u32(re_ctx
, cap
* 2);
646 duk__parse_disjunction(re_ctx
, 0, &tmp_disj
); /* retval (sub-atom char length) unused, tainted as complex above */
647 duk__append_u32(re_ctx
, DUK_REOP_SAVE
);
648 duk__append_u32(re_ctx
, cap
* 2 + 1);
651 case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP
: {
652 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
653 duk__parse_disjunction(re_ctx
, 0, &tmp_disj
);
654 new_atom_char_length
= tmp_disj
.charlen
;
657 case DUK_RETOK_ATOM_START_CHARCLASS
:
658 case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED
: {
660 * Range parsing is done with a special lexer function which calls
661 * us for every range parsed. This is different from how rest of
662 * the parsing works, but avoids a heavy, arbitrary size intermediate
663 * value type to hold the ranges.
665 * Another complication is the handling of character ranges when
666 * case insensitive matching is used (see docs for discussion).
667 * The range handler callback given to the lexer takes care of this
670 * Note that duplicate ranges are not eliminated when parsing character
671 * classes, so that canonicalization of
675 * creates the result (note the duplicate ranges):
679 * where [x-{] is split as a result of canonicalization. The duplicate
680 * ranges are not a semantics issue: they work correctly.
685 DUK_DD(DUK_DDPRINT("character class"));
687 /* insert ranges instruction, range count patched in later */
688 new_atom_char_length
= 1;
689 new_atom_start_offset
= (duk_int32_t
) DUK__RE_BUFLEN(re_ctx
);
690 duk__append_u32(re_ctx
,
691 (re_ctx
->curr_token
.t
== DUK_RETOK_ATOM_START_CHARCLASS
) ?
692 DUK_REOP_RANGES
: DUK_REOP_INVRANGES
);
693 offset
= (duk_uint32_t
) DUK__RE_BUFLEN(re_ctx
); /* patch in range count later */
695 /* parse ranges until character class ends */
696 re_ctx
->nranges
= 0; /* note: ctx-wide temporary */
697 duk_lexer_parse_re_ranges(&re_ctx
->lex
, duk__generate_ranges
, (void *) re_ctx
);
699 /* insert range count */
700 duk__insert_u32(re_ctx
, offset
, re_ctx
->nranges
);
703 case DUK_RETOK_ATOM_END_GROUP
: {
705 DUK_ERROR_SYNTAX(re_ctx
->thr
, DUK_STR_UNEXPECTED_CLOSING_PAREN
);
709 case DUK_RETOK_EOF
: {
711 DUK_ERROR_SYNTAX(re_ctx
->thr
, DUK_STR_UNEXPECTED_END_OF_PATTERN
);
716 DUK_ERROR_SYNTAX(re_ctx
->thr
, DUK_STR_UNEXPECTED_REGEXP_TOKEN
);
720 /* a complex (new) atom taints the result */
721 if (new_atom_start_offset
>= 0) {
722 if (new_atom_char_length
< 0) {
724 } else if (res_charlen
>= 0) {
725 /* only advance if not tainted */
726 res_charlen
+= new_atom_char_length
;
730 /* record previous atom info in case next token is a quantifier */
731 atom_start_offset
= new_atom_start_offset
;
732 atom_char_length
= new_atom_char_length
;
733 atom_start_captures
= new_atom_start_captures
;
738 /* finish up pending jump and split for last alternative */
739 if (unpatched_disjunction_jump
>= 0) {
742 DUK_ASSERT(unpatched_disjunction_split
>= 0);
743 offset
= unpatched_disjunction_jump
;
744 offset
+= duk__insert_jump_offset(re_ctx
,
746 (duk_int32_t
) (DUK__RE_BUFLEN(re_ctx
) - offset
));
747 /* offset is now target of the pending split (right after jump) */
748 duk__insert_jump_offset(re_ctx
,
749 unpatched_disjunction_split
,
750 offset
- unpatched_disjunction_split
);
754 out_atom_info
->end_captures
= re_ctx
->captures
;
756 out_atom_info
->charlen
= res_charlen
;
757 DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
758 (long) out_atom_info
->charlen
));
760 re_ctx
->recursion_depth
--;
764 * Flags parsing (see E5 Section 15.10.4.1).
767 DUK_LOCAL duk_uint32_t
duk__parse_regexp_flags(duk_hthread
*thr
, duk_hstring
*h
) {
768 const duk_uint8_t
*p
;
769 const duk_uint8_t
*p_end
;
770 duk_uint32_t flags
= 0;
772 p
= DUK_HSTRING_GET_DATA(h
);
773 p_end
= p
+ DUK_HSTRING_GET_BYTELEN(h
);
775 /* Note: can be safely scanned as bytes (undecoded) */
778 duk_uint8_t c
= *p
++;
781 if (flags
& DUK_RE_FLAG_GLOBAL
) {
784 flags
|= DUK_RE_FLAG_GLOBAL
;
788 if (flags
& DUK_RE_FLAG_IGNORE_CASE
) {
791 flags
|= DUK_RE_FLAG_IGNORE_CASE
;
795 if (flags
& DUK_RE_FLAG_MULTILINE
) {
798 flags
|= DUK_RE_FLAG_MULTILINE
;
810 DUK_ERROR_SYNTAX(thr
, DUK_STR_INVALID_REGEXP_FLAGS
);
811 return 0; /* never here */
815 * Create escaped RegExp source (E5 Section 15.10.3).
817 * The current approach is to special case the empty RegExp
818 * ('' -> '(?:)') and otherwise replace unescaped '/' characters
819 * with '\/' regardless of where they occur in the regexp.
821 * Note that normalization does not seem to be necessary for
822 * RegExp literals (e.g. '/foo/') because to be acceptable as
823 * a RegExp literal, the text between forward slashes must
824 * already match the escaping requirements (e.g. must not contain
825 * unescaped forward slashes or be empty). Escaping IS needed
826 * for expressions like 'new Regexp("...", "")' however.
827 * Currently, we re-escape in either case.
829 * Also note that we process the source here in UTF-8 encoded
830 * form. This is correct, because any non-ASCII characters are
831 * passed through without change.
834 DUK_LOCAL
void duk__create_escaped_source(duk_hthread
*thr
, int idx_pattern
) {
835 duk_context
*ctx
= (duk_context
*) thr
;
837 const duk_uint8_t
*p
;
838 duk_bufwriter_ctx bw_alloc
;
839 duk_bufwriter_ctx
*bw
;
842 duk_uint_fast8_t c_prev
, c
;
844 h
= duk_get_hstring(ctx
, idx_pattern
);
845 DUK_ASSERT(h
!= NULL
);
846 p
= (const duk_uint8_t
*) DUK_HSTRING_GET_DATA(h
);
847 n
= (duk_size_t
) DUK_HSTRING_GET_BYTELEN(h
);
851 duk_push_hstring_stridx(ctx
, DUK_STRIDX_ESCAPED_EMPTY_REGEXP
);
856 DUK_BW_INIT_PUSHBUF(thr
, bw
, n
);
857 q
= DUK_BW_GET_PTR(thr
, bw
);
859 c_prev
= (duk_uint_fast8_t
) 0;
861 for (i
= 0; i
< n
; i
++) {
864 q
= DUK_BW_ENSURE_RAW(thr
, bw
, 2, q
);
866 if (c
== (duk_uint_fast8_t
) '/' && c_prev
!= (duk_uint_fast8_t
) '\\') {
867 /* Unescaped '/' ANYWHERE in the regexp (in disjunction,
868 * inside a character class, ...) => same escape works.
870 *q
++ = DUK_ASC_BACKSLASH
;
872 *q
++ = (duk_uint8_t
) c
;
877 DUK_BW_SETPTR_AND_COMPACT(thr
, bw
, q
);
878 duk_to_string(ctx
, -1); /* -> [ ... escaped_source ] */
882 * Exposed regexp compilation primitive.
884 * Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
885 * actual parsing. Handles generation of the compiled regexp header and the
886 * "boilerplate" capture of the matching substring (save 0 and 1). Also does some
887 * global level regexp checks after recursive compilation has finished.
889 * An escaped version of the regexp source, suitable for use as a RegExp instance
890 * 'source' property (see E5 Section 15.10.3), is also left on the stack.
892 * Input stack: [ pattern flags ]
893 * Output stack: [ bytecode escaped_source ] (both as strings)
896 DUK_INTERNAL
void duk_regexp_compile(duk_hthread
*thr
) {
897 duk_context
*ctx
= (duk_context
*) thr
;
898 duk_re_compiler_ctx re_ctx
;
899 duk_lexer_point lex_point
;
900 duk_hstring
*h_pattern
;
901 duk_hstring
*h_flags
;
902 duk__re_disjunction_info ign_disj
;
904 DUK_ASSERT(thr
!= NULL
);
905 DUK_ASSERT(ctx
!= NULL
);
911 /* TypeError if fails */
912 h_pattern
= duk_require_hstring(ctx
, -2);
913 h_flags
= duk_require_hstring(ctx
, -1);
916 * Create normalized 'source' property (E5 Section 15.10.3).
919 /* [ ... pattern flags ] */
921 duk__create_escaped_source(thr
, -2);
923 /* [ ... pattern flags escaped_source ] */
926 * Init compilation context
929 /* [ ... pattern flags escaped_source buffer ] */
931 DUK_MEMZERO(&re_ctx
, sizeof(re_ctx
));
932 DUK_LEXER_INITCTX(&re_ctx
.lex
); /* duplicate zeroing, expect for (possible) NULL inits */
934 re_ctx
.lex
.thr
= thr
;
935 re_ctx
.lex
.input
= DUK_HSTRING_GET_DATA(h_pattern
);
936 re_ctx
.lex
.input_length
= DUK_HSTRING_GET_BYTELEN(h_pattern
);
937 re_ctx
.lex
.token_limit
= DUK_RE_COMPILE_TOKEN_LIMIT
;
938 re_ctx
.recursion_limit
= DUK_USE_REGEXP_COMPILER_RECLIMIT
;
939 re_ctx
.re_flags
= duk__parse_regexp_flags(thr
, h_flags
);
941 DUK_BW_INIT_PUSHBUF(thr
, &re_ctx
.bw
, DUK__RE_INITIAL_BUFSIZE
);
943 DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
944 (unsigned long) re_ctx
.re_flags
, (long) re_ctx
.recursion_limit
));
950 lex_point
.offset
= 0; /* expensive init, just want to fill window */
952 DUK_LEXER_SETPOINT(&re_ctx
.lex
, &lex_point
);
958 DUK_DD(DUK_DDPRINT("starting regexp compilation"));
960 duk__append_u32(&re_ctx
, DUK_REOP_SAVE
);
961 duk__append_u32(&re_ctx
, 0);
962 duk__parse_disjunction(&re_ctx
, 1 /*expect_eof*/, &ign_disj
);
963 duk__append_u32(&re_ctx
, DUK_REOP_SAVE
);
964 duk__append_u32(&re_ctx
, 1);
965 duk__append_u32(&re_ctx
, DUK_REOP_MATCH
);
968 * Check for invalid backreferences; note that it is NOT an error
969 * to back-reference a capture group which has not yet been introduced
970 * in the pattern (as in /\1(foo)/); in fact, the backreference will
971 * always match! It IS an error to back-reference a capture group
972 * which will never be introduced in the pattern. Thus, we can check
973 * for such references only after parsing is complete.
976 if (re_ctx
.highest_backref
> re_ctx
.captures
) {
977 DUK_ERROR_SYNTAX(thr
, DUK_STR_INVALID_BACKREFS
);
981 * Emit compiled regexp header: flags, ncaptures
982 * (insertion order inverted on purpose)
985 duk__insert_u32(&re_ctx
, 0, (re_ctx
.captures
+ 1) * 2);
986 duk__insert_u32(&re_ctx
, 0, re_ctx
.re_flags
);
988 /* [ ... pattern flags escaped_source buffer ] */
990 DUK_BW_COMPACT(thr
, &re_ctx
.bw
);
991 duk_to_string(ctx
, -1); /* coerce to string */
993 /* [ ... pattern flags escaped_source bytecode ] */
999 duk_remove(ctx
, -4); /* -> [ ... flags escaped_source bytecode ] */
1000 duk_remove(ctx
, -3); /* -> [ ... escaped_source bytecode ] */
1002 DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
1003 (duk_tval
*) duk_get_tval(ctx
, -1), (duk_tval
*) duk_get_tval(ctx
, -2)));
1007 * Create a RegExp instance (E5 Section 15.10.7).
1009 * Note: the output stack left by duk_regexp_compile() is directly compatible
1010 * with the input here.
1012 * Input stack: [ escaped_source bytecode ] (both as strings)
1013 * Output stack: [ RegExp ]
1016 DUK_INTERNAL
void duk_regexp_create_instance(duk_hthread
*thr
) {
1017 duk_context
*ctx
= (duk_context
*) thr
;
1020 duk_small_int_t re_flags
;
1022 /* [ ... escape_source bytecode ] */
1024 h_bc
= duk_get_hstring(ctx
, -1);
1025 DUK_ASSERT(h_bc
!= NULL
);
1026 DUK_ASSERT(DUK_HSTRING_GET_BYTELEN(h_bc
) >= 1); /* always at least the header */
1027 DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h_bc
) >= 1);
1028 DUK_ASSERT((duk_small_int_t
) DUK_HSTRING_GET_DATA(h_bc
)[0] < 0x80); /* flags always encodes to 1 byte */
1029 re_flags
= (duk_small_int_t
) DUK_HSTRING_GET_DATA(h_bc
)[0];
1031 /* [ ... escaped_source bytecode ] */
1033 duk_push_object(ctx
);
1034 h
= duk_get_hobject(ctx
, -1);
1035 DUK_ASSERT(h
!= NULL
);
1036 duk_insert(ctx
, -3);
1038 /* [ ... regexp_object escaped_source bytecode ] */
1040 DUK_HOBJECT_SET_CLASS_NUMBER(h
, DUK_HOBJECT_CLASS_REGEXP
);
1041 DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr
, h
, thr
->builtins
[DUK_BIDX_REGEXP_PROTOTYPE
]);
1043 duk_xdef_prop_stridx(ctx
, -3, DUK_STRIDX_INT_BYTECODE
, DUK_PROPDESC_FLAGS_NONE
);
1045 /* [ ... regexp_object escaped_source ] */
1047 duk_xdef_prop_stridx(ctx
, -2, DUK_STRIDX_SOURCE
, DUK_PROPDESC_FLAGS_NONE
);
1049 /* [ ... regexp_object ] */
1051 duk_push_boolean(ctx
, (re_flags
& DUK_RE_FLAG_GLOBAL
));
1052 duk_xdef_prop_stridx(ctx
, -2, DUK_STRIDX_GLOBAL
, DUK_PROPDESC_FLAGS_NONE
);
1054 duk_push_boolean(ctx
, (re_flags
& DUK_RE_FLAG_IGNORE_CASE
));
1055 duk_xdef_prop_stridx(ctx
, -2, DUK_STRIDX_IGNORE_CASE
, DUK_PROPDESC_FLAGS_NONE
);
1057 duk_push_boolean(ctx
, (re_flags
& DUK_RE_FLAG_MULTILINE
));
1058 duk_xdef_prop_stridx(ctx
, -2, DUK_STRIDX_MULTILINE
, DUK_PROPDESC_FLAGS_NONE
);
1060 duk_push_int(ctx
, 0);
1061 duk_xdef_prop_stridx(ctx
, -2, DUK_STRIDX_LAST_INDEX
, DUK_PROPDESC_FLAGS_W
);
1063 /* [ ... regexp_object ] */
1066 #undef DUK__RE_BUFLEN
1068 #else /* DUK_USE_REGEXP_SUPPORT */
1070 /* regexp support disabled */
1072 #endif /* DUK_USE_REGEXP_SUPPORT */