]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Regexp compilation. | |
3 | * | |
4 | * See doc/regexp.rst for a discussion of the compilation approach and | |
5 | * current limitations. | |
6 | * | |
7 | * Regexp bytecode assumes jumps can be expressed with signed 32-bit | |
8 | * integers. Consequently the bytecode size must not exceed 0x7fffffffL. | |
9 | * The implementation casts duk_size_t (buffer size) to duk_(u)int32_t | |
10 | * in many places. Although this could be changed, the bytecode format | |
11 | * limit would still prevent regexps exceeding the signed 32-bit limit | |
12 | * from working. | |
13 | * | |
14 | * XXX: The implementation does not prevent bytecode from exceeding the | |
15 | * maximum supported size. This could be done by limiting the maximum | |
16 | * input string size (assuming an upper bound can be computed for number | |
17 | * of bytecode bytes emitted per input byte) or checking buffer maximum | |
18 | * size when emitting bytecode (slower). | |
19 | */ | |
20 | ||
21 | #include "duk_internal.h" | |
22 | ||
23 | #ifdef DUK_USE_REGEXP_SUPPORT | |
24 | ||
25 | /* | |
26 | * Helper macros | |
27 | */ | |
28 | ||
29 | #define DUK__RE_INITIAL_BUFSIZE 64 | |
30 | ||
31 | #undef DUK__RE_BUFLEN | |
32 | #define DUK__RE_BUFLEN(re_ctx) \ | |
33 | DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw) | |
34 | ||
35 | /* | |
36 | * Disjunction struct: result of parsing a disjunction | |
37 | */ | |
38 | ||
39 | typedef struct { | |
40 | /* Number of characters that the atom matches (e.g. 3 for 'abc'), | |
41 | * -1 if atom is complex and number of matched characters either | |
42 | * varies or is not known. | |
43 | */ | |
44 | duk_int32_t charlen; | |
45 | ||
46 | #if 0 | |
47 | /* These are not needed to implement quantifier capture handling, | |
48 | * but might be needed at some point. | |
49 | */ | |
50 | ||
51 | /* re_ctx->captures at start and end of atom parsing. | |
52 | * Since 'captures' indicates highest capture number emitted | |
53 | * so far in a DUK_REOP_SAVE, the captures numbers saved by | |
54 | * the atom are: ]start_captures,end_captures]. | |
55 | */ | |
56 | duk_uint32_t start_captures; | |
57 | duk_uint32_t end_captures; | |
58 | #endif | |
59 | } duk__re_disjunction_info; | |
60 | ||
61 | /* | |
62 | * Encoding helpers | |
63 | * | |
64 | * Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit | |
65 | * even though the buffer operations will use duk_size_t. | |
66 | */ | |
67 | ||
68 | /* XXX: the insert helpers should ensure that the bytecode result is not | |
69 | * larger than expected (or at least assert for it). Many things in the | |
70 | * bytecode, like skip offsets, won't work correctly if the bytecode is | |
71 | * larger than say 2G. | |
72 | */ | |
73 | ||
74 | DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) { | |
75 | if (x < 0) { | |
76 | return ((duk_uint32_t) (-x)) * 2 + 1; | |
77 | } else { | |
78 | return ((duk_uint32_t) x) * 2; | |
79 | } | |
80 | } | |
81 | ||
82 | /* XXX: return type should probably be duk_size_t, or explicit checks are needed for | |
83 | * maximum size. | |
84 | */ | |
85 | DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) { | |
86 | duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH]; | |
87 | duk_small_int_t len; | |
88 | ||
89 | len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf); | |
90 | DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, len); | |
91 | return (duk_uint32_t) len; | |
92 | } | |
93 | ||
94 | DUK_LOCAL duk_uint32_t duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) { | |
95 | duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH]; | |
96 | duk_small_int_t len; | |
97 | ||
98 | len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf); | |
99 | DUK_BW_WRITE_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, buf, len); | |
100 | return (duk_uint32_t) len; | |
101 | } | |
102 | ||
103 | DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) { | |
104 | return duk__insert_u32(re_ctx, offset, duk__encode_i32(x)); | |
105 | } | |
106 | ||
107 | #if 0 /* unused */ | |
108 | DUK_LOCAL duk_uint32_t duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) { | |
109 | return duk__append_u32(re_ctx, duk__encode_i32(x)); | |
110 | } | |
111 | #endif | |
112 | ||
113 | /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */ | |
114 | DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, duk_uint16_t *values, duk_uint32_t count) { | |
115 | /* Call sites don't need the result length so it's not accumulated. */ | |
116 | while (count > 0) { | |
117 | (void) duk__append_u32(re_ctx, (duk_uint32_t) (*values++)); | |
118 | count--; | |
119 | } | |
120 | } | |
121 | ||
122 | DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) { | |
123 | DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length); | |
124 | } | |
125 | ||
126 | DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) { | |
127 | DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length); | |
128 | } | |
129 | ||
130 | DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) { | |
131 | DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length); | |
132 | } | |
133 | ||
134 | /* | |
135 | * Insert a jump offset at 'offset' to complete an instruction | |
136 | * (the jump offset is always the last component of an instruction). | |
137 | * The 'skip' argument must be computed relative to 'offset', | |
138 | * -without- taking into account the skip field being inserted. | |
139 | * | |
140 | * ... A B C ins X Y Z ... (ins may be a JUMP, SPLIT1/SPLIT2, etc) | |
141 | * => ... A B C ins SKIP X Y Z | |
142 | * | |
143 | * Computing the final (adjusted) skip value, which is relative to the | |
144 | * first byte of the next instruction, is a bit tricky because of the | |
145 | * variable length UTF-8 encoding. See doc/regexp.rst for discussion. | |
146 | */ | |
147 | DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) { | |
148 | duk_small_int_t len; | |
149 | ||
150 | /* XXX: solve into closed form (smaller code) */ | |
151 | ||
152 | if (skip < 0) { | |
153 | /* two encoding attempts suffices */ | |
154 | len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip)); | |
155 | len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len)); | |
156 | DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len); /* no change */ | |
157 | skip -= (duk_int32_t) len; | |
158 | } | |
159 | return duk__insert_i32(re_ctx, offset, skip); | |
160 | } | |
161 | ||
162 | DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) { | |
163 | return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip); | |
164 | } | |
165 | ||
166 | /* | |
167 | * duk_re_range_callback for generating character class ranges. | |
168 | * | |
169 | * When ignoreCase is false, the range is simply emitted as is. | |
170 | * We don't, for instance, eliminate duplicates or overlapping | |
171 | * ranges in a character class. | |
172 | * | |
173 | * When ignoreCase is true, the range needs to be normalized through | |
174 | * canonicalization. Unfortunately a canonicalized version of a | |
175 | * continuous range is not necessarily continuous (e.g. [x-{] is | |
176 | * continuous but [X-{] is not). The current algorithm creates the | |
177 | * canonicalized range(s) space efficiently at the cost of compile | |
178 | * time execution time (see doc/regexp.rst for discussion). | |
179 | * | |
180 | * Note that the ctx->nranges is a context-wide temporary value | |
181 | * (this is OK because there cannot be multiple character classes | |
182 | * being parsed simultaneously). | |
183 | */ | |
184 | ||
185 | DUK_LOCAL void duk__generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) { | |
186 | duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata; | |
187 | ||
188 | DUK_DD(DUK_DDPRINT("duk__generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld", | |
189 | (void *) re_ctx, (long) r1, (long) r2, (long) direct)); | |
190 | ||
191 | if (!direct && (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE)) { | |
192 | /* | |
193 | * Canonicalize a range, generating result ranges as necessary. | |
194 | * Needs to exhaustively scan the entire range (at most 65536 | |
195 | * code points). If 'direct' is set, caller (lexer) has ensured | |
196 | * that the range is already canonicalization compatible (this | |
197 | * is used to avoid unnecessary canonicalization of built-in | |
198 | * ranges like \W, which are not affected by canonicalization). | |
199 | * | |
200 | * NOTE: here is one place where we don't want to support chars | |
201 | * outside the BMP, because the exhaustive search would be | |
202 | * massively larger. | |
203 | */ | |
204 | ||
205 | duk_codepoint_t i; | |
206 | duk_codepoint_t t; | |
207 | duk_codepoint_t r_start, r_end; | |
208 | ||
209 | r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1); | |
210 | r_end = r_start; | |
211 | for (i = r1 + 1; i <= r2; i++) { | |
212 | t = duk_unicode_re_canonicalize_char(re_ctx->thr, i); | |
213 | if (t == r_end + 1) { | |
214 | r_end = t; | |
215 | } else { | |
216 | DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end)); | |
217 | duk__append_u32(re_ctx, (duk_uint32_t) r_start); | |
218 | duk__append_u32(re_ctx, (duk_uint32_t) r_end); | |
219 | re_ctx->nranges++; | |
220 | r_start = t; | |
221 | r_end = t; | |
222 | } | |
223 | } | |
224 | DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end)); | |
225 | duk__append_u32(re_ctx, (duk_uint32_t) r_start); | |
226 | duk__append_u32(re_ctx, (duk_uint32_t) r_end); | |
227 | re_ctx->nranges++; | |
228 | } else { | |
229 | DUK_DD(DUK_DDPRINT("direct, emit range: [%ld,%ld]", (long) r1, (long) r2)); | |
230 | duk__append_u32(re_ctx, (duk_uint32_t) r1); | |
231 | duk__append_u32(re_ctx, (duk_uint32_t) r2); | |
232 | re_ctx->nranges++; | |
233 | } | |
234 | } | |
235 | ||
236 | /* | |
237 | * Parse regexp Disjunction. Most of regexp compilation happens here. | |
238 | * | |
239 | * Handles Disjunction, Alternative, and Term productions directly without | |
240 | * recursion. The only constructs requiring recursion are positive/negative | |
241 | * lookaheads, capturing parentheses, and non-capturing parentheses. | |
242 | * | |
243 | * The function determines whether the entire disjunction is a 'simple atom' | |
244 | * (see doc/regexp.rst discussion on 'simple quantifiers') and if so, | |
245 | * returns the atom character length which is needed by the caller to keep | |
246 | * track of its own atom character length. A disjunction with more than one | |
247 | * alternative is never considered a simple atom (although in some cases | |
248 | * that might be the case). | |
249 | * | |
250 | * Return value: simple atom character length or < 0 if not a simple atom. | |
251 | * Appends the bytecode for the disjunction matcher to the end of the temp | |
252 | * buffer. | |
253 | * | |
254 | * Regexp top level structure is: | |
255 | * | |
256 | * Disjunction = Term* | |
257 | * | Term* | Disjunction | |
258 | * | |
259 | * Term = Assertion | |
260 | * | Atom | |
261 | * | Atom Quantifier | |
262 | * | |
263 | * An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/). | |
264 | * | |
265 | * Notes: | |
266 | * | |
267 | * * Tracking of the 'simple-ness' of the current atom vs. the entire | |
268 | * disjunction are separate matters. For instance, the disjunction | |
269 | * may be complex, but individual atoms may be simple. Furthermore, | |
270 | * simple quantifiers are used whenever possible, even if the | |
271 | * disjunction as a whole is complex. | |
272 | * | |
273 | * * The estimate of whether an atom is simple is conservative now, | |
274 | * and it would be possible to expand it. For instance, captures | |
275 | * cause the disjunction to be marked complex, even though captures | |
276 | * -can- be handled by simple quantifiers with some minor modifications. | |
277 | * | |
278 | * * Disjunction 'tainting' as 'complex' is handled at the end of the | |
279 | * main for loop collectively for atoms. Assertions, quantifiers, | |
280 | * and '|' tokens need to taint the result manually if necessary. | |
281 | * Assertions cannot add to result char length, only atoms (and | |
282 | * quantifiers) can; currently quantifiers will taint the result | |
283 | * as complex though. | |
284 | */ | |
285 | ||
286 | DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) { | |
287 | duk_int32_t atom_start_offset = -1; /* negative -> no atom matched on previous round */ | |
288 | duk_int32_t atom_char_length = 0; /* negative -> complex atom */ | |
289 | duk_uint32_t atom_start_captures = re_ctx->captures; /* value of re_ctx->captures at start of atom */ | |
290 | duk_int32_t unpatched_disjunction_split = -1; | |
291 | duk_int32_t unpatched_disjunction_jump = -1; | |
292 | duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); | |
293 | duk_int32_t res_charlen = 0; /* -1 if disjunction is complex, char length if simple */ | |
294 | duk__re_disjunction_info tmp_disj; | |
295 | ||
296 | DUK_ASSERT(out_atom_info != NULL); | |
297 | ||
298 | if (re_ctx->recursion_depth >= re_ctx->recursion_limit) { | |
299 | DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, | |
300 | DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT); | |
301 | } | |
302 | re_ctx->recursion_depth++; | |
303 | ||
304 | #if 0 | |
305 | out_atom_info->start_captures = re_ctx->captures; | |
306 | #endif | |
307 | ||
308 | for (;;) { | |
309 | /* atom_char_length, atom_start_offset, atom_start_offset reflect the | |
310 | * atom matched on the previous loop. If a quantifier is encountered | |
311 | * on this loop, these are needed to handle the quantifier correctly. | |
312 | * new_atom_char_length etc are for the atom parsed on this round; | |
313 | * they're written to atom_char_length etc at the end of the round. | |
314 | */ | |
315 | duk_int32_t new_atom_char_length; /* char length of the atom parsed in this loop */ | |
316 | duk_int32_t new_atom_start_offset; /* bytecode start offset of the atom parsed in this loop | |
317 | * (allows quantifiers to copy the atom bytecode) | |
318 | */ | |
319 | duk_uint32_t new_atom_start_captures; /* re_ctx->captures at the start of the atom parsed in this loop */ | |
320 | ||
321 | duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token); | |
322 | ||
323 | DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)", | |
324 | (long) re_ctx->curr_token.t, | |
325 | (long) re_ctx->curr_token.num, | |
326 | (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ? | |
327 | (int) re_ctx->curr_token.num : (int) '?')); | |
328 | ||
329 | /* set by atom case clauses */ | |
330 | new_atom_start_offset = -1; | |
331 | new_atom_char_length = -1; | |
332 | new_atom_start_captures = re_ctx->captures; | |
333 | ||
334 | switch (re_ctx->curr_token.t) { | |
335 | case DUK_RETOK_DISJUNCTION: { | |
336 | /* | |
337 | * The handling here is a bit tricky. If a previous '|' has been processed, | |
338 | * we have a pending split1 and a pending jump (for a previous match). These | |
339 | * need to be back-patched carefully. See docs for a detailed example. | |
340 | */ | |
341 | ||
342 | /* patch pending jump and split */ | |
343 | if (unpatched_disjunction_jump >= 0) { | |
344 | duk_uint32_t offset; | |
345 | ||
346 | DUK_ASSERT(unpatched_disjunction_split >= 0); | |
347 | offset = unpatched_disjunction_jump; | |
348 | offset += duk__insert_jump_offset(re_ctx, | |
349 | offset, | |
350 | (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset)); | |
351 | /* offset is now target of the pending split (right after jump) */ | |
352 | duk__insert_jump_offset(re_ctx, | |
353 | unpatched_disjunction_split, | |
354 | offset - unpatched_disjunction_split); | |
355 | } | |
356 | ||
357 | /* add a new pending split to the beginning of the entire disjunction */ | |
358 | (void) duk__insert_u32(re_ctx, | |
359 | entry_offset, | |
360 | DUK_REOP_SPLIT1); /* prefer direct execution */ | |
361 | unpatched_disjunction_split = entry_offset + 1; /* +1 for opcode */ | |
362 | ||
363 | /* add a new pending match jump for latest finished alternative */ | |
364 | duk__append_u32(re_ctx, DUK_REOP_JUMP); | |
365 | unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
366 | ||
367 | /* 'taint' result as complex */ | |
368 | res_charlen = -1; | |
369 | break; | |
370 | } | |
371 | case DUK_RETOK_QUANTIFIER: { | |
372 | if (atom_start_offset < 0) { | |
373 | DUK_ERROR(re_ctx->thr, DUK_ERR_SYNTAX_ERROR, | |
374 | DUK_STR_INVALID_QUANTIFIER_NO_ATOM); | |
375 | } | |
376 | if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) { | |
377 | DUK_ERROR(re_ctx->thr, DUK_ERR_SYNTAX_ERROR, | |
378 | DUK_STR_INVALID_QUANTIFIER_VALUES); | |
379 | } | |
380 | if (atom_char_length >= 0) { | |
381 | /* | |
382 | * Simple atom | |
383 | * | |
384 | * If atom_char_length is zero, we'll have unbounded execution time for e.g. | |
385 | * /()*x/.exec('x'). We can't just skip the match because it might have some | |
386 | * side effects (for instance, if we allowed captures in simple atoms, the | |
387 | * capture needs to happen). The simple solution below is to force the | |
388 | * quantifier to match at most once, since the additional matches have no effect. | |
389 | * | |
390 | * With a simple atom there can be no capture groups, so no captures need | |
391 | * to be reset. | |
392 | */ | |
393 | duk_int32_t atom_code_length; | |
394 | duk_uint32_t offset; | |
395 | duk_uint32_t qmin, qmax; | |
396 | ||
397 | qmin = re_ctx->curr_token.qmin; | |
398 | qmax = re_ctx->curr_token.qmax; | |
399 | if (atom_char_length == 0) { | |
400 | /* qmin and qmax will be 0 or 1 */ | |
401 | if (qmin > 1) { | |
402 | qmin = 1; | |
403 | } | |
404 | if (qmax > 1) { | |
405 | qmax = 1; | |
406 | } | |
407 | } | |
408 | ||
409 | duk__append_u32(re_ctx, DUK_REOP_MATCH); /* complete 'sub atom' */ | |
410 | atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - atom_start_offset); | |
411 | ||
412 | offset = atom_start_offset; | |
413 | if (re_ctx->curr_token.greedy) { | |
414 | offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY); | |
415 | offset += duk__insert_u32(re_ctx, offset, qmin); | |
416 | offset += duk__insert_u32(re_ctx, offset, qmax); | |
417 | offset += duk__insert_u32(re_ctx, offset, atom_char_length); | |
418 | offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length); | |
419 | } else { | |
420 | offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL); | |
421 | offset += duk__insert_u32(re_ctx, offset, qmin); | |
422 | offset += duk__insert_u32(re_ctx, offset, qmax); | |
423 | offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length); | |
424 | } | |
425 | DUK_UNREF(offset); /* silence scan-build warning */ | |
426 | } else { | |
427 | /* | |
428 | * Complex atom | |
429 | * | |
430 | * The original code is used as a template, and removed at the end | |
431 | * (this differs from the handling of simple quantifiers). | |
432 | * | |
433 | * NOTE: there is no current solution for empty atoms in complex | |
434 | * quantifiers. This would need some sort of a 'progress' instruction. | |
435 | * | |
436 | * XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies? | |
437 | */ | |
438 | duk_int32_t atom_code_length; | |
439 | duk_uint32_t atom_copies; | |
440 | duk_uint32_t tmp_qmin, tmp_qmax; | |
441 | ||
442 | /* pre-check how many atom copies we're willing to make (atom_copies not needed below) */ | |
443 | atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ? | |
444 | re_ctx->curr_token.qmin : re_ctx->curr_token.qmax; | |
445 | if (atom_copies > DUK_RE_MAX_ATOM_COPIES) { | |
446 | DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, | |
447 | DUK_STR_QUANTIFIER_TOO_MANY_COPIES); | |
448 | } | |
449 | ||
450 | /* wipe the capture range made by the atom (if any) */ | |
451 | DUK_ASSERT(atom_start_captures <= re_ctx->captures); | |
452 | if (atom_start_captures != re_ctx->captures) { | |
453 | DUK_ASSERT(atom_start_captures < re_ctx->captures); | |
454 | DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]", | |
455 | (long) atom_start_captures, (long) re_ctx->captures)); | |
456 | ||
457 | /* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */ | |
458 | duk__insert_u32(re_ctx, atom_start_offset, (re_ctx->captures - atom_start_captures) * 2); | |
459 | duk__insert_u32(re_ctx, atom_start_offset, (atom_start_captures + 1) * 2); | |
460 | duk__insert_u32(re_ctx, atom_start_offset, DUK_REOP_WIPERANGE); | |
461 | } else { | |
462 | DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld", | |
463 | (long) atom_start_captures)); | |
464 | } | |
465 | ||
466 | atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset; | |
467 | ||
468 | /* insert the required matches (qmin) by copying the atom */ | |
469 | tmp_qmin = re_ctx->curr_token.qmin; | |
470 | tmp_qmax = re_ctx->curr_token.qmax; | |
471 | while (tmp_qmin > 0) { | |
472 | duk__append_slice(re_ctx, atom_start_offset, atom_code_length); | |
473 | tmp_qmin--; | |
474 | if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) { | |
475 | tmp_qmax--; | |
476 | } | |
477 | } | |
478 | DUK_ASSERT(tmp_qmin == 0); | |
479 | ||
480 | /* insert code for matching the remainder - infinite or finite */ | |
481 | if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) { | |
482 | /* reuse last emitted atom for remaining 'infinite' quantifier */ | |
483 | ||
484 | if (re_ctx->curr_token.qmin == 0) { | |
485 | /* Special case: original qmin was zero so there is nothing | |
486 | * to repeat. Emit an atom copy but jump over it here. | |
487 | */ | |
488 | duk__append_u32(re_ctx, DUK_REOP_JUMP); | |
489 | duk__append_jump_offset(re_ctx, atom_code_length); | |
490 | duk__append_slice(re_ctx, atom_start_offset, atom_code_length); | |
491 | } | |
492 | if (re_ctx->curr_token.greedy) { | |
493 | duk__append_u32(re_ctx, DUK_REOP_SPLIT2); /* prefer jump */ | |
494 | } else { | |
495 | duk__append_u32(re_ctx, DUK_REOP_SPLIT1); /* prefer direct */ | |
496 | } | |
497 | duk__append_jump_offset(re_ctx, -atom_code_length - 1); /* -1 for opcode */ | |
498 | } else { | |
499 | /* | |
500 | * The remaining matches are emitted as sequence of SPLITs and atom | |
501 | * copies; the SPLITs skip the remaining copies and match the sequel. | |
502 | * This sequence needs to be emitted starting from the last copy | |
503 | * because the SPLITs are variable length due to the variable length | |
504 | * skip offset. This causes a lot of memory copying now. | |
505 | * | |
506 | * Example structure (greedy, match maximum # atoms): | |
507 | * | |
508 | * SPLIT1 LSEQ | |
509 | * (atom) | |
510 | * SPLIT1 LSEQ ; <- the byte length of this instruction is needed | |
511 | * (atom) ; to encode the above SPLIT1 correctly | |
512 | * ... | |
513 | * LSEQ: | |
514 | */ | |
515 | duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); | |
516 | while (tmp_qmax > 0) { | |
517 | duk__insert_slice(re_ctx, offset, atom_start_offset, atom_code_length); | |
518 | if (re_ctx->curr_token.greedy) { | |
519 | duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1); /* prefer direct */ | |
520 | } else { | |
521 | duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2); /* prefer jump */ | |
522 | } | |
523 | duk__insert_jump_offset(re_ctx, | |
524 | offset + 1, /* +1 for opcode */ | |
525 | (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1))); | |
526 | tmp_qmax--; | |
527 | } | |
528 | } | |
529 | ||
530 | /* remove the original 'template' atom */ | |
531 | duk__remove_slice(re_ctx, atom_start_offset, atom_code_length); | |
532 | } | |
533 | ||
534 | /* 'taint' result as complex */ | |
535 | res_charlen = -1; | |
536 | break; | |
537 | } | |
538 | case DUK_RETOK_ASSERT_START: { | |
539 | duk__append_u32(re_ctx, DUK_REOP_ASSERT_START); | |
540 | break; | |
541 | } | |
542 | case DUK_RETOK_ASSERT_END: { | |
543 | duk__append_u32(re_ctx, DUK_REOP_ASSERT_END); | |
544 | break; | |
545 | } | |
546 | case DUK_RETOK_ASSERT_WORD_BOUNDARY: { | |
547 | duk__append_u32(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY); | |
548 | break; | |
549 | } | |
550 | case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: { | |
551 | duk__append_u32(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY); | |
552 | break; | |
553 | } | |
554 | case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD: | |
555 | case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: { | |
556 | duk_uint32_t offset; | |
557 | duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ? | |
558 | DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG; | |
559 | ||
560 | offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); | |
561 | duk__parse_disjunction(re_ctx, 0, &tmp_disj); | |
562 | duk__append_u32(re_ctx, DUK_REOP_MATCH); | |
563 | ||
564 | (void) duk__insert_u32(re_ctx, offset, opcode); | |
565 | (void) duk__insert_jump_offset(re_ctx, | |
566 | offset + 1, /* +1 for opcode */ | |
567 | (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1))); | |
568 | ||
569 | /* 'taint' result as complex -- this is conservative, | |
570 | * as lookaheads do not backtrack. | |
571 | */ | |
572 | res_charlen = -1; | |
573 | break; | |
574 | } | |
575 | case DUK_RETOK_ATOM_PERIOD: { | |
576 | new_atom_char_length = 1; | |
577 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
578 | duk__append_u32(re_ctx, DUK_REOP_PERIOD); | |
579 | break; | |
580 | } | |
581 | case DUK_RETOK_ATOM_CHAR: { | |
582 | /* Note: successive characters could be joined into string matches | |
583 | * but this is not trivial (consider e.g. '/xyz+/); see docs for | |
584 | * more discussion. | |
585 | */ | |
586 | duk_uint32_t ch; | |
587 | ||
588 | new_atom_char_length = 1; | |
589 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
590 | duk__append_u32(re_ctx, DUK_REOP_CHAR); | |
591 | ch = re_ctx->curr_token.num; | |
592 | if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) { | |
593 | ch = duk_unicode_re_canonicalize_char(re_ctx->thr, ch); | |
594 | } | |
595 | duk__append_u32(re_ctx, ch); | |
596 | break; | |
597 | } | |
598 | case DUK_RETOK_ATOM_DIGIT: | |
599 | case DUK_RETOK_ATOM_NOT_DIGIT: { | |
600 | new_atom_char_length = 1; | |
601 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
602 | duk__append_u32(re_ctx, | |
603 | (re_ctx->curr_token.t == DUK_RETOK_ATOM_DIGIT) ? | |
604 | DUK_REOP_RANGES : DUK_REOP_INVRANGES); | |
605 | duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t))); | |
606 | duk__append_u16_list(re_ctx, duk_unicode_re_ranges_digit, sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t)); | |
607 | break; | |
608 | } | |
609 | case DUK_RETOK_ATOM_WHITE: | |
610 | case DUK_RETOK_ATOM_NOT_WHITE: { | |
611 | new_atom_char_length = 1; | |
612 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
613 | duk__append_u32(re_ctx, | |
614 | (re_ctx->curr_token.t == DUK_RETOK_ATOM_WHITE) ? | |
615 | DUK_REOP_RANGES : DUK_REOP_INVRANGES); | |
616 | duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t))); | |
617 | duk__append_u16_list(re_ctx, duk_unicode_re_ranges_white, sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t)); | |
618 | break; | |
619 | } | |
620 | case DUK_RETOK_ATOM_WORD_CHAR: | |
621 | case DUK_RETOK_ATOM_NOT_WORD_CHAR: { | |
622 | new_atom_char_length = 1; | |
623 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
624 | duk__append_u32(re_ctx, | |
625 | (re_ctx->curr_token.t == DUK_RETOK_ATOM_WORD_CHAR) ? | |
626 | DUK_REOP_RANGES : DUK_REOP_INVRANGES); | |
627 | duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t))); | |
628 | duk__append_u16_list(re_ctx, duk_unicode_re_ranges_wordchar, sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t)); | |
629 | break; | |
630 | } | |
631 | case DUK_RETOK_ATOM_BACKREFERENCE: { | |
632 | duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num; | |
633 | if (backref > re_ctx->highest_backref) { | |
634 | re_ctx->highest_backref = backref; | |
635 | } | |
636 | new_atom_char_length = -1; /* mark as complex */ | |
637 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
638 | duk__append_u32(re_ctx, DUK_REOP_BACKREFERENCE); | |
639 | duk__append_u32(re_ctx, backref); | |
640 | break; | |
641 | } | |
642 | case DUK_RETOK_ATOM_START_CAPTURE_GROUP: { | |
643 | duk_uint32_t cap; | |
644 | ||
645 | new_atom_char_length = -1; /* mark as complex (capture handling) */ | |
646 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
647 | cap = ++re_ctx->captures; | |
648 | duk__append_u32(re_ctx, DUK_REOP_SAVE); | |
649 | duk__append_u32(re_ctx, cap * 2); | |
650 | duk__parse_disjunction(re_ctx, 0, &tmp_disj); /* retval (sub-atom char length) unused, tainted as complex above */ | |
651 | duk__append_u32(re_ctx, DUK_REOP_SAVE); | |
652 | duk__append_u32(re_ctx, cap * 2 + 1); | |
653 | break; | |
654 | } | |
655 | case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: { | |
656 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
657 | duk__parse_disjunction(re_ctx, 0, &tmp_disj); | |
658 | new_atom_char_length = tmp_disj.charlen; | |
659 | break; | |
660 | } | |
661 | case DUK_RETOK_ATOM_START_CHARCLASS: | |
662 | case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: { | |
663 | /* | |
664 | * Range parsing is done with a special lexer function which calls | |
665 | * us for every range parsed. This is different from how rest of | |
666 | * the parsing works, but avoids a heavy, arbitrary size intermediate | |
667 | * value type to hold the ranges. | |
668 | * | |
669 | * Another complication is the handling of character ranges when | |
670 | * case insensitive matching is used (see docs for discussion). | |
671 | * The range handler callback given to the lexer takes care of this | |
672 | * as well. | |
673 | * | |
674 | * Note that duplicate ranges are not eliminated when parsing character | |
675 | * classes, so that canonicalization of | |
676 | * | |
677 | * [0-9a-fA-Fx-{] | |
678 | * | |
679 | * creates the result (note the duplicate ranges): | |
680 | * | |
681 | * [0-9A-FA-FX-Z{-{] | |
682 | * | |
683 | * where [x-{] is split as a result of canonicalization. The duplicate | |
684 | * ranges are not a semantics issue: they work correctly. | |
685 | */ | |
686 | ||
687 | duk_uint32_t offset; | |
688 | ||
689 | DUK_DD(DUK_DDPRINT("character class")); | |
690 | ||
691 | /* insert ranges instruction, range count patched in later */ | |
692 | new_atom_char_length = 1; | |
693 | new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx); | |
694 | duk__append_u32(re_ctx, | |
695 | (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ? | |
696 | DUK_REOP_RANGES : DUK_REOP_INVRANGES); | |
697 | offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); /* patch in range count later */ | |
698 | ||
699 | /* parse ranges until character class ends */ | |
700 | re_ctx->nranges = 0; /* note: ctx-wide temporary */ | |
701 | duk_lexer_parse_re_ranges(&re_ctx->lex, duk__generate_ranges, (void *) re_ctx); | |
702 | ||
703 | /* insert range count */ | |
704 | duk__insert_u32(re_ctx, offset, re_ctx->nranges); | |
705 | break; | |
706 | } | |
707 | case DUK_RETOK_ATOM_END_GROUP: { | |
708 | if (expect_eof) { | |
709 | DUK_ERROR(re_ctx->thr, DUK_ERR_SYNTAX_ERROR, | |
710 | DUK_STR_UNEXPECTED_CLOSING_PAREN); | |
711 | } | |
712 | goto done; | |
713 | } | |
714 | case DUK_RETOK_EOF: { | |
715 | if (!expect_eof) { | |
716 | DUK_ERROR(re_ctx->thr, DUK_ERR_SYNTAX_ERROR, | |
717 | DUK_STR_UNEXPECTED_END_OF_PATTERN); | |
718 | } | |
719 | goto done; | |
720 | } | |
721 | default: { | |
722 | DUK_ERROR(re_ctx->thr, DUK_ERR_SYNTAX_ERROR, | |
723 | DUK_STR_UNEXPECTED_REGEXP_TOKEN); | |
724 | } | |
725 | } | |
726 | ||
727 | /* a complex (new) atom taints the result */ | |
728 | if (new_atom_start_offset >= 0) { | |
729 | if (new_atom_char_length < 0) { | |
730 | res_charlen = -1; | |
731 | } else if (res_charlen >= 0) { | |
732 | /* only advance if not tainted */ | |
733 | res_charlen += new_atom_char_length; | |
734 | } | |
735 | } | |
736 | ||
737 | /* record previous atom info in case next token is a quantifier */ | |
738 | atom_start_offset = new_atom_start_offset; | |
739 | atom_char_length = new_atom_char_length; | |
740 | atom_start_captures = new_atom_start_captures; | |
741 | } | |
742 | ||
743 | done: | |
744 | ||
745 | /* finish up pending jump and split for last alternative */ | |
746 | if (unpatched_disjunction_jump >= 0) { | |
747 | duk_uint32_t offset; | |
748 | ||
749 | DUK_ASSERT(unpatched_disjunction_split >= 0); | |
750 | offset = unpatched_disjunction_jump; | |
751 | offset += duk__insert_jump_offset(re_ctx, | |
752 | offset, | |
753 | (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset)); | |
754 | /* offset is now target of the pending split (right after jump) */ | |
755 | duk__insert_jump_offset(re_ctx, | |
756 | unpatched_disjunction_split, | |
757 | offset - unpatched_disjunction_split); | |
758 | } | |
759 | ||
760 | #if 0 | |
761 | out_atom_info->end_captures = re_ctx->captures; | |
762 | #endif | |
763 | out_atom_info->charlen = res_charlen; | |
764 | DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld", | |
765 | (long) out_atom_info->charlen)); | |
766 | ||
767 | re_ctx->recursion_depth--; | |
768 | } | |
769 | ||
770 | /* | |
771 | * Flags parsing (see E5 Section 15.10.4.1). | |
772 | */ | |
773 | ||
774 | DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) { | |
775 | const duk_uint8_t *p; | |
776 | const duk_uint8_t *p_end; | |
777 | duk_uint32_t flags = 0; | |
778 | ||
779 | p = DUK_HSTRING_GET_DATA(h); | |
780 | p_end = p + DUK_HSTRING_GET_BYTELEN(h); | |
781 | ||
782 | /* Note: can be safely scanned as bytes (undecoded) */ | |
783 | ||
784 | while (p < p_end) { | |
785 | duk_uint8_t c = *p++; | |
786 | switch ((int) c) { | |
787 | case (int) 'g': { | |
788 | if (flags & DUK_RE_FLAG_GLOBAL) { | |
789 | goto error; | |
790 | } | |
791 | flags |= DUK_RE_FLAG_GLOBAL; | |
792 | break; | |
793 | } | |
794 | case (int) 'i': { | |
795 | if (flags & DUK_RE_FLAG_IGNORE_CASE) { | |
796 | goto error; | |
797 | } | |
798 | flags |= DUK_RE_FLAG_IGNORE_CASE; | |
799 | break; | |
800 | } | |
801 | case (int) 'm': { | |
802 | if (flags & DUK_RE_FLAG_MULTILINE) { | |
803 | goto error; | |
804 | } | |
805 | flags |= DUK_RE_FLAG_MULTILINE; | |
806 | break; | |
807 | } | |
808 | default: { | |
809 | goto error; | |
810 | } | |
811 | } | |
812 | } | |
813 | ||
814 | return flags; | |
815 | ||
816 | error: | |
817 | DUK_ERROR(thr, DUK_ERR_SYNTAX_ERROR, DUK_STR_INVALID_REGEXP_FLAGS); | |
818 | return 0; /* never here */ | |
819 | } | |
820 | ||
821 | /* | |
822 | * Create escaped RegExp source (E5 Section 15.10.3). | |
823 | * | |
824 | * The current approach is to special case the empty RegExp | |
825 | * ('' -> '(?:)') and otherwise replace unescaped '/' characters | |
826 | * with '\/' regardless of where they occur in the regexp. | |
827 | * | |
828 | * Note that normalization does not seem to be necessary for | |
829 | * RegExp literals (e.g. '/foo/') because to be acceptable as | |
830 | * a RegExp literal, the text between forward slashes must | |
831 | * already match the escaping requirements (e.g. must not contain | |
832 | * unescaped forward slashes or be empty). Escaping IS needed | |
833 | * for expressions like 'new Regexp("...", "")' however. | |
834 | * Currently, we re-escape in either case. | |
835 | * | |
836 | * Also note that we process the source here in UTF-8 encoded | |
837 | * form. This is correct, because any non-ASCII characters are | |
838 | * passed through without change. | |
839 | */ | |
840 | ||
841 | DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) { | |
842 | duk_context *ctx = (duk_context *) thr; | |
843 | duk_hstring *h; | |
844 | const duk_uint8_t *p; | |
845 | duk_bufwriter_ctx bw_alloc; | |
846 | duk_bufwriter_ctx *bw; | |
847 | duk_uint8_t *q; | |
848 | duk_size_t i, n; | |
849 | duk_uint_fast8_t c_prev, c; | |
850 | ||
851 | h = duk_get_hstring(ctx, idx_pattern); | |
852 | DUK_ASSERT(h != NULL); | |
853 | p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h); | |
854 | n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h); | |
855 | ||
856 | if (n == 0) { | |
857 | /* return '(?:)' */ | |
858 | duk_push_hstring_stridx(ctx, DUK_STRIDX_ESCAPED_EMPTY_REGEXP); | |
859 | return; | |
860 | } | |
861 | ||
862 | bw = &bw_alloc; | |
863 | DUK_BW_INIT_PUSHBUF(thr, bw, n); | |
864 | q = DUK_BW_GET_PTR(thr, bw); | |
865 | ||
866 | c_prev = (duk_uint_fast8_t) 0; | |
867 | ||
868 | for (i = 0; i < n; i++) { | |
869 | c = p[i]; | |
870 | ||
871 | q = DUK_BW_ENSURE_RAW(thr, bw, 2, q); | |
872 | ||
873 | if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') { | |
874 | /* Unescaped '/' ANYWHERE in the regexp (in disjunction, | |
875 | * inside a character class, ...) => same escape works. | |
876 | */ | |
877 | *q++ = DUK_ASC_BACKSLASH; | |
878 | } | |
879 | *q++ = (duk_uint8_t) c; | |
880 | ||
881 | c_prev = c; | |
882 | } | |
883 | ||
884 | DUK_BW_SETPTR_AND_COMPACT(thr, bw, q); | |
885 | duk_to_string(ctx, -1); /* -> [ ... escaped_source ] */ | |
886 | } | |
887 | ||
888 | /* | |
889 | * Exposed regexp compilation primitive. | |
890 | * | |
891 | * Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the | |
892 | * actual parsing. Handles generation of the compiled regexp header and the | |
893 | * "boilerplate" capture of the matching substring (save 0 and 1). Also does some | |
894 | * global level regexp checks after recursive compilation has finished. | |
895 | * | |
896 | * An escaped version of the regexp source, suitable for use as a RegExp instance | |
897 | * 'source' property (see E5 Section 15.10.3), is also left on the stack. | |
898 | * | |
899 | * Input stack: [ pattern flags ] | |
900 | * Output stack: [ bytecode escaped_source ] (both as strings) | |
901 | */ | |
902 | ||
903 | DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) { | |
904 | duk_context *ctx = (duk_context *) thr; | |
905 | duk_re_compiler_ctx re_ctx; | |
906 | duk_lexer_point lex_point; | |
907 | duk_hstring *h_pattern; | |
908 | duk_hstring *h_flags; | |
909 | duk__re_disjunction_info ign_disj; | |
910 | ||
911 | DUK_ASSERT(thr != NULL); | |
912 | DUK_ASSERT(ctx != NULL); | |
913 | ||
914 | /* | |
915 | * Args validation | |
916 | */ | |
917 | ||
918 | /* TypeError if fails */ | |
919 | h_pattern = duk_require_hstring(ctx, -2); | |
920 | h_flags = duk_require_hstring(ctx, -1); | |
921 | ||
922 | /* | |
923 | * Create normalized 'source' property (E5 Section 15.10.3). | |
924 | */ | |
925 | ||
926 | /* [ ... pattern flags ] */ | |
927 | ||
928 | duk__create_escaped_source(thr, -2); | |
929 | ||
930 | /* [ ... pattern flags escaped_source ] */ | |
931 | ||
932 | /* | |
933 | * Init compilation context | |
934 | */ | |
935 | ||
936 | /* [ ... pattern flags escaped_source buffer ] */ | |
937 | ||
938 | DUK_MEMZERO(&re_ctx, sizeof(re_ctx)); | |
939 | DUK_LEXER_INITCTX(&re_ctx.lex); /* duplicate zeroing, expect for (possible) NULL inits */ | |
940 | re_ctx.thr = thr; | |
941 | re_ctx.lex.thr = thr; | |
942 | re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern); | |
943 | re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern); | |
944 | re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT; | |
945 | re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT; | |
946 | re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags); | |
947 | ||
948 | DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE); | |
949 | ||
950 | DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld", | |
951 | (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit)); | |
952 | ||
953 | /* | |
954 | * Init lexer | |
955 | */ | |
956 | ||
957 | lex_point.offset = 0; /* expensive init, just want to fill window */ | |
958 | lex_point.line = 1; | |
959 | DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point); | |
960 | ||
961 | /* | |
962 | * Compilation | |
963 | */ | |
964 | ||
965 | DUK_D(DUK_DPRINT("starting regexp compilation")); | |
966 | ||
967 | duk__append_u32(&re_ctx, DUK_REOP_SAVE); | |
968 | duk__append_u32(&re_ctx, 0); | |
969 | duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj); | |
970 | duk__append_u32(&re_ctx, DUK_REOP_SAVE); | |
971 | duk__append_u32(&re_ctx, 1); | |
972 | duk__append_u32(&re_ctx, DUK_REOP_MATCH); | |
973 | ||
974 | /* | |
975 | * Check for invalid backreferences; note that it is NOT an error | |
976 | * to back-reference a capture group which has not yet been introduced | |
977 | * in the pattern (as in /\1(foo)/); in fact, the backreference will | |
978 | * always match! It IS an error to back-reference a capture group | |
979 | * which will never be introduced in the pattern. Thus, we can check | |
980 | * for such references only after parsing is complete. | |
981 | */ | |
982 | ||
983 | if (re_ctx.highest_backref > re_ctx.captures) { | |
984 | DUK_ERROR(thr, DUK_ERR_SYNTAX_ERROR, DUK_STR_INVALID_BACKREFS); | |
985 | } | |
986 | ||
987 | /* | |
988 | * Emit compiled regexp header: flags, ncaptures | |
989 | * (insertion order inverted on purpose) | |
990 | */ | |
991 | ||
992 | duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2); | |
993 | duk__insert_u32(&re_ctx, 0, re_ctx.re_flags); | |
994 | ||
995 | /* [ ... pattern flags escaped_source buffer ] */ | |
996 | ||
997 | DUK_BW_COMPACT(thr, &re_ctx.bw); | |
998 | duk_to_string(ctx, -1); /* coerce to string */ | |
999 | ||
1000 | /* [ ... pattern flags escaped_source bytecode ] */ | |
1001 | ||
1002 | /* | |
1003 | * Finalize stack | |
1004 | */ | |
1005 | ||
1006 | duk_remove(ctx, -4); /* -> [ ... flags escaped_source bytecode ] */ | |
1007 | duk_remove(ctx, -3); /* -> [ ... escaped_source bytecode ] */ | |
1008 | ||
1009 | DUK_D(DUK_DPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T", | |
1010 | (duk_tval *) duk_get_tval(ctx, -1), (duk_tval *) duk_get_tval(ctx, -2))); | |
1011 | } | |
1012 | ||
1013 | /* | |
1014 | * Create a RegExp instance (E5 Section 15.10.7). | |
1015 | * | |
1016 | * Note: the output stack left by duk_regexp_compile() is directly compatible | |
1017 | * with the input here. | |
1018 | * | |
1019 | * Input stack: [ escaped_source bytecode ] (both as strings) | |
1020 | * Output stack: [ RegExp ] | |
1021 | */ | |
1022 | ||
1023 | DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) { | |
1024 | duk_context *ctx = (duk_context *) thr; | |
1025 | duk_hobject *h; | |
1026 | duk_hstring *h_bc; | |
1027 | duk_small_int_t re_flags; | |
1028 | ||
1029 | /* [ ... escape_source bytecode ] */ | |
1030 | ||
1031 | h_bc = duk_get_hstring(ctx, -1); | |
1032 | DUK_ASSERT(h_bc != NULL); | |
1033 | DUK_ASSERT(DUK_HSTRING_GET_BYTELEN(h_bc) >= 1); /* always at least the header */ | |
1034 | DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h_bc) >= 1); | |
1035 | DUK_ASSERT((duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0] < 0x80); /* flags always encodes to 1 byte */ | |
1036 | re_flags = (duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0]; | |
1037 | ||
1038 | /* [ ... escaped_source bytecode ] */ | |
1039 | ||
1040 | duk_push_object(ctx); | |
1041 | h = duk_get_hobject(ctx, -1); | |
1042 | DUK_ASSERT(h != NULL); | |
1043 | duk_insert(ctx, -3); | |
1044 | ||
1045 | /* [ ... regexp_object escaped_source bytecode ] */ | |
1046 | ||
1047 | DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP); | |
1048 | DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]); | |
1049 | ||
1050 | duk_xdef_prop_stridx(ctx, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE); | |
1051 | ||
1052 | /* [ ... regexp_object escaped_source ] */ | |
1053 | ||
1054 | duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_SOURCE, DUK_PROPDESC_FLAGS_NONE); | |
1055 | ||
1056 | /* [ ... regexp_object ] */ | |
1057 | ||
1058 | duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_GLOBAL)); | |
1059 | duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_GLOBAL, DUK_PROPDESC_FLAGS_NONE); | |
1060 | ||
1061 | duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_IGNORE_CASE)); | |
1062 | duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_IGNORE_CASE, DUK_PROPDESC_FLAGS_NONE); | |
1063 | ||
1064 | duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_MULTILINE)); | |
1065 | duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_MULTILINE, DUK_PROPDESC_FLAGS_NONE); | |
1066 | ||
1067 | duk_push_int(ctx, 0); | |
1068 | duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W); | |
1069 | ||
1070 | /* [ ... regexp_object ] */ | |
1071 | } | |
1072 | ||
1073 | #undef DUK__RE_BUFLEN | |
1074 | ||
1075 | #else /* DUK_USE_REGEXP_SUPPORT */ | |
1076 | ||
1077 | /* regexp support disabled */ | |
1078 | ||
1079 | #endif /* DUK_USE_REGEXP_SUPPORT */ |