]> git.proxmox.com Git - ceph.git/blame - ceph/src/civetweb/src/third_party/duktape-1.8.0/src-separate/duk_lexer.c
buildsys: switch source download to quincy
[ceph.git] / ceph / src / civetweb / src / third_party / duktape-1.8.0 / src-separate / duk_lexer.c
CommitLineData
7c673cae
FG
1/*
2 * Lexer for source files, ToNumber() string conversions, RegExp expressions,
3 * and JSON.
4 *
5 * Provides a stream of Ecmascript tokens from an UTF-8/CESU-8 buffer. The
6 * caller can also rewind the token stream into a certain position which is
7 * needed by the compiler part for multi-pass scanning. Tokens are
8 * represented as duk_token structures, and contain line number information.
9 * Token types are identified with DUK_TOK_* defines.
10 *
11 * Characters are decoded into a fixed size lookup window consisting of
12 * decoded Unicode code points, with window positions past the end of the
13 * input filled with an invalid codepoint (-1). The tokenizer can thus
14 * perform multiple character lookups efficiently and with few sanity
15 * checks (such as access outside the end of the input), which keeps the
16 * tokenization code small at the cost of performance.
17 *
18 * Character data in tokens, such as identifier names and string literals,
19 * is encoded into CESU-8 format on-the-fly while parsing the token in
20 * question. The string data is made reachable to garbage collection by
21 * placing the token-related values in value stack entries allocated for
22 * this purpose by the caller. The characters exist in Unicode code point
23 * form only in the fixed size lookup window, which keeps character data
24 * expansion (of especially ASCII data) low.
25 *
26 * Token parsing supports the full range of Unicode characters as described
27 * in the E5 specification. Parsing has been optimized for ASCII characters
28 * because ordinary Ecmascript code consists almost entirely of ASCII
29 * characters. Matching of complex Unicode codepoint sets (such as in the
30 * IdentifierStart and IdentifierPart productions) is optimized for size,
31 * and is done using a linear scan of a bit-packed list of ranges. This is
32 * very slow, but should never be entered unless the source code actually
33 * contains Unicode characters.
34 *
35 * Ecmascript tokenization is partially context sensitive. First,
36 * additional future reserved words are recognized in strict mode (see E5
37 * Section 7.6.1.2). Second, a forward slash character ('/') can be
38 * recognized either as starting a RegExp literal or as a division operator,
39 * depending on context. The caller must provide necessary context flags
40 * when requesting a new token.
41 *
42 * Future work:
43 *
44 * * Make line number tracking optional, as it consumes space.
45 *
46 * * Add a feature flag for disabling UTF-8 decoding of input, as most
47 * source code is ASCII. Because of Unicode escapes written in ASCII,
48 * this does not allow Unicode support to be removed from e.g.
49 * duk_unicode_is_identifier_start() nor does it allow removal of CESU-8
50 * encoding of e.g. string literals.
51 *
52 * * Add a feature flag for disabling Unicode compliance of e.g. identifier
53 * names. This allows for a build more than a kilobyte smaller, because
54 * Unicode ranges needed by duk_unicode_is_identifier_start() and
55 * duk_unicode_is_identifier_part() can be dropped. String literals
56 * should still be allowed to contain escaped Unicode, so this still does
57 * not allow removal of CESU-8 encoding of e.g. string literals.
58 *
59 * * Character lookup tables for codepoints above BMP could be stripped.
60 *
61 * * Strictly speaking, E5 specification requires that source code consists
62 * of 16-bit code units, and if not, must be conceptually converted to
63 * that format first. The current lexer processes Unicode code points
64 * and allows characters outside the BMP. These should be converted to
65 * surrogate pairs while reading the source characters into the window,
66 * not after tokens have been formed (as is done now). However, the fix
67 * is not trivial because two characters are decoded from one codepoint.
68 *
69 * * Optimize for speed as well as size. Large if-else ladders are (at
70 * least potentially) slow.
71 */
72
73#include "duk_internal.h"
74
75/*
76 * Various defines and file specific helper macros
77 */
78
#define DUK__MAX_RE_DECESC_DIGITS  9
#define DUK__MAX_RE_QUANT_DIGITS   9  /* Nine digits cannot express e.g. 2**31-1, but a tenth digit could overflow a u32. */

/* Character class tests; macro vs. helper function is chosen by call count.
 * Note that the macro forms evaluate their argument twice.
 */
#define DUK__ISDIGIT(ch)           (DUK_ASC_0 <= (ch) && (ch) <= DUK_ASC_9)
#define DUK__ISHEXDIGIT(ch)        duk__is_hex_digit((ch))
#define DUK__ISOCTDIGIT(ch)        (DUK_ASC_0 <= (ch) && (ch) <= DUK_ASC_7)
#define DUK__ISDIGIT03(ch)         (DUK_ASC_0 <= (ch) && (ch) <= DUK_ASC_3)
#define DUK__ISDIGIT47(ch)         (DUK_ASC_4 <= (ch) && (ch) <= DUK_ASC_7)

/* Lookup window access and advancing.  DUK__ADVANCECHARS() takes a count of
 * window entries and converts it to bytes; DUK__ADVANCEBYTES() takes a byte
 * count directly.
 */
#define DUK__LOOKUP(lc,idx)        ((lc)->window[(idx)].codepoint)
#define DUK__ADVANCECHARS(lc,n)    duk__advance_bytes((lc), (n) * sizeof(duk_lexer_codepoint))
#define DUK__ADVANCEBYTES(lc,n)    duk__advance_bytes((lc), (n))

/* Temporary byte buffer helpers. */
#define DUK__INITBUFFER(lc)        duk__initbuffer((lc))
#define DUK__APPENDBUFFER(lc,cp)   duk__appendbuffer((lc), (duk_codepoint_t) (cp))

/* Shorthands for window positions 0..5; these require the lexer context
 * variable at the call site to be named 'lex_ctx'.
 */
#define DUK__L0()                  DUK__LOOKUP(lex_ctx, 0)
#define DUK__L1()                  DUK__LOOKUP(lex_ctx, 1)
#define DUK__L2()                  DUK__LOOKUP(lex_ctx, 2)
#define DUK__L3()                  DUK__LOOKUP(lex_ctx, 3)
#define DUK__L4()                  DUK__LOOKUP(lex_ctx, 4)
#define DUK__L5()                  DUK__LOOKUP(lex_ctx, 5)

/* Pack an advance amount and a token number into one value: the character
 * count is converted to bytes and stored above the low 8 bits, the token
 * number occupies the low 8 bits.  Shared by multiple functions.
 */
#define DUK__ADVTOK(nchars,tok)    ((((nchars) * sizeof(duk_lexer_codepoint)) << 8) + (tok))
106
107/*
108 * Advance lookup window by N characters, filling in new characters as
109 * necessary. After returning caller is guaranteed a character window of
110 * at least DUK_LEXER_WINDOW_SIZE characters.
111 *
112 * The main function duk__advance_bytes() is called at least once per every
113 * token so it has a major lexer/compiler performance impact. There are two
114 * variants for the main duk__advance_bytes() algorithm: a sliding window
115 * approach which is slightly faster at the cost of larger code footprint,
116 * and a simple copying one.
117 *
118 * Decoding directly from the source string would be another lexing option.
119 * But the lookup window based approach has the advantage of hiding the
120 * source string and its encoding effectively which gives more flexibility
121 * going forward to e.g. support chunked streaming of source from flash.
122 *
123 * Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
124 * U+10FFFF, causing an error if the input is unparseable. Leniency means:
125 *
126 * * Unicode code point validation is intentionally not performed,
127 * except to check that the codepoint does not exceed 0x10ffff.
128 *
129 * * In particular, surrogate pairs are allowed and not combined, which
130 * allows source files to represent all SourceCharacters with CESU-8.
131 * Broken surrogate pairs are allowed, as Ecmascript does not mandate
132 * their validation.
133 *
134 * * Allow non-shortest UTF-8 encodings.
135 *
136 * Leniency here causes few security concerns because all character data is
137 * decoded into Unicode codepoints before lexer processing, and is then
138 * re-encoded into CESU-8. The source can be parsed as strict UTF-8 with
139 * a compiler option. However, Ecmascript source characters include -all-
140 * 16-bit unsigned integer codepoints, so leniency seems to be appropriate.
141 *
142 * Note that codepoints above the BMP are not strictly SourceCharacters,
143 * but the lexer still accepts them as such. Before ending up in a string
144 * or an identifier name, codepoints above BMP are converted into surrogate
145 * pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
146 * expected by Ecmascript.
147 *
148 * An alternative approach to dealing with invalid or partial sequences
149 * would be to skip them and replace them with e.g. the Unicode replacement
150 * character U+FFFD. This has limited utility because a replacement character
151 * will most likely cause a parse error, unless it occurs inside a string.
152 * Further, Ecmascript source is typically pure ASCII.
153 *
154 * See:
155 *
156 * http://en.wikipedia.org/wiki/UTF-8
157 * http://en.wikipedia.org/wiki/CESU-8
158 * http://tools.ietf.org/html/rfc3629
159 * http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
160 *
161 * Future work:
162 *
163 * * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
164 * in strict UTF-8 mode.
165 *
166 * * Size optimize. An attempt to use a 16-byte lookup table for the first
167 * byte resulted in a code increase though.
168 *
169 * * Is checking against maximum 0x10ffff really useful? 4-byte encoding
170 * imposes a certain limit anyway.
171 *
172 * * Support chunked streaming of source code. Can be implemented either
173 * by streaming chunks of bytes or chunks of codepoints.
174 */
175
176#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
177DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
178 duk_lexer_codepoint *cp, *cp_end;
179 duk_ucodepoint_t x;
180 duk_small_uint_t contlen;
181 const duk_uint8_t *p, *p_end;
11fdf7f2 182#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
183 duk_ucodepoint_t mincp;
184#endif
185 duk_int_t input_line;
186
187 /* Use temporaries and update lex_ctx only when finished. */
188 input_line = lex_ctx->input_line;
189 p = lex_ctx->input + lex_ctx->input_offset;
190 p_end = lex_ctx->input + lex_ctx->input_length;
191
192 cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
193 cp_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;
194
195 for (; cp != cp_end; cp++) {
196 cp->offset = (duk_size_t) (p - lex_ctx->input);
197 cp->line = input_line;
198
199 /* XXX: potential issue with signed pointers, p_end < p. */
200 if (DUK_UNLIKELY(p >= p_end)) {
201 /* If input_offset were assigned a negative value, it would
202 * result in a large positive value. Most likely it would be
203 * larger than input_length and be caught here. In any case
204 * no memory unsafe behavior would happen.
205 */
206 cp->codepoint = -1;
207 continue;
208 }
209
210 x = (duk_ucodepoint_t) (*p++);
211
212 /* Fast path. */
213
214 if (DUK_LIKELY(x < 0x80UL)) {
215 DUK_ASSERT(x != 0x2028UL && x != 0x2029UL); /* not LS/PS */
216 if (DUK_UNLIKELY(x <= 0x000dUL)) {
217 if ((x == 0x000aUL) ||
218 ((x == 0x000dUL) && (p >= p_end || *p != 0x000aUL))) {
219 /* lookup for 0x000a above assumes shortest encoding now */
220
221 /* E5 Section 7.3, treat the following as newlines:
222 * LF
223 * CR [not followed by LF]
224 * LS
225 * PS
226 *
227 * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
228 * the line number.
229 */
230 input_line++;
231 }
232 }
233
234 cp->codepoint = (duk_codepoint_t) x;
235 continue;
236 }
237
238 /* Slow path. */
239
240 if (x < 0xc0UL) {
241 /* 10xx xxxx -> invalid */
242 goto error_encoding;
243 } else if (x < 0xe0UL) {
244 /* 110x xxxx 10xx xxxx */
245 contlen = 1;
11fdf7f2 246#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
247 mincp = 0x80UL;
248#endif
249 x = x & 0x1fUL;
250 } else if (x < 0xf0UL) {
251 /* 1110 xxxx 10xx xxxx 10xx xxxx */
252 contlen = 2;
11fdf7f2 253#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
254 mincp = 0x800UL;
255#endif
256 x = x & 0x0fUL;
257 } else if (x < 0xf8UL) {
258 /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
259 contlen = 3;
11fdf7f2 260#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
261 mincp = 0x10000UL;
262#endif
263 x = x & 0x07UL;
264 } else {
265 /* no point in supporting encodings of 5 or more bytes */
266 goto error_encoding;
267 }
268
269 DUK_ASSERT(p_end >= p);
270 if ((duk_size_t) contlen > (duk_size_t) (p_end - p)) {
271 goto error_clipped;
272 }
273
274 while (contlen > 0) {
275 duk_small_uint_t y;
276 y = *p++;
277 if ((y & 0xc0U) != 0x80U) {
278 /* check that byte has the form 10xx xxxx */
279 goto error_encoding;
280 }
281 x = x << 6;
282 x += y & 0x3fUL;
283 contlen--;
284 }
285
286 /* check final character validity */
287
288 if (x > 0x10ffffUL) {
289 goto error_encoding;
290 }
11fdf7f2 291#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
292 if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
293 goto error_encoding;
294 }
295#endif
296
297 DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
298 if ((x == 0x2028UL) || (x == 0x2029UL)) {
299 input_line++;
300 }
301
302 cp->codepoint = (duk_codepoint_t) x;
303 }
304
305 lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
306 lex_ctx->input_line = input_line;
307 return;
308
309 error_clipped: /* clipped codepoint */
310 error_encoding: /* invalid codepoint encoding or codepoint */
311 lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
312 lex_ctx->input_line = input_line;
313
11fdf7f2 314 DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed");
7c673cae
FG
315}
316
317DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
318 duk_small_uint_t used_bytes, avail_bytes;
319
320 DUK_ASSERT_DISABLE(count_bytes >= 0); /* unsigned */
321 DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
322 DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer);
323 DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE);
324 DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint));
325
326 /* Zero 'count' is also allowed to make call sites easier.
327 * Arithmetic in bytes generates better code in GCC.
328 */
329
330 lex_ctx->window = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + count_bytes); /* avoid multiply */
331 used_bytes = (duk_small_uint_t) ((duk_uint8_t *) lex_ctx->window - (duk_uint8_t *) lex_ctx->buffer);
332 avail_bytes = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - used_bytes;
333 if (avail_bytes < (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) {
334 /* Not enough data to provide a full window, so "scroll" window to
335 * start of buffer and fill up the rest.
336 */
337 DUK_MEMMOVE((void *) lex_ctx->buffer,
338 (const void *) lex_ctx->window,
339 (size_t) avail_bytes);
340 lex_ctx->window = lex_ctx->buffer;
341 duk__fill_lexer_buffer(lex_ctx, avail_bytes);
342 }
343}
344
345DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
346 lex_ctx->window = lex_ctx->buffer;
347 duk__fill_lexer_buffer(lex_ctx, 0);
348}
349#else /* DUK_USE_LEXER_SLIDING_WINDOW */
350DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) {
351 duk_ucodepoint_t x;
352 duk_small_uint_t len;
353 duk_small_uint_t i;
354 const duk_uint8_t *p;
11fdf7f2 355#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
356 duk_ucodepoint_t mincp;
357#endif
358 duk_size_t input_offset;
359
360 input_offset = lex_ctx->input_offset;
361 if (DUK_UNLIKELY(input_offset >= lex_ctx->input_length)) {
362 /* If input_offset were assigned a negative value, it would
363 * result in a large positive value. Most likely it would be
364 * larger than input_length and be caught here. In any case
365 * no memory unsafe behavior would happen.
366 */
367 return -1;
368 }
369
370 p = lex_ctx->input + input_offset;
371 x = (duk_ucodepoint_t) (*p);
372
373 if (DUK_LIKELY(x < 0x80UL)) {
374 /* 0xxx xxxx -> fast path */
375
376 /* input offset tracking */
377 lex_ctx->input_offset++;
378
379 DUK_ASSERT(x != 0x2028UL && x != 0x2029UL); /* not LS/PS */
380 if (DUK_UNLIKELY(x <= 0x000dUL)) {
381 if ((x == 0x000aUL) ||
382 ((x == 0x000dUL) && (lex_ctx->input_offset >= lex_ctx->input_length ||
383 lex_ctx->input[lex_ctx->input_offset] != 0x000aUL))) {
384 /* lookup for 0x000a above assumes shortest encoding now */
385
386 /* E5 Section 7.3, treat the following as newlines:
387 * LF
388 * CR [not followed by LF]
389 * LS
390 * PS
391 *
392 * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
393 * the line number.
394 */
395 lex_ctx->input_line++;
396 }
397 }
398
399 return (duk_codepoint_t) x;
400 }
401
402 /* Slow path. */
403
404 if (x < 0xc0UL) {
405 /* 10xx xxxx -> invalid */
406 goto error_encoding;
407 } else if (x < 0xe0UL) {
408 /* 110x xxxx 10xx xxxx */
409 len = 2;
11fdf7f2 410#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
411 mincp = 0x80UL;
412#endif
413 x = x & 0x1fUL;
414 } else if (x < 0xf0UL) {
415 /* 1110 xxxx 10xx xxxx 10xx xxxx */
416 len = 3;
11fdf7f2 417#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
418 mincp = 0x800UL;
419#endif
420 x = x & 0x0fUL;
421 } else if (x < 0xf8UL) {
422 /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
423 len = 4;
11fdf7f2 424#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
425 mincp = 0x10000UL;
426#endif
427 x = x & 0x07UL;
428 } else {
429 /* no point in supporting encodings of 5 or more bytes */
430 goto error_encoding;
431 }
432
433 DUK_ASSERT(lex_ctx->input_length >= lex_ctx->input_offset);
434 if ((duk_size_t) len > (duk_size_t) (lex_ctx->input_length - lex_ctx->input_offset)) {
435 goto error_clipped;
436 }
437
438 p++;
439 for (i = 1; i < len; i++) {
440 duk_small_uint_t y;
441 y = *p++;
442 if ((y & 0xc0U) != 0x80U) {
443 /* check that byte has the form 10xx xxxx */
444 goto error_encoding;
445 }
446 x = x << 6;
447 x += y & 0x3fUL;
448 }
449
450 /* check final character validity */
451
452 if (x > 0x10ffffUL) {
453 goto error_encoding;
454 }
11fdf7f2 455#if defined(DUK_USE_STRICT_UTF8_SOURCE)
7c673cae
FG
456 if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
457 goto error_encoding;
458 }
459#endif
460
461 /* input offset tracking */
462 lex_ctx->input_offset += len;
463
464 /* line tracking */
465 DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
466 if ((x == 0x2028UL) || (x == 0x2029UL)) {
467 lex_ctx->input_line++;
468 }
469
470 return (duk_codepoint_t) x;
471
472 error_clipped: /* clipped codepoint */
473 error_encoding: /* invalid codepoint encoding or codepoint */
11fdf7f2 474 DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed");
7c673cae
FG
475 return 0;
476}
477
478DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
479 duk_small_uint_t keep_bytes;
480 duk_lexer_codepoint *cp, *cp_end;
481
482 DUK_ASSERT_DISABLE(count_bytes >= 0); /* unsigned */
483 DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
484
485 /* Zero 'count' is also allowed to make call sites easier. */
486
487 keep_bytes = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes;
488 DUK_MEMMOVE((void *) lex_ctx->window,
489 (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes),
11fdf7f2 490 (size_t) keep_bytes);
7c673cae
FG
491
492 cp = (duk_lexer_codepoint *) ((duk_uint8_t *) lex_ctx->window + keep_bytes);
493 cp_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE;
494 for (; cp != cp_end; cp++) {
495 cp->offset = lex_ctx->input_offset;
496 cp->line = lex_ctx->input_line;
497 cp->codepoint = duk__read_char(lex_ctx);
498 }
499}
500
501DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
502 /* Call with count == DUK_LEXER_WINDOW_SIZE to fill buffer initially. */
503 duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)); /* fill window */
504}
505#endif /* DUK_USE_LEXER_SLIDING_WINDOW */
506
507/*
508 * (Re)initialize the temporary byte buffer. May be called extra times
509 * with little impact.
510 */
511
512DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) {
513 /* Reuse buffer as is unless buffer has grown large. */
514 if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) {
515 /* Keep current size */
516 } else {
517 duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT);
518 }
519
520 DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf);
521}
522
523/*
524 * Append a Unicode codepoint to the temporary byte buffer. Performs
525 * CESU-8 surrogate pair encoding for codepoints above the BMP.
526 * Existing surrogate pairs are allowed and also encoded into CESU-8.
527 */
528
529DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
530 /*
531 * Since character data is only generated by decoding the source or by
532 * the compiler itself, we rely on the input codepoints being correct
533 * and avoid a check here.
534 *
535 * Character data can also come here through decoding of Unicode
536 * escapes ("\udead\ubeef") so all 16-but unsigned values can be
537 * present, even when the source file itself is strict UTF-8.
538 */
539
540 DUK_ASSERT(x >= 0 && x <= 0x10ffff);
541
542 DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x);
543}
544
545/*
546 * Intern the temporary byte buffer into a valstack slot
547 * (in practice, slot1 or slot2).
548 */
549
550DUK_LOCAL void duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) {
551 duk_context *ctx = (duk_context *) lex_ctx->thr;
552
553 DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx);
554
555 DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw);
556 duk_replace(ctx, valstack_idx);
557}
558
559/*
560 * Init lexer context
561 */
562
563DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) {
564 DUK_ASSERT(lex_ctx != NULL);
565
566 DUK_MEMZERO(lex_ctx, sizeof(*lex_ctx));
11fdf7f2 567#if defined(DUK_USE_EXPLICIT_NULL_INIT)
7c673cae
FG
568#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
569 lex_ctx->window = NULL;
570#endif
571 lex_ctx->thr = NULL;
572 lex_ctx->input = NULL;
573 lex_ctx->buf = NULL;
574#endif
575}
576
577/*
578 * Set lexer input position and reinitialize lookup window.
579 */
580
581/* NB: duk_lexer_getpoint() is a macro only */
582
583DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
584 DUK_ASSERT_DISABLE(pt->offset >= 0); /* unsigned */
585 DUK_ASSERT(pt->line >= 1);
586 lex_ctx->input_offset = pt->offset;
587 lex_ctx->input_line = pt->line;
588 duk__init_lexer_window(lex_ctx);
589}
590
591/*
592 * Lexing helpers
593 */
594
595/* numeric value of a hex digit (also covers octal and decimal digits) */
596DUK_LOCAL duk_codepoint_t duk__hexval(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
597 duk_small_int_t t;
598
599 /* Here 'x' is a Unicode codepoint */
600 if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
601 t = duk_hex_dectab[x];
602 if (DUK_LIKELY(t >= 0)) {
603 return t;
604 }
605 }
606
607 /* Throwing an error this deep makes the error rather vague, but
608 * saves hundreds of bytes of code.
609 */
11fdf7f2 610 DUK_ERROR_SYNTAX(lex_ctx->thr, "decode error");
7c673cae
FG
611 return 0;
612}
613
614/* having this as a separate function provided a size benefit */
615DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) {
616 if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
617 return (duk_hex_dectab[x] >= 0);
618 }
619 return 0;
620}
621
622DUK_LOCAL duk_codepoint_t duk__decode_hexesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) {
623 /* validation performed by duk__hexval */
624 return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 4) |
625 (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint));
626}
627
628DUK_LOCAL duk_codepoint_t duk__decode_uniesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) {
629 /* validation performed by duk__hexval */
630 return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 12) |
631 (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint) << 8) |
632 (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 2].codepoint) << 4) |
633 (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 3].codepoint));
634}
635
636/*
637 * Parse Ecmascript source InputElementDiv or InputElementRegExp
638 * (E5 Section 7), skipping whitespace, comments, and line terminators.
639 *
640 * Possible results are:
641 * (1) a token
642 * (2) a line terminator (skipped)
643 * (3) a comment (skipped)
644 * (4) EOF
645 *
646 * White space is automatically skipped from the current position (but
647 * not after the input element). If input has already ended, returns
648 * DUK_TOK_EOF indefinitely. If a parse error occurs, uses an DUK_ERROR()
649 * macro call (and hence a longjmp through current heap longjmp context).
650 * Comments and line terminator tokens are automatically skipped.
651 *
652 * The input element being matched is determined by regexp_mode; if set,
653 * parses a InputElementRegExp, otherwise a InputElementDiv. The
654 * difference between these are handling of productions starting with a
655 * forward slash.
656 *
657 * If strict_mode is set, recognizes additional future reserved words
658 * specific to strict mode, and refuses to parse octal literals.
659 *
660 * The matching strategy below is to (currently) use a six character
661 * lookup window to quickly determine which production is the -longest-
662 * matching one, and then parse that. The top-level if-else clauses
663 * match the first character, and the code blocks for each clause
664 * handle -all- alternatives for that first character. Ecmascript
665 * specification uses the "longest match wins" semantics, so the order
666 * of the if-clauses matters.
667 *
668 * Misc notes:
669 *
670 * * Ecmascript numeric literals do not accept a sign character.
671 * Consequently e.g. "-1.0" is parsed as two tokens: a negative
672 * sign and a positive numeric literal. The compiler performs
673 * the negation during compilation, so this has no adverse impact.
674 *
675 * * There is no token for "undefined": it is just a value available
676 * from the global object (or simply established by doing a reference
677 * to an undefined value).
678 *
679 * * Some contexts want Identifier tokens, which are IdentifierNames
680 * excluding reserved words, while some contexts want IdentifierNames
681 * directly. In the latter case e.g. "while" is interpreted as an
682 * identifier name, not a DUK_TOK_WHILE token. The solution here is
683 * to provide both token types: DUK_TOK_WHILE goes to 't' while
684 * DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains
685 * the identifier / keyword name.
686 *
687 * * Directive prologue needs to identify string literals such as
688 * "use strict" and 'use strict', which are sensitive to line
689 * continuations and escape sequences. For instance, "use\u0020strict"
690 * is a valid directive but is distinct from "use strict". The solution
691 * here is to decode escapes while tokenizing, but to keep track of the
692 * number of escapes. Directive detection can then check that the
693 * number of escapes is zero.
694 *
695 * * Multi-line comments with one or more internal LineTerminator are
696 * treated like a line terminator to comply with automatic semicolon
697 * insertion.
698 */
699
700DUK_INTERNAL
701void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
702 duk_token *out_token,
703 duk_bool_t strict_mode,
704 duk_bool_t regexp_mode) {
705 duk_codepoint_t x; /* temporary, must be signed and 32-bit to hold Unicode code points */
706 duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end,
707 * init is unnecessary but suppresses "may be used uninitialized" warnings.
708 */
709 duk_bool_t got_lineterm = 0; /* got lineterm preceding non-whitespace, non-lineterm token */
710
711 if (++lex_ctx->token_count >= lex_ctx->token_limit) {
11fdf7f2 712 DUK_ERROR_RANGE(lex_ctx->thr, "token limit");
7c673cae
FG
713 return; /* unreachable */
714 }
715
716 out_token->t = DUK_TOK_EOF;
717 out_token->t_nores = -1; /* marker: copy t if not changed */
718#if 0 /* not necessary to init, disabled for faster parsing */
719 out_token->num = DUK_DOUBLE_NAN;
720 out_token->str1 = NULL;
721 out_token->str2 = NULL;
722#endif
723 out_token->num_escapes = 0;
724 /* out_token->lineterm set by caller */
725
726 /* This would be nice, but parsing is faster without resetting the
727 * value slots. The only side effect is that references to temporary
728 * string values may linger until lexing is finished; they're then
729 * freed normally.
730 */
731#if 0
732 duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
733 duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx);
734#endif
735
736 /* 'advtok' indicates how much to advance and which token id to assign
737 * at the end. This shared functionality minimizes code size. All
738 * code paths are required to set 'advtok' to some value, so no default
739 * init value is used. Code paths calling DUK_ERROR() never return so
740 * they don't need to set advtok.
741 */
742
743 /*
744 * Matching order:
745 *
746 * Punctuator first chars, also covers comments, regexps
747 * LineTerminator
748 * Identifier or reserved word, also covers null/true/false literals
749 * NumericLiteral
750 * StringLiteral
751 * EOF
752 *
753 * The order does not matter as long as the longest match is
754 * always correctly identified. There are order dependencies
755 * in the clauses, so it's not trivial to convert to a switch.
756 */
757
758 restart_lineupdate:
759 out_token->start_line = lex_ctx->window[0].line;
760
761 restart:
762 out_token->start_offset = lex_ctx->window[0].offset;
763
764 x = DUK__L0();
765
766 switch (x) {
767 case DUK_ASC_SPACE:
768 case DUK_ASC_HT: /* fast paths for space and tab */
769 DUK__ADVANCECHARS(lex_ctx, 1);
770 goto restart;
771 case DUK_ASC_LF: /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */
772 DUK__ADVANCECHARS(lex_ctx, 1);
773 got_lineterm = 1;
774 goto restart_lineupdate;
775 case DUK_ASC_SLASH: /* '/' */
776 if (DUK__L1() == '/') {
777 /*
778 * E5 Section 7.4, allow SourceCharacter (which is any 16-bit
779 * code point).
780 */
781
782 /* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but it unnecessary */
783 for (;;) {
784 x = DUK__L0();
785 if (x < 0 || duk_unicode_is_line_terminator(x)) {
786 break;
787 }
788 DUK__ADVANCECHARS(lex_ctx, 1);
789 }
790 goto restart; /* line terminator will be handled on next round */
791 } else if (DUK__L1() == '*') {
792 /*
793 * E5 Section 7.4. If the multi-line comment contains a newline,
794 * it is treated like a single line terminator for automatic
795 * semicolon insertion.
796 */
797
798 duk_bool_t last_asterisk = 0;
799 DUK__ADVANCECHARS(lex_ctx, 2);
800 for (;;) {
801 x = DUK__L0();
802 if (x < 0) {
11fdf7f2 803 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in multiline comment");
7c673cae
FG
804 }
805 DUK__ADVANCECHARS(lex_ctx, 1);
806 if (last_asterisk && x == '/') {
807 break;
808 }
809 if (duk_unicode_is_line_terminator(x)) {
810 got_lineterm = 1;
811 }
812 last_asterisk = (x == '*');
813 }
814 goto restart_lineupdate;
815 } else if (regexp_mode) {
11fdf7f2 816#if defined(DUK_USE_REGEXP_SUPPORT)
7c673cae
FG
817 /*
818 * "/" followed by something in regexp mode. See E5 Section 7.8.5.
819 *
820 * RegExp parsing is a bit complex. First, the regexp body is delimited
821 * by forward slashes, but the body may also contain forward slashes as
822 * part of an escape sequence or inside a character class (delimited by
823 * square brackets). A mini state machine is used to implement these.
824 *
825 * Further, an early (parse time) error must be thrown if the regexp
826 * would cause a run-time error when used in the expression new RegExp(...).
827 * Parsing here simply extracts the (candidate) regexp, and also accepts
828 * invalid regular expressions (which are delimited properly). The caller
829 * (compiler) must perform final validation and regexp compilation.
830 *
831 * RegExp first char may not be '/' (single line comment) or '*' (multi-
832 * line comment). These have already been checked above, so there is no
833 * need below for special handling of the first regexp character as in
834 * the E5 productions.
835 *
836 * About unicode escapes within regexp literals:
837 *
838 * E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes.
839 * However, Section 6 states that regexps accept the escapes,
840 * see paragraph starting with "In string literals...".
841 * The regexp grammar, which sees the decoded regexp literal
842 * (after lexical parsing) DOES have a \uHHHH unicode escape.
843 * So, for instance:
844 *
845 * /\u1234/
846 *
847 * should first be parsed by the lexical grammar as:
848 *
849 * '\' 'u' RegularExpressionBackslashSequence
850 * '1' RegularExpressionNonTerminator
851 * '2' RegularExpressionNonTerminator
852 * '3' RegularExpressionNonTerminator
853 * '4' RegularExpressionNonTerminator
854 *
855 * and the escape itself is then parsed by the regexp engine.
856 * This is the current implementation.
857 *
858 * Minor spec inconsistency:
859 *
860 * E5 Section 7.8.5 RegularExpressionBackslashSequence is:
861 *
862 * \ RegularExpressionNonTerminator
863 *
864 * while Section A.1 RegularExpressionBackslashSequence is:
865 *
866 * \ NonTerminator
867 *
868 * The latter is not normative and a typo.
869 *
870 */
871
872 /* first, parse regexp body roughly */
873
874 duk_small_int_t state = 0; /* 0=base, 1=esc, 2=class, 3=class+esc */
875
876 DUK__INITBUFFER(lex_ctx);
877 for (;;) {
878 DUK__ADVANCECHARS(lex_ctx, 1); /* skip opening slash on first loop */
879 x = DUK__L0();
880 if (x < 0 || duk_unicode_is_line_terminator(x)) {
11fdf7f2 881 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in regexp");
7c673cae
FG
882 }
883 x = DUK__L0(); /* re-read to avoid spill / fetch */
884 if (state == 0) {
885 if (x == '/') {
886 DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing slash */
887 break;
888 } else if (x == '\\') {
889 state = 1;
890 } else if (x == '[') {
891 state = 2;
892 }
893 } else if (state == 1) {
894 state = 0;
895 } else if (state == 2) {
896 if (x == ']') {
897 state = 0;
898 } else if (x == '\\') {
899 state = 3;
900 }
901 } else { /* state == 3 */
902 state = 2;
903 }
904 DUK__APPENDBUFFER(lex_ctx, x);
905 }
906 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
907 out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
908
909 /* second, parse flags */
910
911 DUK__INITBUFFER(lex_ctx);
912 for (;;) {
913 x = DUK__L0();
914 if (!duk_unicode_is_identifier_part(x)) {
915 break;
916 }
917 x = DUK__L0(); /* re-read to avoid spill / fetch */
918 DUK__APPENDBUFFER(lex_ctx, x);
919 DUK__ADVANCECHARS(lex_ctx, 1);
920 }
921 duk__internbuffer(lex_ctx, lex_ctx->slot2_idx);
922 out_token->str2 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx);
923
924 DUK__INITBUFFER(lex_ctx); /* free some memory */
925
926 /* validation of the regexp is caller's responsibility */
927
928 advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP);
929#else
11fdf7f2 930 DUK_ERROR_SYNTAX(lex_ctx->thr, "regexp support disabled");
7c673cae
FG
931#endif
932 } else if (DUK__L1() == '=') {
933 /* "/=" and not in regexp mode */
934 advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ);
935 } else {
936 /* "/" and not in regexp mode */
937 advtok = DUK__ADVTOK(1, DUK_TOK_DIV);
938 }
939 break;
940 case DUK_ASC_LCURLY: /* '{' */
941 advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY);
942 break;
943 case DUK_ASC_RCURLY: /* '}' */
944 advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY);
945 break;
946 case DUK_ASC_LPAREN: /* '(' */
947 advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN);
948 break;
949 case DUK_ASC_RPAREN: /* ')' */
950 advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN);
951 break;
952 case DUK_ASC_LBRACKET: /* '[' */
953 advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET);
954 break;
955 case DUK_ASC_RBRACKET: /* ']' */
956 advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET);
957 break;
958 case DUK_ASC_PERIOD: /* '.' */
959 if (DUK__ISDIGIT(DUK__L1())) {
960 /* Period followed by a digit can only start DecimalLiteral
961 * (handled in slow path). We could jump straight into the
962 * DecimalLiteral handling but should avoid goto to inside
963 * a block.
964 */
965 goto slow_path;
966 }
967 advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD);
968 break;
969 case DUK_ASC_SEMICOLON: /* ';' */
970 advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON);
971 break;
972 case DUK_ASC_COMMA: /* ',' */
973 advtok = DUK__ADVTOK(1, DUK_TOK_COMMA);
974 break;
975 case DUK_ASC_LANGLE: /* '<' */
976 if (DUK__L1() == '<' && DUK__L2() == '=') {
977 advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ);
978 } else if (DUK__L1() == '=') {
979 advtok = DUK__ADVTOK(2, DUK_TOK_LE);
980 } else if (DUK__L1() == '<') {
981 advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT);
982 } else {
983 advtok = DUK__ADVTOK(1, DUK_TOK_LT);
984 }
985 break;
986 case DUK_ASC_RANGLE: /* '>' */
987 if (DUK__L1() == '>' && DUK__L2() == '>' && DUK__L3() == '=') {
988 advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ);
989 } else if (DUK__L1() == '>' && DUK__L2() == '>') {
990 advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT);
991 } else if (DUK__L1() == '>' && DUK__L2() == '=') {
992 advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ);
993 } else if (DUK__L1() == '=') {
994 advtok = DUK__ADVTOK(2, DUK_TOK_GE);
995 } else if (DUK__L1() == '>') {
996 advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT);
997 } else {
998 advtok = DUK__ADVTOK(1, DUK_TOK_GT);
999 }
1000 break;
1001 case DUK_ASC_EQUALS: /* '=' */
1002 if (DUK__L1() == '=' && DUK__L2() == '=') {
1003 advtok = DUK__ADVTOK(3, DUK_TOK_SEQ);
1004 } else if (DUK__L1() == '=') {
1005 advtok = DUK__ADVTOK(2, DUK_TOK_EQ);
1006 } else {
1007 advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN);
1008 }
1009 break;
1010 case DUK_ASC_EXCLAMATION: /* '!' */
1011 if (DUK__L1() == '=' && DUK__L2() == '=') {
1012 advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ);
1013 } else if (DUK__L1() == '=') {
1014 advtok = DUK__ADVTOK(2, DUK_TOK_NEQ);
1015 } else {
1016 advtok = DUK__ADVTOK(1, DUK_TOK_LNOT);
1017 }
1018 break;
1019 case DUK_ASC_PLUS: /* '+' */
1020 if (DUK__L1() == '+') {
1021 advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT);
1022 } else if (DUK__L1() == '=') {
1023 advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ);
1024 } else {
1025 advtok = DUK__ADVTOK(1, DUK_TOK_ADD);
1026 }
1027 break;
1028 case DUK_ASC_MINUS: /* '-' */
1029 if (DUK__L1() == '-') {
1030 advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT);
1031 } else if (DUK__L1() == '=') {
1032 advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ);
1033 } else {
1034 advtok = DUK__ADVTOK(1, DUK_TOK_SUB);
1035 }
1036 break;
1037 case DUK_ASC_STAR: /* '*' */
1038 if (DUK__L1() == '=') {
1039 advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ);
1040 } else {
1041 advtok = DUK__ADVTOK(1, DUK_TOK_MUL);
1042 }
1043 break;
1044 case DUK_ASC_PERCENT: /* '%' */
1045 if (DUK__L1() == '=') {
1046 advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ);
1047 } else {
1048 advtok = DUK__ADVTOK(1, DUK_TOK_MOD);
1049 }
1050 break;
1051 case DUK_ASC_AMP: /* '&' */
1052 if (DUK__L1() == '&') {
1053 advtok = DUK__ADVTOK(2, DUK_TOK_LAND);
1054 } else if (DUK__L1() == '=') {
1055 advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ);
1056 } else {
1057 advtok = DUK__ADVTOK(1, DUK_TOK_BAND);
1058 }
1059 break;
1060 case DUK_ASC_PIPE: /* '|' */
1061 if (DUK__L1() == '|') {
1062 advtok = DUK__ADVTOK(2, DUK_TOK_LOR);
1063 } else if (DUK__L1() == '=') {
1064 advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ);
1065 } else {
1066 advtok = DUK__ADVTOK(1, DUK_TOK_BOR);
1067 }
1068 break;
1069 case DUK_ASC_CARET: /* '^' */
1070 if (DUK__L1() == '=') {
1071 advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ);
1072 } else {
1073 advtok = DUK__ADVTOK(1, DUK_TOK_BXOR);
1074 }
1075 break;
1076 case DUK_ASC_TILDE: /* '~' */
1077 advtok = DUK__ADVTOK(1, DUK_TOK_BNOT);
1078 break;
1079 case DUK_ASC_QUESTION: /* '?' */
1080 advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION);
1081 break;
1082 case DUK_ASC_COLON: /* ':' */
1083 advtok = DUK__ADVTOK(1, DUK_TOK_COLON);
1084 break;
1085 case DUK_ASC_DOUBLEQUOTE: /* '"' */
1086 case DUK_ASC_SINGLEQUOTE: { /* '\'' */
1087 duk_small_int_t quote = x; /* Note: duk_uint8_t type yields larger code */
1088 duk_small_int_t adv;
1089
1090 DUK__INITBUFFER(lex_ctx);
1091 for (;;) {
1092 DUK__ADVANCECHARS(lex_ctx, 1); /* eat opening quote on first loop */
1093 x = DUK__L0();
1094 if (x < 0 || duk_unicode_is_line_terminator(x)) {
11fdf7f2 1095 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal");
7c673cae
FG
1096 }
1097 if (x == quote) {
1098 DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing quote */
1099 break;
1100 }
1101 if (x == '\\') {
1102 /* DUK__L0 -> '\' char
1103 * DUK__L1 ... DUK__L5 -> more lookup
1104 */
1105
1106 x = DUK__L1();
1107
1108 /* How much to advance before next loop; note that next loop
1109 * will advance by 1 anyway, so -1 from the total escape
1110 * length (e.g. len('\uXXXX') - 1 = 6 - 1). As a default,
1111 * 1 is good.
1112 */
1113 adv = 2 - 1; /* note: long live range */
1114
1115 if (x < 0) {
11fdf7f2 1116 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal");
7c673cae
FG
1117 }
1118 if (duk_unicode_is_line_terminator(x)) {
1119 /* line continuation */
1120 if (x == 0x000d && DUK__L2() == 0x000a) {
1121 /* CR LF again a special case */
1122 adv = 3 - 1;
1123 }
1124 } else if (x == '\'') {
1125 DUK__APPENDBUFFER(lex_ctx, 0x0027);
1126 } else if (x == '"') {
1127 DUK__APPENDBUFFER(lex_ctx, 0x0022);
1128 } else if (x == '\\') {
1129 DUK__APPENDBUFFER(lex_ctx, 0x005c);
1130 } else if (x == 'b') {
1131 DUK__APPENDBUFFER(lex_ctx, 0x0008);
1132 } else if (x == 'f') {
1133 DUK__APPENDBUFFER(lex_ctx, 0x000c);
1134 } else if (x == 'n') {
1135 DUK__APPENDBUFFER(lex_ctx, 0x000a);
1136 } else if (x == 'r') {
1137 DUK__APPENDBUFFER(lex_ctx, 0x000d);
1138 } else if (x == 't') {
1139 DUK__APPENDBUFFER(lex_ctx, 0x0009);
1140 } else if (x == 'v') {
1141 DUK__APPENDBUFFER(lex_ctx, 0x000b);
1142 } else if (x == 'x') {
1143 adv = 4 - 1;
1144 DUK__APPENDBUFFER(lex_ctx, duk__decode_hexesc_from_window(lex_ctx, 2));
1145 } else if (x == 'u') {
1146 adv = 6 - 1;
1147 DUK__APPENDBUFFER(lex_ctx, duk__decode_uniesc_from_window(lex_ctx, 2));
1148 } else if (DUK__ISDIGIT(x)) {
1149 duk_codepoint_t ch = 0; /* initialized to avoid warnings of unused var */
1150
1151 /*
1152 * Octal escape or zero escape:
1153 * \0 (lookahead not DecimalDigit)
1154 * \1 ... \7 (lookahead not DecimalDigit)
1155 * \ZeroToThree OctalDigit (lookahead not DecimalDigit)
1156 * \FourToSeven OctalDigit (no lookahead restrictions)
1157 * \ZeroToThree OctalDigit OctalDigit (no lookahead restrictions)
1158 *
1159 * Zero escape is part of the standard syntax. Octal escapes are
1160 * defined in E5 Section B.1.2, and are only allowed in non-strict mode.
1161 * Any other productions starting with a decimal digit are invalid.
1162 */
1163
1164 if (x == '0' && !DUK__ISDIGIT(DUK__L2())) {
1165 /* Zero escape (also allowed in non-strict mode) */
1166 ch = 0;
1167 /* adv = 2 - 1 default OK */
11fdf7f2 1168#if defined(DUK_USE_OCTAL_SUPPORT)
7c673cae
FG
1169 } else if (strict_mode) {
1170 /* No other escape beginning with a digit in strict mode */
11fdf7f2 1171 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal");
7c673cae
FG
1172 } else if (DUK__ISDIGIT03(x) && DUK__ISOCTDIGIT(DUK__L2()) && DUK__ISOCTDIGIT(DUK__L3())) {
1173 /* Three digit octal escape, digits validated. */
1174 adv = 4 - 1;
1175 ch = (duk__hexval(lex_ctx, x) << 6) +
1176 (duk__hexval(lex_ctx, DUK__L2()) << 3) +
1177 duk__hexval(lex_ctx, DUK__L3());
1178 } else if (((DUK__ISDIGIT03(x) && !DUK__ISDIGIT(DUK__L3())) || DUK__ISDIGIT47(x)) &&
1179 DUK__ISOCTDIGIT(DUK__L2())) {
1180 /* Two digit octal escape, digits validated.
1181 *
1182 * The if-condition is a bit tricky. We could catch e.g.
1183 * '\039' in the three-digit escape and fail it there (by
1184 * validating the digits), but we want to avoid extra
1185 * additional validation code.
1186 */
1187 adv = 3 - 1;
1188 ch = (duk__hexval(lex_ctx, x) << 3) +
1189 duk__hexval(lex_ctx, DUK__L2());
1190 } else if (DUK__ISDIGIT(x) && !DUK__ISDIGIT(DUK__L2())) {
1191 /* One digit octal escape, digit validated. */
1192 /* adv = 2 default OK */
1193 ch = duk__hexval(lex_ctx, x);
1194#else
1195 /* fall through to error */
1196#endif
1197 } else {
11fdf7f2 1198 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal");
7c673cae
FG
1199 }
1200
1201 DUK__APPENDBUFFER(lex_ctx, ch);
1202 } else {
1203 /* escaped NonEscapeCharacter */
1204 DUK__APPENDBUFFER(lex_ctx, x);
1205 }
1206 DUK__ADVANCECHARS(lex_ctx, adv);
1207
1208 /* Track number of escapes; count not really needed but directive
1209 * prologues need to detect whether there were any escapes or line
1210 * continuations or not.
1211 */
1212 out_token->num_escapes++;
1213 } else {
1214 /* part of string */
1215 DUK__APPENDBUFFER(lex_ctx, x);
1216 }
1217 }
1218
1219 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1220 out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1221
1222 DUK__INITBUFFER(lex_ctx); /* free some memory */
1223
1224 advtok = DUK__ADVTOK(0, DUK_TOK_STRING);
1225 break;
1226 }
1227 default:
1228 goto slow_path;
1229 } /* switch */
1230
1231 goto skip_slow_path;
1232
1233 slow_path:
1234 if (duk_unicode_is_line_terminator(x)) {
1235 if (x == 0x000d && DUK__L1() == 0x000a) {
1236 /*
1237 * E5 Section 7.3: CR LF is detected as a single line terminator for
1238 * line numbers. Here we also detect it as a single line terminator
1239 * token.
1240 */
1241 DUK__ADVANCECHARS(lex_ctx, 2);
1242 } else {
1243 DUK__ADVANCECHARS(lex_ctx, 1);
1244 }
1245 got_lineterm = 1;
1246 goto restart_lineupdate;
1247 } else if (duk_unicode_is_identifier_start(x) || x == '\\') {
1248 /*
1249 * Parse an identifier and then check whether it is:
1250 * - reserved word (keyword or other reserved word)
1251 * - "null" (NullLiteral)
1252 * - "true" (BooleanLiteral)
1253 * - "false" (BooleanLiteral)
1254 * - anything else => identifier
1255 *
1256 * This does not follow the E5 productions cleanly, but is
1257 * useful and compact.
1258 *
1259 * Note that identifiers may contain Unicode escapes,
1260 * see E5 Sections 6 and 7.6. They must be decoded first,
1261 * and the result checked against allowed characters.
1262 * The above if-clause accepts an identifier start and an
1263 * '\' character -- no other token can begin with a '\'.
1264 *
1265 * Note that "get" and "set" are not reserved words in E5
1266 * specification so they are recognized as plain identifiers
1267 * (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not
1268 * used now). The compiler needs to work around this.
1269 *
1270 * Strictly speaking, following Ecmascript longest match
1271 * specification, an invalid escape for the first character
1272 * should cause a syntax error. However, an invalid escape
1273 * for IdentifierParts should just terminate the identifier
1274 * early (longest match), and let the next tokenization
1275 * fail. For instance Rhino croaks with 'foo\z' when
1276 * parsing the identifier. This has little practical impact.
1277 */
1278
1279 duk_small_int_t i, i_end;
1280 duk_bool_t first = 1;
1281 duk_hstring *str;
1282
1283 DUK__INITBUFFER(lex_ctx);
1284 for (;;) {
1285 /* re-lookup first char on first loop */
1286 if (DUK__L0() == '\\') {
1287 duk_codepoint_t ch;
1288 if (DUK__L1() != 'u') {
11fdf7f2 1289 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier");
7c673cae
FG
1290 }
1291
1292 ch = duk__decode_uniesc_from_window(lex_ctx, 2);
1293
1294 /* IdentifierStart is stricter than IdentifierPart, so if the first
1295 * character is escaped, must have a stricter check here.
1296 */
1297 if (!(first ? duk_unicode_is_identifier_start(ch) : duk_unicode_is_identifier_part(ch))) {
11fdf7f2 1298 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier");
7c673cae
FG
1299 }
1300 DUK__APPENDBUFFER(lex_ctx, ch);
1301 DUK__ADVANCECHARS(lex_ctx, 6);
1302
1303 /* Track number of escapes: necessary for proper keyword
1304 * detection.
1305 */
1306 out_token->num_escapes++;
1307 } else {
1308 /* Note: first character is checked against this. But because
1309 * IdentifierPart includes all IdentifierStart characters, and
1310 * the first character (if unescaped) has already been checked
1311 * in the if condition, this is OK.
1312 */
1313 if (!duk_unicode_is_identifier_part(DUK__L0())) {
1314 break;
1315 }
1316 DUK__APPENDBUFFER(lex_ctx, DUK__L0());
1317 DUK__ADVANCECHARS(lex_ctx, 1);
1318 }
1319 first = 0;
1320 }
1321
1322 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1323 out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1324 str = out_token->str1;
1325 DUK_ASSERT(str != NULL);
1326 out_token->t_nores = DUK_TOK_IDENTIFIER;
1327
1328 DUK__INITBUFFER(lex_ctx); /* free some memory */
1329
1330 /*
1331 * Interned identifier is compared against reserved words, which are
11fdf7f2 1332 * currently interned into the heap context. See genbuiltins.py.
7c673cae
FG
1333 *
1334 * Note that an escape in the identifier disables recognition of
1335 * keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to
1336 * identifier named "if"). This is not necessarily compliant,
1337 * see test-dec-escaped-char-in-keyword.js.
1338 *
1339 * Note: "get" and "set" are awkward. They are not officially
1340 * ReservedWords (and indeed e.g. "var set = 1;" is valid), and
1341 * must come out as DUK_TOK_IDENTIFIER. The compiler needs to
1342 * work around this a bit.
1343 */
1344
1345 /* XXX: optimize by adding the token numbers directly into the
1346 * always interned duk_hstring objects (there should be enough
1347 * flag bits free for that)?
1348 */
1349
1350 i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED);
1351
1352 advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER);
1353 if (out_token->num_escapes == 0) {
1354 for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) {
1355 DUK_ASSERT(i >= 0 && i < DUK_HEAP_NUM_STRINGS);
1356 if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) {
1357 advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i));
1358 break;
1359 }
1360 }
1361 }
1362 } else if (DUK__ISDIGIT(x) || (x == '.')) {
1363 /* Note: decimal number may start with a period, but must be followed by a digit */
1364
1365 /*
1366 * DecimalLiteral, HexIntegerLiteral, OctalIntegerLiteral
1367 * "pre-parsing", followed by an actual, accurate parser step.
1368 *
1369 * Note: the leading sign character ('+' or '-') is -not- part of
1370 * the production in E5 grammar, and that a DecimalLiteral
1371 * starting with a '0' must be followed by a non-digit. Leading
1372 * zeroes are syntax errors and must be checked for.
1373 *
1374 * XXX: the two step parsing process is quite awkward, it would
1375 * be more straightforward to allow numconv to parse the longest
1376 * valid prefix (it already does that, it only needs to indicate
1377 * where the input ended). However, the lexer decodes characters
1378 * using a lookup window, so this is not a trivial change.
1379 */
1380
1381 /* XXX: because of the final check below (that the literal is not
1382 * followed by a digit), this could maybe be simplified, if we bail
1383 * out early from a leading zero (and if there are no periods etc).
1384 * Maybe too complex.
1385 */
1386
1387 duk_double_t val;
1388 duk_bool_t int_only = 0;
1389 duk_bool_t allow_hex = 0;
1390 duk_small_int_t state; /* 0=before period/exp,
1391 * 1=after period, before exp
1392 * 2=after exp, allow '+' or '-'
1393 * 3=after exp and exp sign
1394 */
1395 duk_small_uint_t s2n_flags;
1396 duk_codepoint_t y;
1397
1398 DUK__INITBUFFER(lex_ctx);
1399 y = DUK__L1();
1400 if (x == '0' && (y == 'x' || y == 'X')) {
1401 DUK__APPENDBUFFER(lex_ctx, x);
1402 DUK__APPENDBUFFER(lex_ctx, y);
1403 DUK__ADVANCECHARS(lex_ctx, 2);
1404 int_only = 1;
1405 allow_hex = 1;
11fdf7f2 1406#if defined(DUK_USE_OCTAL_SUPPORT)
7c673cae
FG
1407 } else if (!strict_mode && x == '0' && DUK__ISDIGIT(y)) {
1408 /* Note: if DecimalLiteral starts with a '0', it can only be
1409 * followed by a period or an exponent indicator which starts
1410 * with 'e' or 'E'. Hence the if-check above ensures that
1411 * OctalIntegerLiteral is the only valid NumericLiteral
1412 * alternative at this point (even if y is, say, '9').
1413 */
1414
1415 DUK__APPENDBUFFER(lex_ctx, x);
1416 DUK__ADVANCECHARS(lex_ctx, 1);
1417 int_only = 1;
1418#endif
1419 }
1420
1421 state = 0;
1422 for (;;) {
1423 x = DUK__L0(); /* re-lookup curr char on first round */
1424 if (DUK__ISDIGIT(x)) {
1425 /* Note: intentionally allow leading zeroes here, as the
1426 * actual parser will check for them.
1427 */
1428 if (state == 2) {
1429 state = 3;
1430 }
1431 } else if (allow_hex && DUK__ISHEXDIGIT(x)) {
1432 /* Note: 'e' and 'E' are also accepted here. */
1433 ;
1434 } else if (x == '.') {
1435 if (state >= 1 || int_only) {
1436 break;
1437 } else {
1438 state = 1;
1439 }
1440 } else if (x == 'e' || x == 'E') {
1441 if (state >= 2 || int_only) {
1442 break;
1443 } else {
1444 state = 2;
1445 }
1446 } else if (x == '-' || x == '+') {
1447 if (state != 2) {
1448 break;
1449 } else {
1450 state = 3;
1451 }
1452 } else {
1453 break;
1454 }
1455 DUK__APPENDBUFFER(lex_ctx, x);
1456 DUK__ADVANCECHARS(lex_ctx, 1);
1457 }
1458
1459 /* XXX: better coercion */
1460 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1461
1462 s2n_flags = DUK_S2N_FLAG_ALLOW_EXP |
1463 DUK_S2N_FLAG_ALLOW_FRAC |
1464 DUK_S2N_FLAG_ALLOW_NAKED_FRAC |
1465 DUK_S2N_FLAG_ALLOW_EMPTY_FRAC |
11fdf7f2 1466#if defined(DUK_USE_OCTAL_SUPPORT)
7c673cae
FG
1467 (strict_mode ? 0 : DUK_S2N_FLAG_ALLOW_AUTO_OCT_INT) |
1468#endif
1469 DUK_S2N_FLAG_ALLOW_AUTO_HEX_INT;
1470
1471 duk_dup((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1472 duk_numconv_parse((duk_context *) lex_ctx->thr, 10 /*radix*/, s2n_flags);
1473 val = duk_to_number((duk_context *) lex_ctx->thr, -1);
1474 if (DUK_ISNAN(val)) {
11fdf7f2 1475 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal");
7c673cae
FG
1476 }
1477 duk_replace((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); /* could also just pop? */
1478
1479 DUK__INITBUFFER(lex_ctx); /* free some memory */
1480
1481 /* Section 7.8.3 (note): NumericLiteral must be followed by something other than
1482 * IdentifierStart or DecimalDigit.
1483 */
1484
1485 if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) {
11fdf7f2 1486 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal");
7c673cae
FG
1487 }
1488
1489 out_token->num = val;
1490 advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER);
1491 } else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) {
1492 DUK__ADVANCECHARS(lex_ctx, 1);
1493 goto restart;
1494 } else if (x < 0) {
1495 advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
1496 } else {
11fdf7f2 1497 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid token");
7c673cae
FG
1498 }
1499 skip_slow_path:
1500
1501 /*
1502 * Shared exit path
1503 */
1504
1505 DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
1506 out_token->t = advtok & 0xff;
1507 if (out_token->t_nores < 0) {
1508 out_token->t_nores = out_token->t;
1509 }
1510 out_token->lineterm = got_lineterm;
1511
1512 /* Automatic semicolon insertion is allowed if a token is preceded
1513 * by line terminator(s), or terminates a statement list (right curly
1514 * or EOF).
1515 */
1516 if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) {
1517 out_token->allow_auto_semi = 1;
1518 } else {
1519 out_token->allow_auto_semi = 0;
1520 }
1521}
1522
11fdf7f2 1523#if defined(DUK_USE_REGEXP_SUPPORT)
7c673cae
FG
1524
1525/*
1526 * Parse a RegExp token. The grammar is described in E5 Section 15.10.
1527 * Terminal constructions (such as quantifiers) are parsed directly here.
1528 *
1529 * 0xffffffffU is used as a marker for "infinity" in quantifiers. Further,
1530 * DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that
1531 * will be accepted for a quantifier.
1532 */
1533
1534DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) {
1535 duk_small_int_t advtok = 0; /* init is unnecessary but suppresses "may be used uninitialized" warnings */
1536 duk_codepoint_t x, y;
1537
1538 if (++lex_ctx->token_count >= lex_ctx->token_limit) {
11fdf7f2 1539 DUK_ERROR_RANGE(lex_ctx->thr, "token limit");
7c673cae
FG
1540 return; /* unreachable */
1541 }
1542
1543 DUK_MEMZERO(out_token, sizeof(*out_token));
1544
1545 x = DUK__L0();
1546 y = DUK__L1();
1547
1548 DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y));
1549
1550 switch (x) {
1551 case '|': {
1552 advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION);
1553 break;
1554 }
1555 case '^': {
1556 advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START);
1557 break;
1558 }
1559 case '$': {
1560 advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END);
1561 break;
1562 }
1563 case '?': {
1564 out_token->qmin = 0;
1565 out_token->qmax = 1;
1566 if (y == '?') {
1567 advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1568 out_token->greedy = 0;
1569 } else {
1570 advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1571 out_token->greedy = 1;
1572 }
1573 break;
1574 }
1575 case '*': {
1576 out_token->qmin = 0;
1577 out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1578 if (y == '?') {
1579 advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1580 out_token->greedy = 0;
1581 } else {
1582 advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1583 out_token->greedy = 1;
1584 }
1585 break;
1586 }
1587 case '+': {
1588 out_token->qmin = 1;
1589 out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1590 if (y == '?') {
1591 advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1592 out_token->greedy = 0;
1593 } else {
1594 advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1595 out_token->greedy = 1;
1596 }
1597 break;
1598 }
1599 case '{': {
1600 /* Production allows 'DecimalDigits', including leading zeroes */
1601 duk_uint_fast32_t val1 = 0;
1602 duk_uint_fast32_t val2 = DUK_RE_QUANTIFIER_INFINITE;
1603 duk_small_int_t digits = 0;
11fdf7f2
TL
1604#if defined(DUK_USE_ES6_REGEXP_BRACES)
1605 duk_lexer_point lex_pt;
1606#endif
1607
1608#if defined(DUK_USE_ES6_REGEXP_BRACES)
1609 /* Store lexer position, restoring if quantifier is invalid. */
1610 DUK_LEXER_GETPOINT(lex_ctx, &lex_pt);
1611#endif
1612
7c673cae
FG
1613 for (;;) {
1614 DUK__ADVANCECHARS(lex_ctx, 1); /* eat '{' on entry */
1615 x = DUK__L0();
1616 if (DUK__ISDIGIT(x)) {
7c673cae
FG
1617 digits++;
1618 val1 = val1 * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x);
1619 } else if (x == ',') {
11fdf7f2
TL
1620 if (digits > DUK__MAX_RE_QUANT_DIGITS) {
1621 goto invalid_quantifier;
1622 }
7c673cae 1623 if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
11fdf7f2 1624 goto invalid_quantifier;
7c673cae
FG
1625 }
1626 if (DUK__L1() == '}') {
1627 /* form: { DecimalDigits , }, val1 = min count */
1628 if (digits == 0) {
11fdf7f2 1629 goto invalid_quantifier;
7c673cae
FG
1630 }
1631 out_token->qmin = val1;
1632 out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1633 DUK__ADVANCECHARS(lex_ctx, 2);
1634 break;
1635 }
1636 val2 = val1;
1637 val1 = 0;
1638 digits = 0; /* not strictly necessary because of lookahead '}' above */
1639 } else if (x == '}') {
11fdf7f2
TL
1640 if (digits > DUK__MAX_RE_QUANT_DIGITS) {
1641 goto invalid_quantifier;
1642 }
7c673cae 1643 if (digits == 0) {
11fdf7f2 1644 goto invalid_quantifier;
7c673cae
FG
1645 }
1646 if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
1647 /* val2 = min count, val1 = max count */
1648 out_token->qmin = val2;
1649 out_token->qmax = val1;
1650 } else {
1651 /* val1 = count */
1652 out_token->qmin = val1;
1653 out_token->qmax = val1;
1654 }
1655 DUK__ADVANCECHARS(lex_ctx, 1);
1656 break;
1657 } else {
11fdf7f2 1658 goto invalid_quantifier;
7c673cae
FG
1659 }
1660 }
1661 if (DUK__L0() == '?') {
1662 out_token->greedy = 0;
1663 DUK__ADVANCECHARS(lex_ctx, 1);
1664 } else {
1665 out_token->greedy = 1;
1666 }
1667 advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER);
1668 break;
11fdf7f2
TL
1669 invalid_quantifier:
1670#if defined(DUK_USE_ES6_REGEXP_BRACES)
1671 /* Failed to match the quantifier, restore lexer and parse
1672 * opening brace as a literal.
1673 */
1674 DUK_LEXER_SETPOINT(lex_ctx, &lex_pt);
1675 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
1676 out_token->num = '{';
1677#else
1678 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp quantifier");
1679#endif
1680 break;
7c673cae
FG
1681 }
1682 case '.': {
1683 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD);
1684 break;
1685 }
1686 case '\\': {
1687 /* The E5.1 specification does not seem to allow IdentifierPart characters
1688 * to be used as identity escapes. Unfortunately this includes '$', which
1689 * cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'.
1690 * Many other implementations (including V8 and Rhino, for instance) do
1691 * accept '\$' as a valid identity escape, which is quite pragmatic.
1692 * See: test-regexp-identity-escape-dollar.js.
1693 */
1694
1695 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR); /* default: char escape (two chars) */
1696 if (y == 'b') {
1697 advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY);
1698 } else if (y == 'B') {
1699 advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY);
1700 } else if (y == 'f') {
1701 out_token->num = 0x000c;
1702 } else if (y == 'n') {
1703 out_token->num = 0x000a;
1704 } else if (y == 't') {
1705 out_token->num = 0x0009;
1706 } else if (y == 'r') {
1707 out_token->num = 0x000d;
1708 } else if (y == 'v') {
1709 out_token->num = 0x000b;
1710 } else if (y == 'c') {
1711 x = DUK__L2();
1712 if ((x >= 'a' && x <= 'z') ||
1713 (x >= 'A' && x <= 'Z')) {
1714 out_token->num = (x % 32);
1715 advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR);
1716 } else {
11fdf7f2 1717 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
1718 }
1719 } else if (y == 'x') {
1720 out_token->num = duk__decode_hexesc_from_window(lex_ctx, 2);
1721 advtok = DUK__ADVTOK(4, DUK_RETOK_ATOM_CHAR);
1722 } else if (y == 'u') {
1723 out_token->num = duk__decode_uniesc_from_window(lex_ctx, 2);
1724 advtok = DUK__ADVTOK(6, DUK_RETOK_ATOM_CHAR);
1725 } else if (y == 'd') {
1726 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT);
1727 } else if (y == 'D') {
1728 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT);
1729 } else if (y == 's') {
1730 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE);
1731 } else if (y == 'S') {
1732 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE);
1733 } else if (y == 'w') {
1734 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR);
1735 } else if (y == 'W') {
1736 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR);
1737 } else if (DUK__ISDIGIT(y)) {
1738 /* E5 Section 15.10.2.11 */
1739 if (y == '0') {
1740 if (DUK__ISDIGIT(DUK__L2())) {
11fdf7f2 1741 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
1742 }
1743 out_token->num = 0x0000;
1744 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);
1745 } else {
1746 /* XXX: shared parsing? */
1747 duk_uint_fast32_t val = 0;
1748 duk_small_int_t i;
1749 for (i = 0; ; i++) {
1750 if (i >= DUK__MAX_RE_DECESC_DIGITS) {
11fdf7f2 1751 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
1752 }
1753 DUK__ADVANCECHARS(lex_ctx, 1); /* eat backslash on entry */
1754 x = DUK__L0();
1755 if (!DUK__ISDIGIT(x)) {
1756 break;
1757 }
1758 val = val * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x);
1759 }
1760 /* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */
1761 advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE);
1762 out_token->num = val;
1763 }
1764 } else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) ||
1765#if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE)
1766 y == '$' ||
1767#endif
1768 y == DUK_UNICODE_CP_ZWNJ ||
1769 y == DUK_UNICODE_CP_ZWJ) {
1770 /* IdentityEscape, with dollar added as a valid additional
1771 * non-standard escape (see test-regexp-identity-escape-dollar.js).
1772 * Careful not to match end-of-buffer (<0) here.
1773 */
1774 out_token->num = y;
1775 } else {
11fdf7f2 1776 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
1777 }
1778 break;
1779 }
1780 case '(': {
1781 /* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */
1782
1783 if (y == '?') {
1784 if (DUK__L2() == '=') {
1785 /* (?= */
1786 advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD);
1787 } else if (DUK__L2() == '!') {
1788 /* (?! */
1789 advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD);
1790 } else if (DUK__L2() == ':') {
1791 /* (?: */
1792 advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP);
11fdf7f2
TL
1793 } else {
1794 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp group");
1795 return;
7c673cae
FG
1796 }
1797 } else {
1798 /* ( */
1799 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP);
1800 }
1801 break;
1802 }
1803 case ')': {
1804 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP);
1805 break;
1806 }
1807 case '[': {
1808 /*
1809 * To avoid creating a heavy intermediate value for the list of ranges,
1810 * only the start token ('[' or '[^') is parsed here. The regexp
1811 * compiler parses the ranges itself.
1812 */
1813 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS);
1814 if (y == '^') {
1815 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED);
1816 }
1817 break;
1818 }
11fdf7f2
TL
1819#if !defined(DUK_USE_ES6_REGEXP_BRACES)
1820 case '}':
1821#endif
1822 case ']': {
7c673cae
FG
1823 /* Although these could be parsed as PatternCharacters unambiguously (here),
1824 * E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters.
1825 */
11fdf7f2 1826 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp character");
7c673cae
FG
1827 break;
1828 }
1829 case -1: {
1830 /* EOF */
1831 advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
1832 break;
1833 }
1834 default: {
1835 /* PatternCharacter, all excluded characters are matched by cases above */
1836 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
1837 out_token->num = x;
1838 break;
1839 }
1840 }
1841
1842 /*
1843 * Shared exit path
1844 */
1845
1846 DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
1847 out_token->t = advtok & 0xff;
1848}
1849
/*
 *  Special parser for character classes; calls the callback for every
 *  range parsed.  The parser itself does not return a range count.
 */
1854
1855/* XXX: this duplicates functionality in duk_regexp.c where a similar loop is
1856 * required anyway. We could use that BUT we need to update the regexp compiler
1857 * 'nranges' too. Work this out a bit more cleanly to save space.
1858 */
1859
1860/* XXX: the handling of character range detection is a bit convoluted.
1861 * Try to simplify and make smaller.
1862 */
1863
/* XXX: the logic for handling character ranges may be incorrect: it accepts
 * e.g. [\d-z], which should arguably be rejected.  SMJS accepts this too,
 * though.
 *
 * Needs a read-through and a lot of additional tests.
 */
1869
1870DUK_LOCAL
1871void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx,
1872 duk_re_range_callback gen_range,
1873 void *userdata,
11fdf7f2 1874 const duk_uint16_t *ranges,
7c673cae 1875 duk_small_int_t num) {
11fdf7f2 1876 const duk_uint16_t *ranges_end;
7c673cae
FG
1877
1878 DUK_UNREF(lex_ctx);
1879
1880 ranges_end = ranges + num;
1881 while (ranges < ranges_end) {
1882 /* mark range 'direct', bypass canonicalization (see Wiki) */
1883 gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1);
1884 ranges += 2;
1885 }
1886}
1887
1888DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) {
1889 duk_codepoint_t start = -1;
1890 duk_codepoint_t ch;
1891 duk_codepoint_t x;
1892 duk_bool_t dash = 0;
1893
1894 DUK_DD(DUK_DDPRINT("parsing regexp ranges"));
1895
1896 for (;;) {
1897 x = DUK__L0();
1898 DUK__ADVANCECHARS(lex_ctx, 1);
1899
1900 ch = -1; /* not strictly necessary, but avoids "uninitialized variable" warnings */
1901 DUK_UNREF(ch);
1902
1903 if (x < 0) {
11fdf7f2 1904 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in character class");
7c673cae 1905 } else if (x == ']') {
7c673cae
FG
1906 if (start >= 0) {
1907 gen_range(userdata, start, start, 0);
1908 }
1909 break;
1910 } else if (x == '-') {
1911 if (start >= 0 && !dash && DUK__L0() != ']') {
1912 /* '-' as a range indicator */
1913 dash = 1;
1914 continue;
1915 } else {
1916 /* '-' verbatim */
1917 ch = x;
1918 }
1919 } else if (x == '\\') {
1920 /*
1921 * The escapes are same as outside a character class, except that \b has a
1922 * different meaning, and \B and backreferences are prohibited (see E5
1923 * Section 15.10.2.19). However, it's difficult to share code because we
1924 * handle e.g. "\n" very differently: here we generate a single character
1925 * range for it.
1926 */
1927
1928 x = DUK__L0();
1929 DUK__ADVANCECHARS(lex_ctx, 1);
1930
1931 if (x == 'b') {
1932 /* Note: '\b' in char class is different than outside (assertion),
1933 * '\B' is not allowed and is caught by the duk_unicode_is_identifier_part()
1934 * check below.
1935 */
1936 ch = 0x0008;
1937 } else if (x == 'f') {
1938 ch = 0x000c;
1939 } else if (x == 'n') {
1940 ch = 0x000a;
1941 } else if (x == 't') {
1942 ch = 0x0009;
1943 } else if (x == 'r') {
1944 ch = 0x000d;
1945 } else if (x == 'v') {
1946 ch = 0x000b;
1947 } else if (x == 'c') {
1948 x = DUK__L0();
1949 DUK__ADVANCECHARS(lex_ctx, 1);
1950 if ((x >= 'a' && x <= 'z') ||
1951 (x >= 'A' && x <= 'Z')) {
1952 ch = (x % 32);
1953 } else {
11fdf7f2 1954 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
1955 return; /* never reached, but avoids warnings of
1956 * potentially unused variables.
1957 */
1958 }
1959 } else if (x == 'x') {
1960 ch = duk__decode_hexesc_from_window(lex_ctx, 0);
1961 DUK__ADVANCECHARS(lex_ctx, 2);
1962 } else if (x == 'u') {
1963 ch = duk__decode_uniesc_from_window(lex_ctx, 0);
1964 DUK__ADVANCECHARS(lex_ctx, 4);
1965 } else if (x == 'd') {
1966 duk__emit_u16_direct_ranges(lex_ctx,
1967 gen_range,
1968 userdata,
1969 duk_unicode_re_ranges_digit,
1970 sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
1971 ch = -1;
1972 } else if (x == 'D') {
1973 duk__emit_u16_direct_ranges(lex_ctx,
1974 gen_range,
1975 userdata,
1976 duk_unicode_re_ranges_not_digit,
1977 sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t));
1978 ch = -1;
1979 } else if (x == 's') {
1980 duk__emit_u16_direct_ranges(lex_ctx,
1981 gen_range,
1982 userdata,
1983 duk_unicode_re_ranges_white,
1984 sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
1985 ch = -1;
1986 } else if (x == 'S') {
1987 duk__emit_u16_direct_ranges(lex_ctx,
1988 gen_range,
1989 userdata,
1990 duk_unicode_re_ranges_not_white,
1991 sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t));
1992 ch = -1;
1993 } else if (x == 'w') {
1994 duk__emit_u16_direct_ranges(lex_ctx,
1995 gen_range,
1996 userdata,
1997 duk_unicode_re_ranges_wordchar,
1998 sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
1999 ch = -1;
2000 } else if (x == 'W') {
2001 duk__emit_u16_direct_ranges(lex_ctx,
2002 gen_range,
2003 userdata,
2004 duk_unicode_re_ranges_not_wordchar,
2005 sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t));
2006 ch = -1;
2007 } else if (DUK__ISDIGIT(x)) {
2008 /* DecimalEscape, only \0 is allowed, no leading zeroes are allowed */
2009 if (x == '0' && !DUK__ISDIGIT(DUK__L0())) {
2010 ch = 0x0000;
2011 } else {
11fdf7f2 2012 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
2013 }
2014 } else if (!duk_unicode_is_identifier_part(x)
2015#if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE)
2016 || x == '$'
2017#endif
2018 ) {
2019 /* IdentityEscape */
2020 ch = x;
2021 } else {
11fdf7f2 2022 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
7c673cae
FG
2023 }
2024 } else {
2025 /* character represents itself */
2026 ch = x;
2027 }
2028
2029 /* ch is a literal character here or -1 if parsed entity was
2030 * an escape such as "\s".
2031 */
2032
2033 if (ch < 0) {
2034 /* multi-character sets not allowed as part of ranges, see
2035 * E5 Section 15.10.2.15, abstract operation CharacterRange.
2036 */
2037 if (start >= 0) {
2038 if (dash) {
11fdf7f2 2039 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range");
7c673cae
FG
2040 } else {
2041 gen_range(userdata, start, start, 0);
2042 start = -1;
2043 /* dash is already 0 */
2044 }
2045 }
2046 } else {
2047 if (start >= 0) {
2048 if (dash) {
2049 if (start > ch) {
11fdf7f2 2050 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range");
7c673cae
FG
2051 }
2052 gen_range(userdata, start, ch, 0);
2053 start = -1;
2054 dash = 0;
2055 } else {
2056 gen_range(userdata, start, start, 0);
2057 start = ch;
2058 /* dash is already 0 */
2059 }
2060 } else {
2061 start = ch;
2062 }
2063 }
2064 }
2065
2066 return;
2067}
2068
2069#endif /* DUK_USE_REGEXP_SUPPORT */