ceph/src/civetweb/src/third_party/duktape-1.8.0/src-separate/duk_lexer.c

   1 /*
   2  *  Lexer for source files, ToNumber() string conversions, RegExp expressions,
   3  *  and JSON.
   4  *
    5  *  Provides a stream of Ecmascript tokens from a UTF-8/CESU-8 buffer.  The
   6  *  caller can also rewind the token stream into a certain position which is
   7  *  needed by the compiler part for multi-pass scanning.  Tokens are
   8  *  represented as duk_token structures, and contain line number information.
   9  *  Token types are identified with DUK_TOK_* defines.
  10  *
  11  *  Characters are decoded into a fixed size lookup window consisting of
  12  *  decoded Unicode code points, with window positions past the end of the
  13  *  input filled with an invalid codepoint (-1).  The tokenizer can thus
  14  *  perform multiple character lookups efficiently and with few sanity
  15  *  checks (such as access outside the end of the input), which keeps the
  16  *  tokenization code small at the cost of performance.
  17  *
  18  *  Character data in tokens, such as identifier names and string literals,
  19  *  is encoded into CESU-8 format on-the-fly while parsing the token in
  20  *  question.  The string data is made reachable to garbage collection by
  21  *  placing the token-related values in value stack entries allocated for
  22  *  this purpose by the caller.  The characters exist in Unicode code point
  23  *  form only in the fixed size lookup window, which keeps character data
  24  *  expansion (of especially ASCII data) low.
  25  *
  26  *  Token parsing supports the full range of Unicode characters as described
  27  *  in the E5 specification.  Parsing has been optimized for ASCII characters
  28  *  because ordinary Ecmascript code consists almost entirely of ASCII
  29  *  characters.  Matching of complex Unicode codepoint sets (such as in the
  30  *  IdentifierStart and IdentifierPart productions) is optimized for size,
  31  *  and is done using a linear scan of a bit-packed list of ranges.  This is
  32  *  very slow, but should never be entered unless the source code actually
  33  *  contains Unicode characters.
  34  *
  35  *  Ecmascript tokenization is partially context sensitive.  First,
  36  *  additional future reserved words are recognized in strict mode (see E5
  37  *  Section 7.6.1.2).  Second, a forward slash character ('/') can be
  38  *  recognized either as starting a RegExp literal or as a division operator,
  39  *  depending on context.  The caller must provide necessary context flags
  40  *  when requesting a new token.
  41  *
  42  *  Future work:
  43  *
  44  *    * Make line number tracking optional, as it consumes space.
  45  *
  46  *    * Add a feature flag for disabling UTF-8 decoding of input, as most
  47  *      source code is ASCII.  Because of Unicode escapes written in ASCII,
  48  *      this does not allow Unicode support to be removed from e.g.
  49  *      duk_unicode_is_identifier_start() nor does it allow removal of CESU-8
  50  *      encoding of e.g. string literals.
  51  *
  52  *    * Add a feature flag for disabling Unicode compliance of e.g. identifier
  53  *      names.  This allows for a build more than a kilobyte smaller, because
  54  *      Unicode ranges needed by duk_unicode_is_identifier_start() and
  55  *      duk_unicode_is_identifier_part() can be dropped.  String literals
  56  *      should still be allowed to contain escaped Unicode, so this still does
  57  *      not allow removal of CESU-8 encoding of e.g. string literals.
  58  *
  59  *    * Character lookup tables for codepoints above BMP could be stripped.
  60  *
  61  *    * Strictly speaking, E5 specification requires that source code consists
  62  *      of 16-bit code units, and if not, must be conceptually converted to
  63  *      that format first.  The current lexer processes Unicode code points
  64  *      and allows characters outside the BMP.  These should be converted to
  65  *      surrogate pairs while reading the source characters into the window,
  66  *      not after tokens have been formed (as is done now).  However, the fix
  67  *      is not trivial because two characters are decoded from one codepoint.
  68  *
  69  *    * Optimize for speed as well as size.  Large if-else ladders are (at
  70  *      least potentially) slow.
  71  */
  72
  73 #include "duk_internal.h"
  74
  75 /*
  76  *  Various defines and file specific helper macros
  77  */
  78
  79 #define DUK__MAX_RE_DECESC_DIGITS     9
  80 #define DUK__MAX_RE_QUANT_DIGITS      9   /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */
  81
  82 /* whether to use macros or helper function depends on call count */
  83 #define DUK__ISDIGIT(x)          ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_9)
  84 #define DUK__ISHEXDIGIT(x)       duk__is_hex_digit((x))
  85 #define DUK__ISOCTDIGIT(x)       ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_7)
  86 #define DUK__ISDIGIT03(x)        ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_3)
  87 #define DUK__ISDIGIT47(x)        ((x) >= DUK_ASC_4 && (x) <= DUK_ASC_7)
  88
  89 /* lexer character window helpers */
  90 #define DUK__LOOKUP(lex_ctx,index)        ((lex_ctx)->window[(index)].codepoint)
  91 #define DUK__ADVANCECHARS(lex_ctx,count)  duk__advance_bytes((lex_ctx), (count) * sizeof(duk_lexer_codepoint))
  92 #define DUK__ADVANCEBYTES(lex_ctx,count)  duk__advance_bytes((lex_ctx), (count))
  93 #define DUK__INITBUFFER(lex_ctx)          duk__initbuffer((lex_ctx))
  94 #define DUK__APPENDBUFFER(lex_ctx,x)      duk__appendbuffer((lex_ctx), (duk_codepoint_t) (x))
  95
  96 /* lookup shorthands (note: assume context variable is named 'lex_ctx') */
  97 #define DUK__L0()  DUK__LOOKUP(lex_ctx, 0)
  98 #define DUK__L1()  DUK__LOOKUP(lex_ctx, 1)
  99 #define DUK__L2()  DUK__LOOKUP(lex_ctx, 2)
 100 #define DUK__L3()  DUK__LOOKUP(lex_ctx, 3)
 101 #define DUK__L4()  DUK__LOOKUP(lex_ctx, 4)
 102 #define DUK__L5()  DUK__LOOKUP(lex_ctx, 5)
 103
 104 /* packed advance/token number macro used by multiple functions */
 105 #define DUK__ADVTOK(advbytes,tok)  ((((advbytes) * sizeof(duk_lexer_codepoint)) << 8) + (tok))
 106
 107 /*
 108  *  Advance lookup window by N characters, filling in new characters as
 109  *  necessary.  After returning caller is guaranteed a character window of
 110  *  at least DUK_LEXER_WINDOW_SIZE characters.
 111  *
 112  *  The main function duk__advance_bytes() is called at least once per every
 113  *  token so it has a major lexer/compiler performance impact.  There are two
 114  *  variants for the main duk__advance_bytes() algorithm: a sliding window
 115  *  approach which is slightly faster at the cost of larger code footprint,
 116  *  and a simple copying one.
 117  *
 118  *  Decoding directly from the source string would be another lexing option.
 119  *  But the lookup window based approach has the advantage of hiding the
 120  *  source string and its encoding effectively which gives more flexibility
 121  *  going forward to e.g. support chunked streaming of source from flash.
 122  *
 123  *  Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
 124  *  U+10FFFF, causing an error if the input is unparseable.  Leniency means:
 125  *
 126  *    * Unicode code point validation is intentionally not performed,
 127  *      except to check that the codepoint does not exceed 0x10ffff.
 128  *
 129  *    * In particular, surrogate pairs are allowed and not combined, which
 130  *      allows source files to represent all SourceCharacters with CESU-8.
 131  *      Broken surrogate pairs are allowed, as Ecmascript does not mandate
 132  *      their validation.
 133  *
 134  *    * Allow non-shortest UTF-8 encodings.
 135  *
 136  *  Leniency here causes few security concerns because all character data is
 137  *  decoded into Unicode codepoints before lexer processing, and is then
 138  *  re-encoded into CESU-8.  The source can be parsed as strict UTF-8 with
 139  *  a compiler option.  However, Ecmascript source characters include -all-
 140  *  16-bit unsigned integer codepoints, so leniency seems to be appropriate.
 141  *
 142  *  Note that codepoints above the BMP are not strictly SourceCharacters,
 143  *  but the lexer still accepts them as such.  Before ending up in a string
 144  *  or an identifier name, codepoints above BMP are converted into surrogate
 145  *  pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
 146  *  expected by Ecmascript.
 147  *
 148  *  An alternative approach to dealing with invalid or partial sequences
 149  *  would be to skip them and replace them with e.g. the Unicode replacement
 150  *  character U+FFFD.  This has limited utility because a replacement character
 151  *  will most likely cause a parse error, unless it occurs inside a string.
 152  *  Further, Ecmascript source is typically pure ASCII.
 153  *
 154  *  See:
 155  *
 156  *     http://en.wikipedia.org/wiki/UTF-8
 157  *     http://en.wikipedia.org/wiki/CESU-8
 158  *     http://tools.ietf.org/html/rfc3629
 159  *     http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
 160  *
 161  *  Future work:
 162  *
 163  *    * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
 164  *      in strict UTF-8 mode.
 165  *
 166  *    * Size optimize.  An attempt to use a 16-byte lookup table for the first
 167  *      byte resulted in a code increase though.
 168  *
 169  *    * Is checking against maximum 0x10ffff really useful?  4-byte encoding
 170  *      imposes a certain limit anyway.
 171  *
 172  *    * Support chunked streaming of source code.  Can be implemented either
 173  *      by streaming chunks of bytes or chunks of codepoints.
 174  */
 175
 176 #if defined(DUK_USE_LEXER_SLIDING_WINDOW)
 177 DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
 178         duk_lexer_codepoint *cp, *cp_end;
 179         duk_ucodepoint_t x;
 180         duk_small_uint_t contlen;
 181         const duk_uint8_t *p, *p_end;
 182 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 183         duk_ucodepoint_t mincp;
 184 #endif
 185         duk_int_t input_line;
 186
 187         /* Use temporaries and update lex_ctx only when finished. */
 188         input_line = lex_ctx->input_line;
 189         p = lex_ctx->input + lex_ctx->input_offset;
 190         p_end = lex_ctx->input + lex_ctx->input_length;
 191
 192         cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
 193         cp_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;
 194
 195         for (; cp != cp_end; cp++) {
 196                 cp->offset = (duk_size_t) (p - lex_ctx->input);
 197                 cp->line = input_line;
 198
 199                 /* XXX: potential issue with signed pointers, p_end < p. */
 200                 if (DUK_UNLIKELY(p >= p_end)) {
 201                         /* If input_offset were assigned a negative value, it would
 202                          * result in a large positive value.  Most likely it would be
 203                          * larger than input_length and be caught here.  In any case
 204                          * no memory unsafe behavior would happen.
 205                          */
 206                         cp->codepoint = -1;
 207                         continue;
 208                 }
 209
 210                 x = (duk_ucodepoint_t) (*p++);
 211
 212                 /* Fast path. */
 213
 214                 if (DUK_LIKELY(x < 0x80UL)) {
 215                         DUK_ASSERT(x != 0x2028UL && x != 0x2029UL);  /* not LS/PS */
 216                         if (DUK_UNLIKELY(x <= 0x000dUL)) {
 217                                 if ((x == 0x000aUL) ||
 218                                     ((x == 0x000dUL) && (p >= p_end || *p != 0x000aUL))) {
 219                                         /* lookup for 0x000a above assumes shortest encoding now */
 220
 221                                         /* E5 Section 7.3, treat the following as newlines:
 222                                          *   LF
 223                                          *   CR [not followed by LF]
 224                                          *   LS
 225                                          *   PS
 226                                          *
 227                                          * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
 228                                          * the line number.
 229                                          */
 230                                         input_line++;
 231                                 }
 232                         }
 233
 234                         cp->codepoint = (duk_codepoint_t) x;
 235                         continue;
 236                 }
 237
 238                 /* Slow path. */
 239
 240                 if (x < 0xc0UL) {
 241                         /* 10xx xxxx -> invalid */
 242                         goto error_encoding;
 243                 } else if (x < 0xe0UL) {
 244                         /* 110x xxxx   10xx xxxx  */
 245                         contlen = 1;
 246 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 247                         mincp = 0x80UL;
 248 #endif
 249                         x = x & 0x1fUL;
 250                 } else if (x < 0xf0UL) {
 251                         /* 1110 xxxx   10xx xxxx   10xx xxxx */
 252                         contlen = 2;
 253 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 254                         mincp = 0x800UL;
 255 #endif
 256                         x = x & 0x0fUL;
 257                 } else if (x < 0xf8UL) {
 258                         /* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx */
 259                         contlen = 3;
 260 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 261                         mincp = 0x10000UL;
 262 #endif
 263                         x = x & 0x07UL;
 264                 } else {
 265                         /* no point in supporting encodings of 5 or more bytes */
 266                         goto error_encoding;
 267                 }
 268
 269                 DUK_ASSERT(p_end >= p);
 270                 if ((duk_size_t) contlen > (duk_size_t) (p_end - p)) {
 271                         goto error_clipped;
 272                 }
 273
 274                 while (contlen > 0) {
 275                         duk_small_uint_t y;
 276                         y = *p++;
 277                         if ((y & 0xc0U) != 0x80U) {
 278                                 /* check that byte has the form 10xx xxxx */
 279                                 goto error_encoding;
 280                         }
 281                         x = x << 6;
 282                         x += y & 0x3fUL;
 283                         contlen--;
 284                 }
 285
 286                 /* check final character validity */
 287
 288                 if (x > 0x10ffffUL) {
 289                         goto error_encoding;
 290                 }
 291 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 292                 if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
 293                         goto error_encoding;
 294                 }
 295 #endif
 296
 297                 DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
 298                 if ((x == 0x2028UL) || (x == 0x2029UL)) {
 299                         input_line++;
 300                 }
 301
 302                 cp->codepoint = (duk_codepoint_t) x;
 303         }
 304
 305         lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
 306         lex_ctx->input_line = input_line;
 307         return;
 308
 309  error_clipped:   /* clipped codepoint */
 310  error_encoding:  /* invalid codepoint encoding or codepoint */
 311         lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
 312         lex_ctx->input_line = input_line;
 313
 314         DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed");
 315 }
 316
 317 DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
 318         duk_small_uint_t used_bytes, avail_bytes;
 319
 320         DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
 321         DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
 322         DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer);
 323         DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE);
 324         DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint));
 325
 326         /* Zero 'count' is also allowed to make call sites easier.
 327          * Arithmetic in bytes generates better code in GCC.
 328          */
 329
 330         lex_ctx->window = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + count_bytes);  /* avoid multiply */
 331         used_bytes = (duk_small_uint_t) ((duk_uint8_t *) lex_ctx->window - (duk_uint8_t *) lex_ctx->buffer);
 332         avail_bytes = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - used_bytes;
 333         if (avail_bytes < (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) {
 334                 /* Not enough data to provide a full window, so "scroll" window to
 335                  * start of buffer and fill up the rest.
 336                  */
 337                 DUK_MEMMOVE((void *) lex_ctx->buffer,
 338                             (const void *) lex_ctx->window,
 339                             (size_t) avail_bytes);
 340                 lex_ctx->window = lex_ctx->buffer;
 341                 duk__fill_lexer_buffer(lex_ctx, avail_bytes);
 342         }
 343 }
 344
 345 DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
 346         lex_ctx->window = lex_ctx->buffer;
 347         duk__fill_lexer_buffer(lex_ctx, 0);
 348 }
 349 #else  /* DUK_USE_LEXER_SLIDING_WINDOW */
 350 DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) {
 351         duk_ucodepoint_t x;
 352         duk_small_uint_t len;
 353         duk_small_uint_t i;
 354         const duk_uint8_t *p;
 355 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 356         duk_ucodepoint_t mincp;
 357 #endif
 358         duk_size_t input_offset;
 359
 360         input_offset = lex_ctx->input_offset;
 361         if (DUK_UNLIKELY(input_offset >= lex_ctx->input_length)) {
 362                 /* If input_offset were assigned a negative value, it would
 363                  * result in a large positive value.  Most likely it would be
 364                  * larger than input_length and be caught here.  In any case
 365                  * no memory unsafe behavior would happen.
 366                  */
 367                 return -1;
 368         }
 369
 370         p = lex_ctx->input + input_offset;
 371         x = (duk_ucodepoint_t) (*p);
 372
 373         if (DUK_LIKELY(x < 0x80UL)) {
 374                 /* 0xxx xxxx -> fast path */
 375
 376                 /* input offset tracking */
 377                 lex_ctx->input_offset++;
 378
 379                 DUK_ASSERT(x != 0x2028UL && x != 0x2029UL);  /* not LS/PS */
 380                 if (DUK_UNLIKELY(x <= 0x000dUL)) {
 381                         if ((x == 0x000aUL) ||
 382                             ((x == 0x000dUL) && (lex_ctx->input_offset >= lex_ctx->input_length ||
 383                                                  lex_ctx->input[lex_ctx->input_offset] != 0x000aUL))) {
 384                                 /* lookup for 0x000a above assumes shortest encoding now */
 385
 386                                 /* E5 Section 7.3, treat the following as newlines:
 387                                  *   LF
 388                                  *   CR [not followed by LF]
 389                                  *   LS
 390                                  *   PS
 391                                  *
 392                                  * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
 393                                  * the line number.
 394                                  */
 395                                 lex_ctx->input_line++;
 396                         }
 397                 }
 398
 399                 return (duk_codepoint_t) x;
 400         }
 401
 402         /* Slow path. */
 403
 404         if (x < 0xc0UL) {
 405                 /* 10xx xxxx -> invalid */
 406                 goto error_encoding;
 407         } else if (x < 0xe0UL) {
 408                 /* 110x xxxx   10xx xxxx  */
 409                 len = 2;
 410 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 411                 mincp = 0x80UL;
 412 #endif
 413                 x = x & 0x1fUL;
 414         } else if (x < 0xf0UL) {
 415                 /* 1110 xxxx   10xx xxxx   10xx xxxx */
 416                 len = 3;
 417 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 418                 mincp = 0x800UL;
 419 #endif
 420                 x = x & 0x0fUL;
 421         } else if (x < 0xf8UL) {
 422                 /* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx */
 423                 len = 4;
 424 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 425                 mincp = 0x10000UL;
 426 #endif
 427                 x = x & 0x07UL;
 428         } else {
 429                 /* no point in supporting encodings of 5 or more bytes */
 430                 goto error_encoding;
 431         }
 432
 433         DUK_ASSERT(lex_ctx->input_length >= lex_ctx->input_offset);
 434         if ((duk_size_t) len > (duk_size_t) (lex_ctx->input_length - lex_ctx->input_offset)) {
 435                 goto error_clipped;
 436         }
 437
 438         p++;
 439         for (i = 1; i < len; i++) {
 440                 duk_small_uint_t y;
 441                 y = *p++;
 442                 if ((y & 0xc0U) != 0x80U) {
 443                         /* check that byte has the form 10xx xxxx */
 444                         goto error_encoding;
 445                 }
 446                 x = x << 6;
 447                 x += y & 0x3fUL;
 448         }
 449
 450         /* check final character validity */
 451
 452         if (x > 0x10ffffUL) {
 453                 goto error_encoding;
 454         }
 455 #if defined(DUK_USE_STRICT_UTF8_SOURCE)
 456         if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
 457                 goto error_encoding;
 458         }
 459 #endif
 460
 461         /* input offset tracking */
 462         lex_ctx->input_offset += len;
 463
 464         /* line tracking */
 465         DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
 466         if ((x == 0x2028UL) || (x == 0x2029UL)) {
 467                 lex_ctx->input_line++;
 468         }
 469
 470         return (duk_codepoint_t) x;
 471
 472  error_clipped:   /* clipped codepoint */
 473  error_encoding:  /* invalid codepoint encoding or codepoint */
 474         DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed");
 475         return 0;
 476 }
 477
 478 DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
 479         duk_small_uint_t keep_bytes;
 480         duk_lexer_codepoint *cp, *cp_end;
 481
 482         DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
 483         DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
 484
 485         /* Zero 'count' is also allowed to make call sites easier. */
 486
 487         keep_bytes = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes;
 488         DUK_MEMMOVE((void *) lex_ctx->window,
 489                     (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes),
 490                     (size_t) keep_bytes);
 491
 492         cp = (duk_lexer_codepoint *) ((duk_uint8_t *) lex_ctx->window + keep_bytes);
 493         cp_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE;
 494         for (; cp != cp_end; cp++) {
 495                 cp->offset = lex_ctx->input_offset;
 496                 cp->line = lex_ctx->input_line;
 497                 cp->codepoint = duk__read_char(lex_ctx);
 498         }
 499 }
 500
 501 DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
 502         /* Call with count == DUK_LEXER_WINDOW_SIZE to fill buffer initially. */
 503         duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint));  /* fill window */
 504 }
 505 #endif  /* DUK_USE_LEXER_SLIDING_WINDOW */
 506
 507 /*
 508  *  (Re)initialize the temporary byte buffer.  May be called extra times
 509  *  with little impact.
 510  */
 511
 512 DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) {
 513         /* Reuse buffer as is unless buffer has grown large. */
 514         if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) {
 515                 /* Keep current size */
 516         } else {
 517                 duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT);
 518         }
 519
 520         DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf);
 521 }
 522
 523 /*
 524  *  Append a Unicode codepoint to the temporary byte buffer.  Performs
 525  *  CESU-8 surrogate pair encoding for codepoints above the BMP.
 526  *  Existing surrogate pairs are allowed and also encoded into CESU-8.
 527  */
 528
 529 DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
 530         /*
 531          *  Since character data is only generated by decoding the source or by
 532          *  the compiler itself, we rely on the input codepoints being correct
 533          *  and avoid a check here.
 534          *
 535          *  Character data can also come here through decoding of Unicode
 536          *  escapes ("\udead\ubeef") so all 16-but unsigned values can be
 537          *  present, even when the source file itself is strict UTF-8.
 538          */
 539
 540         DUK_ASSERT(x >= 0 && x <= 0x10ffff);
 541
 542         DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x);
 543 }
 544
 545 /*
 546  *  Intern the temporary byte buffer into a valstack slot
 547  *  (in practice, slot1 or slot2).
 548  */
 549
 550 DUK_LOCAL void duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) {
 551         duk_context *ctx = (duk_context *) lex_ctx->thr;
 552
 553         DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx);
 554
 555         DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw);
 556         duk_replace(ctx, valstack_idx);
 557 }
 558
 559 /*
 560  *  Init lexer context
 561  */
 562
 563 DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) {
 564         DUK_ASSERT(lex_ctx != NULL);
 565
 566         DUK_MEMZERO(lex_ctx, sizeof(*lex_ctx));
 567 #if defined(DUK_USE_EXPLICIT_NULL_INIT)
 568 #if defined(DUK_USE_LEXER_SLIDING_WINDOW)
 569         lex_ctx->window = NULL;
 570 #endif
 571         lex_ctx->thr = NULL;
 572         lex_ctx->input = NULL;
 573         lex_ctx->buf = NULL;
 574 #endif
 575 }
 576
 577 /*
 578  *  Set lexer input position and reinitialize lookup window.
 579  */
 580
 581 /* NB: duk_lexer_getpoint() is a macro only */
 582
 583 DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
 584         DUK_ASSERT_DISABLE(pt->offset >= 0);  /* unsigned */
 585         DUK_ASSERT(pt->line >= 1);
 586         lex_ctx->input_offset = pt->offset;
 587         lex_ctx->input_line = pt->line;
 588         duk__init_lexer_window(lex_ctx);
 589 }
 590
 591 /*
 592  *  Lexing helpers
 593  */
 594
 595 /* numeric value of a hex digit (also covers octal and decimal digits) */
 596 DUK_LOCAL duk_codepoint_t duk__hexval(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
 597         duk_small_int_t t;
 598
 599         /* Here 'x' is a Unicode codepoint */
 600         if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
 601                 t = duk_hex_dectab[x];
 602                 if (DUK_LIKELY(t >= 0)) {
 603                         return t;
 604                 }
 605         }
 606
 607         /* Throwing an error this deep makes the error rather vague, but
 608          * saves hundreds of bytes of code.
 609          */
 610         DUK_ERROR_SYNTAX(lex_ctx->thr, "decode error");
 611         return 0;
 612 }
 613
 614 /* having this as a separate function provided a size benefit */
 615 DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) {
 616         if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
 617                 return (duk_hex_dectab[x] >= 0);
 618         }
 619         return 0;
 620 }
 621
 622 DUK_LOCAL duk_codepoint_t duk__decode_hexesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) {
 623         /* validation performed by duk__hexval */
 624         return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 4) |
 625                (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint));
 626 }
 627
 628 DUK_LOCAL duk_codepoint_t duk__decode_uniesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) {
 629         /* validation performed by duk__hexval */
 630         return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 12) |
 631                (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint) << 8) |
 632                (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 2].codepoint) << 4) |
 633                (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 3].codepoint));
 634 }
 635
 636 /*
 637  *  Parse Ecmascript source InputElementDiv or InputElementRegExp
 638  *  (E5 Section 7), skipping whitespace, comments, and line terminators.
 639  *
 640  *  Possible results are:
 641  *    (1) a token
 642  *    (2) a line terminator (skipped)
 643  *    (3) a comment (skipped)
 644  *    (4) EOF
 645  *
 646  *  White space is automatically skipped from the current position (but
 647  *  not after the input element).  If input has already ended, returns
  648  *  DUK_TOK_EOF indefinitely.  If a parse error occurs, uses a DUK_ERROR()
 649  *  macro call (and hence a longjmp through current heap longjmp context).
 650  *  Comments and line terminator tokens are automatically skipped.
 651  *
 652  *  The input element being matched is determined by regexp_mode; if set,
  653  *  parses an InputElementRegExp, otherwise an InputElementDiv.  The
 654  *  difference between these are handling of productions starting with a
 655  *  forward slash.
 656  *
 657  *  If strict_mode is set, recognizes additional future reserved words
 658  *  specific to strict mode, and refuses to parse octal literals.
 659  *
 660  *  The matching strategy below is to (currently) use a six character
 661  *  lookup window to quickly determine which production is the -longest-
 662  *  matching one, and then parse that.  The top-level if-else clauses
 663  *  match the first character, and the code blocks for each clause
 664  *  handle -all- alternatives for that first character.  Ecmascript
 665  *  specification uses the "longest match wins" semantics, so the order
 666  *  of the if-clauses matters.
 667  *
 668  *  Misc notes:
 669  *
 670  *    * Ecmascript numeric literals do not accept a sign character.
 671  *      Consequently e.g. "-1.0" is parsed as two tokens: a negative
 672  *      sign and a positive numeric literal.  The compiler performs
 673  *      the negation during compilation, so this has no adverse impact.
 674  *
 675  *    * There is no token for "undefined": it is just a value available
 676  *      from the global object (or simply established by doing a reference
 677  *      to an undefined value).
 678  *
 679  *    * Some contexts want Identifier tokens, which are IdentifierNames
 680  *      excluding reserved words, while some contexts want IdentifierNames
 681  *      directly.  In the latter case e.g. "while" is interpreted as an
 682  *      identifier name, not a DUK_TOK_WHILE token.  The solution here is
 683  *      to provide both token types: DUK_TOK_WHILE goes to 't' while
 684  *      DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains
 685  *      the identifier / keyword name.
 686  *
 687  *    * Directive prologue needs to identify string literals such as
 688  *      "use strict" and 'use strict', which are sensitive to line
 689  *      continuations and escape sequences.  For instance, "use\u0020strict"
 690  *      is a valid directive but is distinct from "use strict".  The solution
 691  *      here is to decode escapes while tokenizing, but to keep track of the
 692  *      number of escapes.  Directive detection can then check that the
 693  *      number of escapes is zero.
 694  *
 695  *    * Multi-line comments with one or more internal LineTerminator are
 696  *      treated like a line terminator to comply with automatic semicolon
 697  *      insertion.
 698  */
 699
 700 DUK_INTERNAL
 701 void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
 702                                       duk_token *out_token,
 703                                       duk_bool_t strict_mode,
 704                                       duk_bool_t regexp_mode) {
 705         duk_codepoint_t x;           /* temporary, must be signed and 32-bit to hold Unicode code points */
 706         duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end,
 707                                       * init is unnecessary but suppresses "may be used uninitialized" warnings.
 708                                       */
 709         duk_bool_t got_lineterm = 0;  /* got lineterm preceding non-whitespace, non-lineterm token */
 710
 711         if (++lex_ctx->token_count >= lex_ctx->token_limit) {
 712                 DUK_ERROR_RANGE(lex_ctx->thr, "token limit");
 713                 return;  /* unreachable */
 714         }
 715
 716         out_token->t = DUK_TOK_EOF;
 717         out_token->t_nores = -1;  /* marker: copy t if not changed */
 718 #if 0  /* not necessary to init, disabled for faster parsing */
 719         out_token->num = DUK_DOUBLE_NAN;
 720         out_token->str1 = NULL;
 721         out_token->str2 = NULL;
 722 #endif
 723         out_token->num_escapes = 0;
 724         /* out_token->lineterm set by caller */
 725
 726         /* This would be nice, but parsing is faster without resetting the
 727          * value slots.  The only side effect is that references to temporary
 728          * string values may linger until lexing is finished; they're then
 729          * freed normally.
 730          */
 731 #if 0
 732         duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
 733         duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx);
 734 #endif
 735
 736         /* 'advtok' indicates how much to advance and which token id to assign
 737          * at the end.  This shared functionality minimizes code size.  All
 738          * code paths are required to set 'advtok'; the zero initializer at the
 739          * declaration only silences compiler warnings.  Code paths calling
 740          * DUK_ERROR() never return so they don't need to set advtok.
 741          */
 742
 743         /*
 744          *  Matching order:
 745          *
 746          *    Punctuator first chars, also covers comments, regexps
 747          *    LineTerminator
 748          *    Identifier or reserved word, also covers null/true/false literals
 749          *    NumericLiteral
 750          *    StringLiteral
 751          *    EOF
 752          *
 753          *  The order does not matter as long as the longest match is
 754          *  always correctly identified.  There are order dependencies
 755          *  in the clauses, so it's not trivial to convert to a switch.
 756          */
 757
 758  restart_lineupdate:
 759         out_token->start_line = lex_ctx->window[0].line;
 760
 761  restart:
 762         out_token->start_offset = lex_ctx->window[0].offset;
 763
 764         x = DUK__L0();
 765
 766         switch (x) {
 767         case DUK_ASC_SPACE:
 768         case DUK_ASC_HT:  /* fast paths for space and tab */
 769                 DUK__ADVANCECHARS(lex_ctx, 1);
 770                 goto restart;
 771         case DUK_ASC_LF:  /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */
 772                 DUK__ADVANCECHARS(lex_ctx, 1);
 773                 got_lineterm = 1;
 774                 goto restart_lineupdate;
 775         case DUK_ASC_SLASH:  /* '/' */
 776                 if (DUK__L1() == '/') {
 777                         /*
 778                          *  E5 Section 7.4, allow SourceCharacter (which is any 16-bit
 779                          *  code point).
 780                          */
 781
 782                         /* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but it is unnecessary */
 783                         for (;;) {
 784                                 x = DUK__L0();
 785                                 if (x < 0 || duk_unicode_is_line_terminator(x)) {
 786                                         break;
 787                                 }
 788                                 DUK__ADVANCECHARS(lex_ctx, 1);
 789                         }
 790                         goto restart;  /* line terminator will be handled on next round */
 791                 } else if (DUK__L1() == '*') {
 792                         /*
 793                          *  E5 Section 7.4.  If the multi-line comment contains a newline,
 794                          *  it is treated like a single line terminator for automatic
 795                          *  semicolon insertion.
 796                          */
 797
 798                         duk_bool_t last_asterisk = 0;
 799                         DUK__ADVANCECHARS(lex_ctx, 2);
 800                         for (;;) {
 801                                 x = DUK__L0();
 802                                 if (x < 0) {
 803                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in multiline comment");
 804                                 }
 805                                 DUK__ADVANCECHARS(lex_ctx, 1);
 806                                 if (last_asterisk && x == '/') {
 807                                         break;
 808                                 }
 809                                 if (duk_unicode_is_line_terminator(x)) {
 810                                         got_lineterm = 1;
 811                                 }
 812                                 last_asterisk = (x == '*');
 813                         }
 814                         goto restart_lineupdate;
 815                 } else if (regexp_mode) {
 816 #if defined(DUK_USE_REGEXP_SUPPORT)
 817                         /*
 818                          *  "/" followed by something in regexp mode.  See E5 Section 7.8.5.
 819                          *
 820                          *  RegExp parsing is a bit complex.  First, the regexp body is delimited
 821                          *  by forward slashes, but the body may also contain forward slashes as
 822                          *  part of an escape sequence or inside a character class (delimited by
 823                          *  square brackets).  A mini state machine is used to implement these.
 824                          *
 825                          *  Further, an early (parse time) error must be thrown if the regexp
 826                          *  would cause a run-time error when used in the expression new RegExp(...).
 827                          *  Parsing here simply extracts the (candidate) regexp, and also accepts
 828                          *  invalid regular expressions (which are delimited properly).  The caller
 829                          *  (compiler) must perform final validation and regexp compilation.
 830                          *
 831                          *  RegExp first char may not be '/' (single line comment) or '*' (multi-
 832                          *  line comment).  These have already been checked above, so there is no
 833                          *  need below for special handling of the first regexp character as in
 834                          *  the E5 productions.
 835                          *
 836                          *  About unicode escapes within regexp literals:
 837                          *
 838                          *      E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes.
 839                          *      However, Section 6 states that regexps accept the escapes,
 840                          *      see paragraph starting with "In string literals...".
 841                          *      The regexp grammar, which sees the decoded regexp literal
 842                          *      (after lexical parsing) DOES have a \uHHHH unicode escape.
 843                          *      So, for instance:
 844                          *
 845                          *          /\u1234/
 846                          *
 847                          *      should first be parsed by the lexical grammar as:
 848                          *
 849                          *          '\' 'u'      RegularExpressionBackslashSequence
 850                          *          '1'          RegularExpressionNonTerminator
 851                          *          '2'          RegularExpressionNonTerminator
 852                          *          '3'          RegularExpressionNonTerminator
 853                          *          '4'          RegularExpressionNonTerminator
 854                          *
 855                          *      and the escape itself is then parsed by the regexp engine.
 856                          *      This is the current implementation.
 857                          *
 858                          *  Minor spec inconsistency:
 859                          *
 860                          *      E5 Section 7.8.5 RegularExpressionBackslashSequence is:
 861                          *
 862                          *         \ RegularExpressionNonTerminator
 863                          *
 864                          *      while Section A.1 RegularExpressionBackslashSequence is:
 865                          *
 866                          *         \ NonTerminator
 867                          *
 868                          *      The latter is not normative and a typo.
 869                          *
 870                          */
 871
 872                         /* first, parse regexp body roughly */
 873
 874                         duk_small_int_t state = 0;  /* 0=base, 1=esc, 2=class, 3=class+esc */
 875
 876                         DUK__INITBUFFER(lex_ctx);
 877                         for (;;) {
 878                                 DUK__ADVANCECHARS(lex_ctx, 1);  /* skip opening slash on first loop */
 879                                 x = DUK__L0();
 880                                 if (x < 0 || duk_unicode_is_line_terminator(x)) {
 881                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in regexp");
 882                                 }
 883                                 x = DUK__L0();  /* re-read to avoid spill / fetch */
 884                                 if (state == 0) {
 885                                         if (x == '/') {
 886                                                 DUK__ADVANCECHARS(lex_ctx, 1);  /* eat closing slash */
 887                                                 break;
 888                                         } else if (x == '\\') {
 889                                                 state = 1;
 890                                         } else if (x == '[') {
 891                                                 state = 2;
 892                                         }
 893                                 } else if (state == 1) {
 894                                         state = 0;
 895                                 } else if (state == 2) {
 896                                         if (x == ']') {
 897                                                 state = 0;
 898                                         } else if (x == '\\') {
 899                                                 state = 3;
 900                                         }
 901                                 } else { /* state == 3 */
 902                                         state = 2;
 903                                 }
 904                                 DUK__APPENDBUFFER(lex_ctx, x);
 905                         }
 906                         duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
 907                         out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
 908
 909                         /* second, parse flags */
 910
 911                         DUK__INITBUFFER(lex_ctx);
 912                         for (;;) {
 913                                 x = DUK__L0();
 914                                 if (!duk_unicode_is_identifier_part(x)) {
 915                                         break;
 916                                 }
 917                                 x = DUK__L0();  /* re-read to avoid spill / fetch */
 918                                 DUK__APPENDBUFFER(lex_ctx, x);
 919                                 DUK__ADVANCECHARS(lex_ctx, 1);
 920                         }
 921                         duk__internbuffer(lex_ctx, lex_ctx->slot2_idx);
 922                         out_token->str2 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx);
 923
 924                         DUK__INITBUFFER(lex_ctx);  /* free some memory */
 925
 926                         /* validation of the regexp is caller's responsibility */
 927
 928                         advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP);
 929 #else
 930                         DUK_ERROR_SYNTAX(lex_ctx->thr, "regexp support disabled");
 931 #endif
 932                 } else if (DUK__L1() == '=') {
 933                         /* "/=" and not in regexp mode */
 934                         advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ);
 935                 } else {
 936                         /* "/" and not in regexp mode */
 937                         advtok = DUK__ADVTOK(1, DUK_TOK_DIV);
 938                 }
 939                 break;
 940         case DUK_ASC_LCURLY:  /* '{' */
 941                 advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY);
 942                 break;
 943         case DUK_ASC_RCURLY:  /* '}' */
 944                 advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY);
 945                 break;
 946         case DUK_ASC_LPAREN:  /* '(' */
 947                 advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN);
 948                 break;
 949         case DUK_ASC_RPAREN:  /* ')' */
 950                 advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN);
 951                 break;
 952         case DUK_ASC_LBRACKET:  /* '[' */
 953                 advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET);
 954                 break;
 955         case DUK_ASC_RBRACKET:  /* ']' */
 956                 advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET);
 957                 break;
 958         case DUK_ASC_PERIOD:  /* '.' */
 959                 if (DUK__ISDIGIT(DUK__L1())) {
 960                         /* Period followed by a digit can only start DecimalLiteral
 961                          * (handled in slow path).  We could jump straight into the
 962                          * DecimalLiteral handling but should avoid goto to inside
 963                          * a block.
 964                          */
 965                         goto slow_path;
 966                 }
 967                 advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD);
 968                 break;
 969         case DUK_ASC_SEMICOLON:  /* ';' */
 970                 advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON);
 971                 break;
 972         case DUK_ASC_COMMA:  /* ',' */
 973                 advtok = DUK__ADVTOK(1, DUK_TOK_COMMA);
 974                 break;
 975         case DUK_ASC_LANGLE:  /* '<' */
 976                 if (DUK__L1() == '<' && DUK__L2() == '=') {
 977                         advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ);
 978                 } else if (DUK__L1() == '=') {
 979                         advtok = DUK__ADVTOK(2, DUK_TOK_LE);
 980                 } else if (DUK__L1() == '<') {
 981                         advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT);
 982                 } else {
 983                         advtok = DUK__ADVTOK(1, DUK_TOK_LT);
 984                 }
 985                 break;
 986         case DUK_ASC_RANGLE:  /* '>' */
 987                 if (DUK__L1() == '>' && DUK__L2() == '>' && DUK__L3() == '=') {
 988                         advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ);
 989                 } else if (DUK__L1() == '>' && DUK__L2() == '>') {
 990                         advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT);
 991                 } else if (DUK__L1() == '>' && DUK__L2() == '=') {
 992                         advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ);
 993                 } else if (DUK__L1() == '=') {
 994                         advtok = DUK__ADVTOK(2, DUK_TOK_GE);
 995                 } else if (DUK__L1() == '>') {
 996                         advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT);
 997                 } else {
 998                         advtok = DUK__ADVTOK(1, DUK_TOK_GT);
 999                 }
1000                 break;
1001         case DUK_ASC_EQUALS:  /* '=' */
1002                 if (DUK__L1() == '=' && DUK__L2() == '=') {
1003                         advtok = DUK__ADVTOK(3, DUK_TOK_SEQ);
1004                 } else if (DUK__L1() == '=') {
1005                         advtok = DUK__ADVTOK(2, DUK_TOK_EQ);
1006                 } else {
1007                         advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN);
1008                 }
1009                 break;
1010         case DUK_ASC_EXCLAMATION:  /* '!' */
1011                 if (DUK__L1() == '=' && DUK__L2() == '=') {
1012                         advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ);
1013                 } else if (DUK__L1() == '=') {
1014                         advtok = DUK__ADVTOK(2, DUK_TOK_NEQ);
1015                 } else {
1016                         advtok = DUK__ADVTOK(1, DUK_TOK_LNOT);
1017                 }
1018                 break;
1019         case DUK_ASC_PLUS:  /* '+' */
1020                 if (DUK__L1() == '+') {
1021                         advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT);
1022                 } else if (DUK__L1() == '=') {
1023                         advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ);
1024                 } else {
1025                         advtok = DUK__ADVTOK(1, DUK_TOK_ADD);
1026                 }
1027                 break;
1028         case DUK_ASC_MINUS:  /* '-' */
1029                 if (DUK__L1() == '-') {
1030                         advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT);
1031                 } else if (DUK__L1() == '=') {
1032                         advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ);
1033                 } else {
1034                         advtok = DUK__ADVTOK(1, DUK_TOK_SUB);
1035                 }
1036                 break;
1037         case DUK_ASC_STAR:  /* '*' */
1038                 if (DUK__L1() == '=') {
1039                         advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ);
1040                 } else {
1041                         advtok = DUK__ADVTOK(1, DUK_TOK_MUL);
1042                 }
1043                 break;
1044         case DUK_ASC_PERCENT:  /* '%' */
1045                 if (DUK__L1() == '=') {
1046                         advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ);
1047                 } else {
1048                         advtok = DUK__ADVTOK(1, DUK_TOK_MOD);
1049                 }
1050                 break;
1051         case DUK_ASC_AMP:  /* '&' */
1052                 if (DUK__L1() == '&') {
1053                         advtok = DUK__ADVTOK(2, DUK_TOK_LAND);
1054                 } else if (DUK__L1() == '=') {
1055                         advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ);
1056                 } else {
1057                         advtok = DUK__ADVTOK(1, DUK_TOK_BAND);
1058                 }
1059                 break;
1060         case DUK_ASC_PIPE:  /* '|' */
1061                 if (DUK__L1() == '|') {
1062                         advtok = DUK__ADVTOK(2, DUK_TOK_LOR);
1063                 } else if (DUK__L1() == '=') {
1064                         advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ);
1065                 } else {
1066                         advtok = DUK__ADVTOK(1, DUK_TOK_BOR);
1067                 }
1068                 break;
1069         case DUK_ASC_CARET:  /* '^' */
1070                 if (DUK__L1() == '=') {
1071                         advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ);
1072                 } else {
1073                         advtok = DUK__ADVTOK(1, DUK_TOK_BXOR);
1074                 }
1075                 break;
1076         case DUK_ASC_TILDE:  /* '~' */
1077                 advtok = DUK__ADVTOK(1, DUK_TOK_BNOT);
1078                 break;
1079         case DUK_ASC_QUESTION:  /* '?' */
1080                 advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION);
1081                 break;
1082         case DUK_ASC_COLON:  /* ':' */
1083                 advtok = DUK__ADVTOK(1, DUK_TOK_COLON);
1084                 break;
1085         case DUK_ASC_DOUBLEQUOTE:    /* '"' */
1086         case DUK_ASC_SINGLEQUOTE: {  /* '\'' */
1087                 duk_small_int_t quote = x;  /* Note: duk_uint8_t type yields larger code */
1088                 duk_small_int_t adv;
1089
1090                 DUK__INITBUFFER(lex_ctx);
1091                 for (;;) {
1092                         DUK__ADVANCECHARS(lex_ctx, 1);  /* eat opening quote on first loop */
1093                         x = DUK__L0();
1094                         if (x < 0 || duk_unicode_is_line_terminator(x)) {
1095                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal");
1096                         }
1097                         if (x == quote) {
1098                                 DUK__ADVANCECHARS(lex_ctx, 1);  /* eat closing quote */
1099                                 break;
1100                         }
1101                         if (x == '\\') {
1102                                 /* DUK__L0        -> '\' char
1103                                  * DUK__L1 ... DUK__L5 -> more lookup
1104                                  */
1105
1106                                 x = DUK__L1();
1107
1108                                 /* How much to advance before next loop; note that next loop
1109                                  * will advance by 1 anyway, so -1 from the total escape
1110                                  * length (e.g. len('\uXXXX') - 1 = 6 - 1).  As a default,
1111                                  * 1 is good.
1112                                  */
1113                                 adv = 2 - 1;  /* note: long live range */
1114
1115                                 if (x < 0) {
1116                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal");
1117                                 }
1118                                 if (duk_unicode_is_line_terminator(x)) {
1119                                         /* line continuation */
1120                                         if (x == 0x000d && DUK__L2() == 0x000a) {
1121                                                 /* CR LF again a special case */
1122                                                 adv = 3 - 1;
1123                                         }
1124                                 } else if (x == '\'') {
1125                                         DUK__APPENDBUFFER(lex_ctx, 0x0027);
1126                                 } else if (x == '"') {
1127                                         DUK__APPENDBUFFER(lex_ctx, 0x0022);
1128                                 } else if (x == '\\') {
1129                                         DUK__APPENDBUFFER(lex_ctx, 0x005c);
1130                                 } else if (x == 'b') {
1131                                         DUK__APPENDBUFFER(lex_ctx, 0x0008);
1132                                 } else if (x == 'f') {
1133                                         DUK__APPENDBUFFER(lex_ctx, 0x000c);
1134                                 } else if (x == 'n') {
1135                                         DUK__APPENDBUFFER(lex_ctx, 0x000a);
1136                                 } else if (x == 'r') {
1137                                         DUK__APPENDBUFFER(lex_ctx, 0x000d);
1138                                 } else if (x == 't') {
1139                                         DUK__APPENDBUFFER(lex_ctx, 0x0009);
1140                                 } else if (x == 'v') {
1141                                         DUK__APPENDBUFFER(lex_ctx, 0x000b);
1142                                 } else if (x == 'x') {
1143                                         adv = 4 - 1;
1144                                         DUK__APPENDBUFFER(lex_ctx, duk__decode_hexesc_from_window(lex_ctx, 2));
1145                                 } else if (x == 'u') {
1146                                         adv = 6 - 1;
1147                                         DUK__APPENDBUFFER(lex_ctx, duk__decode_uniesc_from_window(lex_ctx, 2));
1148                                 } else if (DUK__ISDIGIT(x)) {
1149                                         duk_codepoint_t ch = 0;  /* initialized to avoid warnings of unused var */
1150
1151                                         /*
1152                                          *  Octal escape or zero escape:
1153                                          *    \0                                     (lookahead not DecimalDigit)
1154                                          *    \1 ... \7                              (lookahead not DecimalDigit)
1155                                          *    \ZeroToThree OctalDigit                (lookahead not DecimalDigit)
1156                                          *    \FourToSeven OctalDigit                (no lookahead restrictions)
1157                                          *    \ZeroToThree OctalDigit OctalDigit     (no lookahead restrictions)
1158                                          *
1159                                          *  Zero escape is part of the standard syntax.  Octal escapes are
1160                                          *  defined in E5 Section B.1.2, and are only allowed in non-strict mode.
1161                                          *  Any other productions starting with a decimal digit are invalid.
1162                                          */
1163
1164                                         if (x == '0' && !DUK__ISDIGIT(DUK__L2())) {
1165                                                 /* Zero escape (allowed in strict mode too) */
1166                                                 ch = 0;
1167                                                 /* adv = 2 - 1 default OK */
1168 #if defined(DUK_USE_OCTAL_SUPPORT)
1169                                         } else if (strict_mode) {
1170                                                 /* No other escape beginning with a digit in strict mode */
1171                                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal");
1172                                         } else if (DUK__ISDIGIT03(x) && DUK__ISOCTDIGIT(DUK__L2()) && DUK__ISOCTDIGIT(DUK__L3())) {
1173                                                 /* Three digit octal escape, digits validated. */
1174                                                 adv = 4 - 1;
1175                                                 ch = (duk__hexval(lex_ctx, x) << 6) +
1176                                                      (duk__hexval(lex_ctx, DUK__L2()) << 3) +
1177                                                      duk__hexval(lex_ctx, DUK__L3());
1178                                         } else if (((DUK__ISDIGIT03(x) && !DUK__ISDIGIT(DUK__L3())) || DUK__ISDIGIT47(x)) &&
1179                                                    DUK__ISOCTDIGIT(DUK__L2())) {
1180                                                 /* Two digit octal escape, digits validated.
1181                                                  *
1182                                                  * The if-condition is a bit tricky.  We could catch e.g.
1183                                                  * '\039' in the three-digit escape and fail it there (by
1184                                                  * validating the digits), but we want to avoid extra
1185                                                  * additional validation code.
1186                                                  */
1187                                                 adv = 3 - 1;
1188                                                 ch = (duk__hexval(lex_ctx, x) << 3) +
1189                                                      duk__hexval(lex_ctx, DUK__L2());
1190                                         } else if (DUK__ISDIGIT(x) && !DUK__ISDIGIT(DUK__L2())) {
1191                                                 /* One digit octal escape, digit validated. */
1192                                                 /* adv = 2 - 1 default OK */
1193                                                 ch = duk__hexval(lex_ctx, x);
1194 #else
1195                                         /* fall through to error */
1196 #endif
1197                                         } else {
1198                                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal");
1199                                         }
1200
1201                                         DUK__APPENDBUFFER(lex_ctx, ch);
1202                                 } else {
1203                                         /* escaped NonEscapeCharacter */
1204                                         DUK__APPENDBUFFER(lex_ctx, x);
1205                                 }
1206                                 DUK__ADVANCECHARS(lex_ctx, adv);
1207
1208                                 /* Track number of escapes; count not really needed but directive
1209                                  * prologues need to detect whether there were any escapes or line
1210                                  * continuations or not.
1211                                  */
1212                                 out_token->num_escapes++;
1213                         } else {
1214                                 /* part of string */
1215                                 DUK__APPENDBUFFER(lex_ctx, x);
1216                         }
1217                 }
1218
1219                 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1220                 out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1221
1222                 DUK__INITBUFFER(lex_ctx);  /* free some memory */
1223
1224                 advtok = DUK__ADVTOK(0, DUK_TOK_STRING);
1225                 break;
1226         }
1227         default:
1228                 goto slow_path;
1229         }  /* switch */
1230
1231         goto skip_slow_path;
1232
1233  slow_path:
1234         if (duk_unicode_is_line_terminator(x)) {
1235                 if (x == 0x000d && DUK__L1() == 0x000a) {
1236                         /*
1237                          *  E5 Section 7.3: CR LF is detected as a single line terminator for
1238                          *  line numbers.  Here we also detect it as a single line terminator
1239                          *  token.
1240                          */
1241                         DUK__ADVANCECHARS(lex_ctx, 2);
1242                 } else {
1243                         DUK__ADVANCECHARS(lex_ctx, 1);
1244                 }
1245                 got_lineterm = 1;
1246                 goto restart_lineupdate;
1247         } else if (duk_unicode_is_identifier_start(x) || x == '\\') {
1248                 /*
1249                  *  Parse an identifier and then check whether it is:
1250                  *    - reserved word (keyword or other reserved word)
1251                  *    - "null"  (NullLiteral)
1252                  *    - "true"  (BooleanLiteral)
1253                  *    - "false" (BooleanLiteral)
1254                  *    - anything else => identifier
1255                  *
1256                  *  This does not follow the E5 productions cleanly, but is
1257                  *  useful and compact.
1258                  *
1259                  *  Note that identifiers may contain Unicode escapes,
1260                  *  see E5 Sections 6 and 7.6.  They must be decoded first,
1261                  *  and the result checked against allowed characters.
1262                  *  The above if-clause accepts an identifier start and an
1263                  *  '\' character -- no other token can begin with a '\'.
1264                  *
1265                  *  Note that "get" and "set" are not reserved words in E5
1266                  *  specification so they are recognized as plain identifiers
1267                  *  (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not
1268                  *  used now).  The compiler needs to work around this.
1269                  *
1270                  *  Strictly speaking, following Ecmascript longest match
1271                  *  specification, an invalid escape for the first character
1272                  *  should cause a syntax error.  However, an invalid escape
1273                  *  for IdentifierParts should just terminate the identifier
1274                  *  early (longest match), and let the next tokenization
1275                  *  fail.  For instance Rhino croaks with 'foo\z' when
1276                  *  parsing the identifier.  This has little practical impact.
1277                  */
1278
1279                 duk_small_int_t i, i_end;
1280                 duk_bool_t first = 1;
1281                 duk_hstring *str;
1282
1283                 DUK__INITBUFFER(lex_ctx);
1284                 for (;;) {
1285                         /* re-lookup first char on first loop */
1286                         if (DUK__L0() == '\\') {
1287                                 duk_codepoint_t ch;
1288                                 if (DUK__L1() != 'u') {
1289                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier");
1290                                 }
1291
1292                                 ch = duk__decode_uniesc_from_window(lex_ctx, 2);
1293
1294                                 /* IdentifierStart is stricter than IdentifierPart, so if the first
1295                                  * character is escaped, must have a stricter check here.
1296                                  */
1297                                 if (!(first ? duk_unicode_is_identifier_start(ch) : duk_unicode_is_identifier_part(ch))) {
1298                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier");
1299                                 }
1300                                 DUK__APPENDBUFFER(lex_ctx, ch);
1301                                 DUK__ADVANCECHARS(lex_ctx, 6);
1302
1303                                 /* Track number of escapes: necessary for proper keyword
1304                                  * detection.
1305                                  */
1306                                 out_token->num_escapes++;
1307                         } else {
1308                                 /* Note: first character is checked against this.  But because
1309                                  * IdentifierPart includes all IdentifierStart characters, and
1310                                  * the first character (if unescaped) has already been checked
1311                                  * in the if condition, this is OK.
1312                                  */
1313                                 if (!duk_unicode_is_identifier_part(DUK__L0())) {
1314                                         break;
1315                                 }
1316                                 DUK__APPENDBUFFER(lex_ctx, DUK__L0());
1317                                 DUK__ADVANCECHARS(lex_ctx, 1);
1318                         }
1319                         first = 0;
1320                 }
1321
1322                 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1323                 out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1324                 str = out_token->str1;
1325                 DUK_ASSERT(str != NULL);
1326                 out_token->t_nores = DUK_TOK_IDENTIFIER;
1327
1328                 DUK__INITBUFFER(lex_ctx);  /* free some memory */
1329
1330                 /*
1331                  *  Interned identifier is compared against reserved words, which are
1332                  *  currently interned into the heap context.  See genbuiltins.py.
1333                  *
1334                  *  Note that an escape in the identifier disables recognition of
1335                  *  keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to
1336                  *  identifier named "if").  This is not necessarily compliant,
1337                  *  see test-dec-escaped-char-in-keyword.js.
1338                  *
1339                  *  Note: "get" and "set" are awkward.  They are not officially
1340                  *  ReservedWords (and indeed e.g. "var set = 1;" is valid), and
1341                  *  must come out as DUK_TOK_IDENTIFIER.  The compiler needs to
1342                  *  work around this a bit.
1343                  */
1344
1345                 /* XXX: optimize by adding the token numbers directly into the
1346                  * always interned duk_hstring objects (there should be enough
1347                  * flag bits free for that)?
1348                  */
1349
1350                 i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED);
1351
1352                 advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER);
1353                 if (out_token->num_escapes == 0) {
1354                         for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) {
1355                                 DUK_ASSERT(i >= 0 && i < DUK_HEAP_NUM_STRINGS);
1356                                 if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) {
1357                                         advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i));
1358                                         break;
1359                                 }
1360                         }
1361                 }
1362         } else if (DUK__ISDIGIT(x) || (x == '.')) {
1363                 /* Note: decimal number may start with a period, but must be followed by a digit */
1364
1365                 /*
1366                  *  DecimalLiteral, HexIntegerLiteral, OctalIntegerLiteral
1367                  *  "pre-parsing", followed by an actual, accurate parser step.
1368                  *
1369                  *  Note: the leading sign character ('+' or '-') is -not- part of
1370                  *  the production in E5 grammar, and that a DecimalLiteral
1371                  *  starting with a '0' must be followed by a non-digit.  Leading
1372                  *  zeroes are syntax errors and must be checked for.
1373                  *
1374                  *  XXX: the two step parsing process is quite awkward, it would
1375                  *  be more straightforward to allow numconv to parse the longest
1376                  *  valid prefix (it already does that, it only needs to indicate
1377                  *  where the input ended).  However, the lexer decodes characters
1378                  *  using a lookup window, so this is not a trivial change.
1379                  */
1380
1381                 /* XXX: because of the final check below (that the literal is not
1382                  * followed by a digit), this could maybe be simplified, if we bail
1383                  * out early from a leading zero (and if there are no periods etc).
1384                  * Maybe too complex.
1385                  */
1386
1387                 duk_double_t val;
1388                 duk_bool_t int_only = 0;
1389                 duk_bool_t allow_hex = 0;
1390                 duk_small_int_t state;  /* 0=before period/exp,
1391                                          * 1=after period, before exp
1392                                          * 2=after exp, allow '+' or '-'
1393                                          * 3=after exp and exp sign
1394                                          */
1395                 duk_small_uint_t s2n_flags;
1396                 duk_codepoint_t y;
1397
1398                 DUK__INITBUFFER(lex_ctx);
1399                 y = DUK__L1();
1400                 if (x == '0' && (y == 'x' || y == 'X')) {
1401                         DUK__APPENDBUFFER(lex_ctx, x);
1402                         DUK__APPENDBUFFER(lex_ctx, y);
1403                         DUK__ADVANCECHARS(lex_ctx, 2);
1404                         int_only = 1;
1405                         allow_hex = 1;
1406 #if defined(DUK_USE_OCTAL_SUPPORT)
1407                 } else if (!strict_mode && x == '0' && DUK__ISDIGIT(y)) {
1408                         /* Note: if DecimalLiteral starts with a '0', it can only be
1409                          * followed by a period or an exponent indicator which starts
1410                          * with 'e' or 'E'.  Hence the if-check above ensures that
1411                          * OctalIntegerLiteral is the only valid NumericLiteral
1412                          * alternative at this point (even if y is, say, '9').
1413                          */
1414
1415                         DUK__APPENDBUFFER(lex_ctx, x);
1416                         DUK__ADVANCECHARS(lex_ctx, 1);
1417                         int_only = 1;
1418 #endif
1419                 }
1420
1421                 state = 0;
1422                 for (;;) {
1423                         x = DUK__L0();  /* re-lookup curr char on first round */
1424                         if (DUK__ISDIGIT(x)) {
1425                                 /* Note: intentionally allow leading zeroes here, as the
1426                                  * actual parser will check for them.
1427                                  */
1428                                 if (state == 2) {
1429                                         state = 3;
1430                                 }
1431                         } else if (allow_hex && DUK__ISHEXDIGIT(x)) {
1432                                 /* Note: 'e' and 'E' are also accepted here. */
1433                                 ;
1434                         } else if (x == '.') {
1435                                 if (state >= 1 || int_only) {
1436                                         break;
1437                                 } else {
1438                                         state = 1;
1439                                 }
1440                         } else if (x == 'e' || x == 'E') {
1441                                 if (state >= 2 || int_only) {
1442                                         break;
1443                                 } else {
1444                                         state = 2;
1445                                 }
1446                         } else if (x == '-' || x == '+') {
1447                                 if (state != 2) {
1448                                         break;
1449                                 } else {
1450                                         state = 3;
1451                                 }
1452                         } else {
1453                                 break;
1454                         }
1455                         DUK__APPENDBUFFER(lex_ctx, x);
1456                         DUK__ADVANCECHARS(lex_ctx, 1);
1457                 }
1458
1459                 /* XXX: better coercion */
1460                 duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
1461
1462                 s2n_flags = DUK_S2N_FLAG_ALLOW_EXP |
1463                             DUK_S2N_FLAG_ALLOW_FRAC |
1464                             DUK_S2N_FLAG_ALLOW_NAKED_FRAC |
1465                             DUK_S2N_FLAG_ALLOW_EMPTY_FRAC |
1466 #if defined(DUK_USE_OCTAL_SUPPORT)
1467                             (strict_mode ? 0 : DUK_S2N_FLAG_ALLOW_AUTO_OCT_INT) |
1468 #endif
1469                             DUK_S2N_FLAG_ALLOW_AUTO_HEX_INT;
1470
1471                 duk_dup((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);
1472                 duk_numconv_parse((duk_context *) lex_ctx->thr, 10 /*radix*/, s2n_flags);
1473                 val = duk_to_number((duk_context *) lex_ctx->thr, -1);
1474                 if (DUK_ISNAN(val)) {
1475                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal");
1476                 }
1477                 duk_replace((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx);  /* could also just pop? */
1478
1479                 DUK__INITBUFFER(lex_ctx);  /* free some memory */
1480
1481                 /* Section 7.8.3 (note): NumericLiteral must be followed by something other than
1482                  * IdentifierStart or DecimalDigit.
1483                  */
1484
1485                 if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) {
1486                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal");
1487                 }
1488
1489                 out_token->num = val;
1490                 advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER);
1491         } else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) {
1492                 DUK__ADVANCECHARS(lex_ctx, 1);
1493                 goto restart;
1494         } else if (x < 0) {
1495                 advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
1496         } else {
1497                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid token");
1498         }
1499  skip_slow_path:
1500
1501         /*
1502          *  Shared exit path
1503          */
1504
1505         DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
1506         out_token->t = advtok & 0xff;
1507         if (out_token->t_nores < 0) {
1508                 out_token->t_nores = out_token->t;
1509         }
1510         out_token->lineterm = got_lineterm;
1511
1512         /* Automatic semicolon insertion is allowed if a token is preceded
1513          * by line terminator(s), or terminates a statement list (right curly
1514          * or EOF).
1515          */
1516         if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) {
1517                 out_token->allow_auto_semi = 1;
1518         } else {
1519                 out_token->allow_auto_semi = 0;
1520         }
1521 }
1522
1523 #if defined(DUK_USE_REGEXP_SUPPORT)
1524
1525 /*
1526  *  Parse a RegExp token.  The grammar is described in E5 Section 15.10.
1527  *  Terminal constructions (such as quantifiers) are parsed directly here.
1528  *
1529  *  0xffffffffU is used as a marker for "infinity" in quantifiers.  Further,
1530  *  DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that
1531  *  will be accepted for a quantifier.
      *
      *  (The "infinity" marker appears in the code as DUK_RE_QUANTIFIER_INFINITE;
      *  presumably that constant has the value mentioned above -- confirm in
      *  the header.)
      *
      *  Arguments:
      *    lex_ctx    Lexer context.  Its token_count is incremented on every
      *               call and compared against token_limit.
      *    out_token  Zeroed first, then filled in.  't' is always set on a
      *               normal return; 'qmin', 'qmax' and 'greedy' are set for
      *               quantifiers; 'num' carries the code point for character
      *               atoms and the reference number for backreferences.
      *
      *  Errors (raised through the DUK_ERROR_xxx macros):
      *    - DUK_ERROR_RANGE, "token limit": token_count reached token_limit.
      *    - DUK_ERROR_SYNTAX: invalid escape, invalid group, forbidden pattern
      *      character, or (only when DUK_USE_ES6_REGEXP_BRACES is not defined)
      *      an invalid quantifier.
      *
      *  Character classes are not parsed here: only the opening '[' or '[^' is
      *  turned into a token.
1532  */
1533
1534 DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) {
             /* advtok combines (via DUK__ADVTOK) a pending advance count with the
              * token type; it is split again in the shared exit path below.
              */
1535         duk_small_int_t advtok = 0;  /* init is unnecessary but suppresses "may be used uninitialized" warnings */
1536         duk_codepoint_t x, y;  /* x = DUK__L0(), y = DUK__L1(); x is reused as a scratch value in some cases */
1537
             /* Enforce the token limit before doing any work. */
1538         if (++lex_ctx->token_count >= lex_ctx->token_limit) {
1539                 DUK_ERROR_RANGE(lex_ctx->thr, "token limit");
1540                 return;  /* unreachable */
1541         }
1542
             /* All fields not explicitly written below stay zero. */
1543         DUK_MEMZERO(out_token, sizeof(*out_token));
1544
1545         x = DUK__L0();
1546         y = DUK__L1();
1547
1548         DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y));
1549
1550         switch (x) {
1551         case '|': {
1552                 advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION);
1553                 break;
1554         }
1555         case '^': {
1556                 advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START);
1557                 break;
1558         }
1559         case '$': {
1560                 advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END);
1561                 break;
1562         }
1563         case '?': {
                     /* '?' => {0,1}; a following '?' makes the quantifier non-greedy
                      * and is consumed as part of the same token.
                      */
1564                 out_token->qmin = 0;
1565                 out_token->qmax = 1;
1566                 if (y == '?') {
1567                         advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1568                         out_token->greedy = 0;
1569                 } else {
1570                         advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1571                         out_token->greedy = 1;
1572                 }
1573                 break;
1574         }
1575         case '*': {
                     /* '*' => {0,infinite}, with the same optional '?' suffix. */
1576                 out_token->qmin = 0;
1577                 out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1578                 if (y == '?') {
1579                         advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1580                         out_token->greedy = 0;
1581                 } else {
1582                         advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1583                         out_token->greedy = 1;
1584                 }
1585                 break;
1586         }
1587         case '+': {
                     /* '+' => {1,infinite}, with the same optional '?' suffix. */
1588                 out_token->qmin = 1;
1589                 out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1590                 if (y == '?') {
1591                         advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
1592                         out_token->greedy = 0;
1593                 } else {
1594                         advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
1595                         out_token->greedy = 1;
1596                 }
1597                 break;
1598         }
1599         case '{': {
                     /* Counted quantifier: {n}, {n,} or {n,m}.
                      *
                      * val1 accumulates the number currently being read.  val2
                      * doubles as a "comma already seen" flag: it stays at
                      * DUK_RE_QUANTIFIER_INFINITE until a ',' moves the first
                      * number into it.  'digits' counts the digits of the number
                      * currently being read.
                      */
1600                 /* Production allows 'DecimalDigits', including leading zeroes */
1601                 duk_uint_fast32_t val1 = 0;
1602                 duk_uint_fast32_t val2 = DUK_RE_QUANTIFIER_INFINITE;
1603                 duk_small_int_t digits = 0;
1604 #if defined(DUK_USE_ES6_REGEXP_BRACES)
1605                 duk_lexer_point lex_pt;
1606 #endif
1607
1608 #if defined(DUK_USE_ES6_REGEXP_BRACES)
1609                 /* Store lexer position, restoring if quantifier is invalid. */
1610                 DUK_LEXER_GETPOINT(lex_ctx, &lex_pt);
1611 #endif
1612
1613                 for (;;) {
                             /* The advance at the top of the loop consumes the
                              * character handled by the previous round (the digit
                              * or ',' just processed), or the '{' on the first round.
                              */
1614                         DUK__ADVANCECHARS(lex_ctx, 1);  /* eat '{' on entry */
1615                         x = DUK__L0();
1616                         if (DUK__ISDIGIT(x)) {
                                     /* The digit count is only validated when a ','
                                      * or '}' is reached; an over-long number is
                                      * rejected there and val1 is then not used.
                                      */
1617                                 digits++;
1618                                 val1 = val1 * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x);
1619                         } else if (x == ',') {
1620                                 if (digits > DUK__MAX_RE_QUANT_DIGITS) {
1621                                         goto invalid_quantifier;
1622                                 }
1623                                 if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
                                             /* second ',' inside the braces */
1624                                         goto invalid_quantifier;
1625                                 }
1626                                 if (DUK__L1() == '}') {
1627                                         /* form: { DecimalDigits , }, val1 = min count */
1628                                         if (digits == 0) {
1629                                                 goto invalid_quantifier;
1630                                         }
1631                                         out_token->qmin = val1;
1632                                         out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
1633                                         DUK__ADVANCECHARS(lex_ctx, 2);  /* eat ',' and '}' */
1634                                         break;
1635                                 }
                                     /* NOTE(review): 'digits == 0' is not checked on
                                      * this path, so a leading comma such as "{,5}"
                                      * continues with val2 = 0 and ends up as
                                      * qmin=0, qmax=5.  Confirm whether accepting
                                      * that form is intended.
                                      */
1636                                 val2 = val1;
1637                                 val1 = 0;
1638                                 digits = 0;  /* not strictly necessary because of lookahead '}' above */
1639                         } else if (x == '}') {
1640                                 if (digits > DUK__MAX_RE_QUANT_DIGITS) {
1641                                         goto invalid_quantifier;
1642                                 }
1643                                 if (digits == 0) {
                                             /* "{}" or "{n,}" handled above; here a number is required */
1644                                         goto invalid_quantifier;
1645                                 }
1646                                 if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
1647                                         /* val2 = min count, val1 = max count */
1648                                         out_token->qmin = val2;
1649                                         out_token->qmax = val1;
1650                                 } else {
1651                                         /* val1 = count */
1652                                         out_token->qmin = val1;
1653                                         out_token->qmax = val1;
1654                                 }
1655                                 DUK__ADVANCECHARS(lex_ctx, 1);  /* eat '}' */
1656                                 break;
1657                         } else {
1658                                 goto invalid_quantifier;
1659                         }
1660                 }
                     /* Optional '?' suffix selects the non-greedy variant. */
1661                 if (DUK__L0() == '?') {
1662                         out_token->greedy = 0;
1663                         DUK__ADVANCECHARS(lex_ctx, 1);
1664                 } else {
1665                         out_token->greedy = 1;
1666                 }
                     /* Everything was consumed above, so no further advance is requested. */
1667                 advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER);
1668                 break;
                     /* Reached only through the gotos above (the preceding statement is a break). */
1669  invalid_quantifier:
1670 #if defined(DUK_USE_ES6_REGEXP_BRACES)
1671                 /* Failed to match the quantifier, restore lexer and parse
1672                  * opening brace as a literal.
1673                  */
1674                 DUK_LEXER_SETPOINT(lex_ctx, &lex_pt);
1675                 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
1676                 out_token->num = '{';
1677 #else
1678                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp quantifier");
1679 #endif
1680                 break;
1681         }
1682         case '.': {
1683                 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD);
1684                 break;
1685         }
1686         case '\\': {
1687                 /* The E5.1 specification does not seem to allow IdentifierPart characters
1688                  * to be used as identity escapes.  Unfortunately this includes '$', which
1689                  * cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'.
1690                  * Many other implementations (including V8 and Rhino, for instance) do
1691                  * accept '\$' as a valid identity escape, which is quite pragmatic.
1692                  * See: test-regexp-identity-escape-dollar.js.
1693                  */
1694
                     /* The branches below either keep this default and only set
                      * out_token->num, or overwrite advtok with another token type
                      * and/or length.
                      */
1695                 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);  /* default: char escape (two chars) */
1696                 if (y == 'b') {
1697                         advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY);
1698                 } else if (y == 'B') {
1699                         advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY);
1700                 } else if (y == 'f') {
1701                         out_token->num = 0x000c;
1702                 } else if (y == 'n') {
1703                         out_token->num = 0x000a;
1704                 } else if (y == 't') {
1705                         out_token->num = 0x0009;
1706                 } else if (y == 'r') {
1707                         out_token->num = 0x000d;
1708                 } else if (y == 'v') {
1709                         out_token->num = 0x000b;
1710                 } else if (y == 'c') {
                             /* Control escape: backslash, 'c', ASCII letter.  The
                              * letter's code modulo 32 is the result, e.g. both 'A'
                              * and 'a' give 1.
                              */
1711                         x = DUK__L2();
1712                         if ((x >= 'a' && x <= 'z') ||
1713                             (x >= 'A' && x <= 'Z')) {
1714                                 out_token->num = (x % 32);
1715                                 advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR);
1716                         } else {
1717                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
1718                         }
1719                 } else if (y == 'x') {
                             /* Hex escape, decoded starting at window offset 2; the token spans 4 characters. */
1720                         out_token->num = duk__decode_hexesc_from_window(lex_ctx, 2);
1721                         advtok = DUK__ADVTOK(4, DUK_RETOK_ATOM_CHAR);
1722                 } else if (y == 'u') {
                             /* Unicode escape, decoded starting at window offset 2; the token spans 6 characters. */
1723                         out_token->num = duk__decode_uniesc_from_window(lex_ctx, 2);
1724                         advtok = DUK__ADVTOK(6, DUK_RETOK_ATOM_CHAR);
1725                 } else if (y == 'd') {
1726                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT);
1727                 } else if (y == 'D') {
1728                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT);
1729                 } else if (y == 's') {
1730                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE);
1731                 } else if (y == 'S') {
1732                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE);
1733                 } else if (y == 'w') {
1734                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR);
1735                 } else if (y == 'W') {
1736                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR);
1737                 } else if (DUK__ISDIGIT(y)) {
1738                         /* E5 Section 15.10.2.11 */
1739                         if (y == '0') {
                                     /* Backslash followed by '0' is a NUL character, but
                                      * only when no further digit follows.
                                      */
1740                                 if (DUK__ISDIGIT(DUK__L2())) {
1741                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
1742                                 }
1743                                 out_token->num = 0x0000;
1744                                 advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);
1745                         } else {
                                     /* Decimal backreference.  Each round first advances
                                      * past the previously handled character (the backslash
                                      * on the first round, then each accumulated digit) and
                                      * then looks at the next one.
                                      *
                                      * The limit check runs before the next character is
                                      * examined, so the escape is rejected as soon as
                                      * DUK__MAX_RE_DECESC_DIGITS digits have been
                                      * accumulated, even when no further digit follows.
                                      */
1746                                 /* XXX: shared parsing? */
1747                                 duk_uint_fast32_t val = 0;
1748                                 duk_small_int_t i;
1749                                 for (i = 0; ; i++) {
1750                                         if (i >= DUK__MAX_RE_DECESC_DIGITS) {
1751                                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
1752                                         }
1753                                         DUK__ADVANCECHARS(lex_ctx, 1);  /* eat backslash on entry */
1754                                         x = DUK__L0();
1755                                         if (!DUK__ISDIGIT(x)) {
1756                                                 break;
1757                                         }
1758                                         val = val * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x);
1759                                 }
1760                                 /* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */
                                     /* All characters were consumed in the loop: advance count is 0. */
1761                                 advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE);
1762                                 out_token->num = val;
1763                         }
1764                 } else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) ||
1765 #if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE)
1766                            y == '$' ||
1767 #endif
1768                            y == DUK_UNICODE_CP_ZWNJ ||
1769                            y == DUK_UNICODE_CP_ZWJ) {
1770                         /* IdentityEscape, with dollar added as a valid additional
1771                          * non-standard escape (see test-regexp-identity-escape-dollar.js).
1772                          * Careful not to match end-of-buffer (<0) here.
1773                          */
1774                         out_token->num = y;
1775                 } else {
1776                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
1777                 }
1778                 break;
1779         }
1780         case '(': {
1781                 /* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */
1782
                     /* '(' followed by '?' must continue with '=', '!' or ':';
                      * anything else after "(?" is a syntax error.
                      */
1783                 if (y == '?') {
1784                         if (DUK__L2() == '=') {
1785                                 /* (?= */
1786                                 advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD);
1787                         } else if (DUK__L2() == '!') {
1788                                 /* (?! */
1789                                 advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD);
1790                         } else if (DUK__L2() == ':') {
1791                                 /* (?: */
1792                                 advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP);
1793                         } else {
1794                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp group");
1795                                 return;  /* presumably unreachable (assumes DUK_ERROR_SYNTAX does not return) */
1796                         }
1797                 } else {
1798                         /* ( */
1799                         advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP);
1800                 }
1801                 break;
1802         }
1803         case ')': {
1804                 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP);
1805                 break;
1806         }
1807         case '[': {
1808                 /*
1809                  *  To avoid creating a heavy intermediate value for the list of ranges,
1810                  *  only the start token ('[' or '[^') is parsed here.  The regexp
1811                  *  compiler parses the ranges itself.
1812                  */
1813                 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS);
1814                 if (y == '^') {
1815                         advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED);
1816                 }
1817                 break;
1818         }
             /* A bare '}' is rejected only when DUK_USE_ES6_REGEXP_BRACES is not
              * defined; otherwise it falls to the default case as a literal.
              */
1819 #if !defined(DUK_USE_ES6_REGEXP_BRACES)
1820         case '}':
1821 #endif
1822         case ']': {
1823                 /* Although these could be parsed as PatternCharacters unambiguously (here),
1824                  * E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters.
1825                  */
1826                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp character");
1827                 break;
1828         }
1829         case -1: {
1830                 /* EOF */
                     /* NOTE(review): this uses DUK_TOK_EOF rather than a DUK_RETOK_*
                      * constant; presumably the two EOF values coincide -- confirm
                      * in the header.
                      */
1831                 advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
1832                 break;
1833         }
1834         default: {
1835                 /* PatternCharacter, all excluded characters are matched by cases above */
1836                 advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
1837                 out_token->num = x;
1838                 break;
1839         }
1840         }
1841
1842         /*
1843          *  Shared exit path
1844          */
1845
             /* Split advtok: the part above the low 8 bits is the remaining advance,
              * the low 8 bits are the token type.
              */
1846         DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
1847         out_token->t = advtok & 0xff;
1848 }
1849
1850 /*
1851  *  Special parser for character classes; calls callback for every
1852  *  range parsed and returns the number of ranges present.
1853  */
1854
1855 /* XXX: this duplicates functionality in duk_regexp.c where a similar loop is
1856  * required anyway.  We could use that BUT we need to update the regexp compiler
1857  * 'nranges' too.  Work this out a bit more cleanly to save space.
1858  */
1859
1860 /* XXX: the handling of character range detection is a bit convoluted.
1861  * Try to simplify and make smaller.
1862  */
1863
/* XXX: logic for handling character ranges is now incorrect: it accepts
 * e.g. [\d-z], which it should arguably reject.  SMJS accepts this too, though.
 *
 * Needs a read through and a lot of additional tests.
 */
1869
1870 DUK_LOCAL
1871 void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx,
1872                                  duk_re_range_callback gen_range,
1873                                  void *userdata,
1874                                  const duk_uint16_t *ranges,
1875                                  duk_small_int_t num) {
1876         const duk_uint16_t *ranges_end;
1877
1878         DUK_UNREF(lex_ctx);
1879
1880         ranges_end = ranges + num;
1881         while (ranges < ranges_end) {
1882                 /* mark range 'direct', bypass canonicalization (see Wiki) */
1883                 gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1);
1884                 ranges += 2;
1885         }
1886 }
1887
1888 DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) {
1889         duk_codepoint_t start = -1;
1890         duk_codepoint_t ch;
1891         duk_codepoint_t x;
1892         duk_bool_t dash = 0;
1893
1894         DUK_DD(DUK_DDPRINT("parsing regexp ranges"));
1895
1896         for (;;) {
1897                 x = DUK__L0();
1898                 DUK__ADVANCECHARS(lex_ctx, 1);
1899
1900                 ch = -1;  /* not strictly necessary, but avoids "uninitialized variable" warnings */
1901                 DUK_UNREF(ch);
1902
1903                 if (x < 0) {
1904                         DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in character class");
1905                 } else if (x == ']') {
1906                         if (start >= 0) {
1907                                 gen_range(userdata, start, start, 0);
1908                         }
1909                         break;
1910                 } else if (x == '-') {
1911                         if (start >= 0 && !dash && DUK__L0() != ']') {
1912                                 /* '-' as a range indicator */
1913                                 dash = 1;
1914                                 continue;
1915                         } else {
1916                                 /* '-' verbatim */
1917                                 ch = x;
1918                         }
1919                 } else if (x == '\\') {
1920                         /*
1921                          *  The escapes are same as outside a character class, except that \b has a
1922                          *  different meaning, and \B and backreferences are prohibited (see E5
1923                          *  Section 15.10.2.19).  However, it's difficult to share code because we
1924                          *  handle e.g. "\n" very differently: here we generate a single character
1925                          *  range for it.
1926                          */
1927
1928                         x = DUK__L0();
1929                         DUK__ADVANCECHARS(lex_ctx, 1);
1930
1931                         if (x == 'b') {
1932                                 /* Note: '\b' in char class is different than outside (assertion),
1933                                  * '\B' is not allowed and is caught by the duk_unicode_is_identifier_part()
1934                                  * check below.
1935                                  */
1936                                 ch = 0x0008;
1937                         } else if (x == 'f') {
1938                                 ch = 0x000c;
1939                         } else if (x == 'n') {
1940                                 ch = 0x000a;
1941                         } else if (x == 't') {
1942                                 ch = 0x0009;
1943                         } else if (x == 'r') {
1944                                 ch = 0x000d;
1945                         } else if (x == 'v') {
1946                                 ch = 0x000b;
1947                         } else if (x == 'c') {
1948                                 x = DUK__L0();
1949                                 DUK__ADVANCECHARS(lex_ctx, 1);
1950                                 if ((x >= 'a' && x <= 'z') ||
1951                                     (x >= 'A' && x <= 'Z')) {
1952                                         ch = (x % 32);
1953                                 } else {
1954                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
1955                                         return;  /* never reached, but avoids warnings of
1956                                                   * potentially unused variables.
1957                                                   */
1958                                 }
1959                         } else if (x == 'x') {
1960                                 ch = duk__decode_hexesc_from_window(lex_ctx, 0);
1961                                 DUK__ADVANCECHARS(lex_ctx, 2);
1962                         } else if (x == 'u') {
1963                                 ch = duk__decode_uniesc_from_window(lex_ctx, 0);
1964                                 DUK__ADVANCECHARS(lex_ctx, 4);
1965                         } else if (x == 'd') {
1966                                 duk__emit_u16_direct_ranges(lex_ctx,
1967                                                             gen_range,
1968                                                             userdata,
1969                                                             duk_unicode_re_ranges_digit,
1970                                                             sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
1971                                 ch = -1;
1972                         } else if (x == 'D') {
1973                                 duk__emit_u16_direct_ranges(lex_ctx,
1974                                                             gen_range,
1975                                                             userdata,
1976                                                             duk_unicode_re_ranges_not_digit,
1977                                                             sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t));
1978                                 ch = -1;
1979                         } else if (x == 's') {
1980                                 duk__emit_u16_direct_ranges(lex_ctx,
1981                                                             gen_range,
1982                                                             userdata,
1983                                                             duk_unicode_re_ranges_white,
1984                                                             sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
1985                                 ch = -1;
1986                         } else if (x == 'S') {
1987                                 duk__emit_u16_direct_ranges(lex_ctx,
1988                                                             gen_range,
1989                                                             userdata,
1990                                                             duk_unicode_re_ranges_not_white,
1991                                                             sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t));
1992                                 ch = -1;
1993                         } else if (x == 'w') {
1994                                 duk__emit_u16_direct_ranges(lex_ctx,
1995                                                             gen_range,
1996                                                             userdata,
1997                                                             duk_unicode_re_ranges_wordchar,
1998                                                             sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
1999                                 ch = -1;
2000                         } else if (x == 'W') {
2001                                 duk__emit_u16_direct_ranges(lex_ctx,
2002                                                             gen_range,
2003                                                             userdata,
2004                                                             duk_unicode_re_ranges_not_wordchar,
2005                                                             sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t));
2006                                 ch = -1;
2007                         } else if (DUK__ISDIGIT(x)) {
2008                                 /* DecimalEscape, only \0 is allowed, no leading zeroes are allowed */
2009                                 if (x == '0' && !DUK__ISDIGIT(DUK__L0())) {
2010                                         ch = 0x0000;
2011                                 } else {
2012                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
2013                                 }
2014                         } else if (!duk_unicode_is_identifier_part(x)
2015 #if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE)
2016                                    || x == '$'
2017 #endif
2018                                   ) {
2019                                 /* IdentityEscape */
2020                                 ch = x;
2021                         } else {
2022                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape");
2023                         }
2024                 } else {
2025                         /* character represents itself */
2026                         ch = x;
2027                 }
2028
2029                 /* ch is a literal character here or -1 if parsed entity was
2030                  * an escape such as "\s".
2031                  */
2032
2033                 if (ch < 0) {
2034                         /* multi-character sets not allowed as part of ranges, see
2035                          * E5 Section 15.10.2.15, abstract operation CharacterRange.
2036                          */
2037                         if (start >= 0) {
2038                                 if (dash) {
2039                                         DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range");
2040                                 } else {
2041                                         gen_range(userdata, start, start, 0);
2042                                         start = -1;
2043                                         /* dash is already 0 */
2044                                 }
2045                         }
2046                 } else {
2047                         if (start >= 0) {
2048                                 if (dash) {
2049                                         if (start > ch) {
2050                                                 DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range");
2051                                         }
2052                                         gen_range(userdata, start, ch, 0);
2053                                         start = -1;
2054                                         dash = 0;
2055                                 } else {
2056                                         gen_range(userdata, start, start, 0);
2057                                         start = ch;
2058                                         /* dash is already 0 */
2059                                 }
2060                         } else {
2061                                 start = ch;
2062                         }
2063                 }
2064         }
2065
2066         return;
2067 }
2068
2069 #endif  /* DUK_USE_REGEXP_SUPPORT */