ceph/src/civetweb/src/third_party/duktape-1.8.0/src-separate/duk_unicode_support.c

   1 /*
   2  *  Various Unicode help functions for character classification predicates,
   3  *  case conversion, decoding, etc.
   4  */
   5
   6 #include "duk_internal.h"
   7
   8 /*
   9  *  Fast path tables
  10  */
  11
  12 #if defined(DUK_USE_IDCHAR_FASTPATH)
  13 DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
  14         /* 0: not IdentifierStart or IdentifierPart
  15          * 1: IdentifierStart and IdentifierPart
  16          * -1: IdentifierPart only
  17          */
  18         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x00...0x0f */
  19         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x10...0x1f */
  20         0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x20...0x2f */
  21         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  0,  0,  0,  0,  0,   /* 0x30...0x3f */
  22         0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,   /* 0x40...0x4f */
  23         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,   /* 0x50...0x5f */
  24         0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,   /* 0x60...0x6f */
  25         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0    /* 0x70...0x7f */
  26 };
  27 #endif
  28
  29 /*
  30  *  XUTF-8 and CESU-8 encoding/decoding
  31  */
  32
  33 DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
  34         duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
  35         if (x < 0x80UL) {
  36                 /* 7 bits */
  37                 return 1;
  38         } else if (x < 0x800UL) {
  39                 /* 11 bits */
  40                 return 2;
  41         } else if (x < 0x10000UL) {
  42                 /* 16 bits */
  43                 return 3;
  44         } else if (x < 0x200000UL) {
  45                 /* 21 bits */
  46                 return 4;
  47         } else if (x < 0x4000000UL) {
  48                 /* 26 bits */
  49                 return 5;
  50         } else if (x < (duk_ucodepoint_t) 0x80000000UL) {
  51                 /* 31 bits */
  52                 return 6;
  53         } else {
  54                 /* 36 bits */
  55                 return 7;
  56         }
  57 }
  58
  59 #if defined(DUK_USE_ASSERTIONS)
  60 DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
  61         duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
  62         if (x < 0x80UL) {
  63                 /* 7 bits */
  64                 return 1;
  65         } else if (x < 0x800UL) {
  66                 /* 11 bits */
  67                 return 2;
  68         } else if (x < 0x10000UL) {
  69                 /* 16 bits */
  70                 return 3;
  71         } else {
  72                 /* Encoded as surrogate pair, each encoding to 3 bytes for
  73                  * 6 bytes total.  Codepoints above U+10FFFF encode as 6 bytes
  74                  * too, see duk_unicode_encode_cesu8().
  75                   */
  76                 return 3 + 3;
  77         }
  78 }
  79 #endif  /* DUK_USE_ASSERTIONS */
  80
  81 DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = {
  82         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
  83 };
  84
  85 /* Encode to extended UTF-8; 'out' must have space for at least
  86  * DUK_UNICODE_MAX_XUTF8_LENGTH bytes.  Allows encoding of any
  87  * 32-bit (unsigned) codepoint.
  88  */
  89 DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
  90         duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
  91         duk_small_int_t len;
  92         duk_uint8_t marker;
  93         duk_small_int_t i;
  94
  95         len = duk_unicode_get_xutf8_length(cp);
  96         DUK_ASSERT(len > 0);
  97
  98         marker = duk_unicode_xutf8_markers[len - 1];  /* 64-bit OK because always >= 0 */
  99
 100         i = len;
 101         DUK_ASSERT(i > 0);
 102         do {
 103                 i--;
 104                 if (i > 0) {
 105                         out[i] = (duk_uint8_t) (0x80 + (x & 0x3f));
 106                         x >>= 6;
 107                 } else {
 108                         /* Note: masking of 'x' is not necessary because of
 109                          * range check and shifting -> no bits overlapping
 110                          * the marker should be set.
 111                          */
 112                         out[0] = (duk_uint8_t) (marker + x);
 113                 }
 114         } while (i > 0);
 115
 116         return len;
 117 }
 118
 119 /* Encode to CESU-8; 'out' must have space for at least
 120  * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
 121  * will encode to garbage but won't overwrite the output buffer.
 122  */
 123 DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
 124         duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
 125         duk_small_int_t len;
 126
 127         if (x < 0x80UL) {
 128                 out[0] = (duk_uint8_t) x;
 129                 len = 1;
 130         } else if (x < 0x800UL) {
 131                 out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f));
 132                 out[1] = (duk_uint8_t) (0x80 + (x & 0x3f));
 133                 len = 2;
 134         } else if (x < 0x10000UL) {
 135                 /* surrogate pairs get encoded here */
 136                 out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f));
 137                 out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f));
 138                 out[2] = (duk_uint8_t) (0x80 + (x & 0x3f));
 139                 len = 3;
 140         } else {
 141                 /*
 142                  *  Unicode codepoints above U+FFFF are encoded as surrogate
 143                  *  pairs here.  This ensures that all CESU-8 codepoints are
 144                  *  16-bit values as expected in Ecmascript.  The surrogate
 145                  *  pairs always get a 3-byte encoding (each) in CESU-8.
 146                  *  See: http://en.wikipedia.org/wiki/Surrogate_pair
 147                  *
 148                  *  20-bit codepoint, 10 bits (A and B) per surrogate pair:
 149                  *
 150                  *    x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
 151                  *  sp1 = 0b110110AA AAAAAAAA  (0xd800 + ((x >> 10) & 0x3ff))
 152                  *  sp2 = 0b110111BB BBBBBBBB  (0xdc00 + (x & 0x3ff))
 153                  *
 154                  *  Encoded into CESU-8:
 155                  *
 156                  *  sp1 -> 0b11101101  (0xe0 + ((sp1 >> 12) & 0x0f))
 157                  *      -> 0b1010AAAA  (0x80 + ((sp1 >> 6) & 0x3f))
 158                  *      -> 0b10AAAAAA  (0x80 + (sp1 & 0x3f))
 159                  *  sp2 -> 0b11101101  (0xe0 + ((sp2 >> 12) & 0x0f))
 160                  *      -> 0b1011BBBB  (0x80 + ((sp2 >> 6) & 0x3f))
 161                  *      -> 0b10BBBBBB  (0x80 + (sp2 & 0x3f))
 162                  *
 163                  *  Note that 0x10000 must be subtracted first.  The code below
 164                  *  avoids the sp1, sp2 temporaries which saves around 20 bytes
 165                  *  of code.
 166                  */
 167
 168                 x -= 0x10000UL;
 169
 170                 out[0] = (duk_uint8_t) (0xed);
 171                 out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f));
 172                 out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f));
 173                 out[3] = (duk_uint8_t) (0xed);
 174                 out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f));
 175                 out[5] = (duk_uint8_t) (0x80 + (x & 0x3f));
 176                 len = 6;
 177         }
 178
 179         return len;
 180 }
 181
 182 /* Decode helper.  Return zero on error. */
 183 DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) {
 184         const duk_uint8_t *p;
 185         duk_uint32_t res;
 186         duk_uint_fast8_t ch;
 187         duk_small_int_t n;
 188
 189         DUK_UNREF(thr);
 190
 191         p = *ptr;
 192         if (p < ptr_start || p >= ptr_end) {
 193                 goto fail;
 194         }
 195
 196         /*
 197          *  UTF-8 decoder which accepts longer than standard byte sequences.
 198          *  This allows full 32-bit code points to be used.
 199          */
 200
 201         ch = (duk_uint_fast8_t) (*p++);
 202         if (ch < 0x80) {
 203                 /* 0xxx xxxx   [7 bits] */
 204                 res = (duk_uint32_t) (ch & 0x7f);
 205                 n = 0;
 206         } else if (ch < 0xc0) {
 207                 /* 10xx xxxx -> invalid */
 208                 goto fail;
 209         } else if (ch < 0xe0) {
 210                 /* 110x xxxx   10xx xxxx   [11 bits] */
 211                 res = (duk_uint32_t) (ch & 0x1f);
 212                 n = 1;
 213         } else if (ch < 0xf0) {
 214                 /* 1110 xxxx   10xx xxxx   10xx xxxx   [16 bits] */
 215                 res = (duk_uint32_t) (ch & 0x0f);
 216                 n = 2;
 217         } else if (ch < 0xf8) {
 218                 /* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx   [21 bits] */
 219                 res = (duk_uint32_t) (ch & 0x07);
 220                 n = 3;
 221         } else if (ch < 0xfc) {
 222                 /* 1111 10xx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [26 bits] */
 223                 res = (duk_uint32_t) (ch & 0x03);
 224                 n = 4;
 225         } else if (ch < 0xfe) {
 226                 /* 1111 110x   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [31 bits] */
 227                 res = (duk_uint32_t) (ch & 0x01);
 228                 n = 5;
 229         } else if (ch < 0xff) {
 230                 /* 1111 1110   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [36 bits] */
 231                 res = (duk_uint32_t) (0);
 232                 n = 6;
 233         } else {
 234                 /* 8-byte format could be:
 235                  * 1111 1111   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [41 bits]
 236                  *
 237                  * However, this format would not have a zero bit following the
 238                  * leading one bits and would not allow 0xFF to be used as an
 239                  * "invalid xutf-8" marker for internal keys.  Further, 8-byte
 240                  * encodings (up to 41 bit code points) are not currently needed.
 241                  */
 242                 goto fail;
 243         }
 244
 245         DUK_ASSERT(p >= ptr_start);  /* verified at beginning */
 246         if (p + n > ptr_end) {
 247                 /* check pointer at end */
 248                 goto fail;
 249         }
 250
 251         while (n > 0) {
 252                 DUK_ASSERT(p >= ptr_start && p < ptr_end);
 253                 res = res << 6;
 254                 res += (duk_uint32_t) ((*p++) & 0x3f);
 255                 n--;
 256         }
 257
 258         *ptr = p;
 259         *out_cp = res;
 260         return 1;
 261
 262  fail:
 263         return 0;
 264 }
 265
 266 /* used by e.g. duk_regexp_executor.c, string built-ins */
 267 DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end) {
 268         duk_ucodepoint_t cp;
 269
 270         if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
 271                 return cp;
 272         }
 273         DUK_ERROR_INTERNAL(thr, "utf-8 decode failed");  /* XXX: 'internal error' is a bit of a misnomer */
 274         DUK_UNREACHABLE();
 275         return 0;
 276 }
 277
 278 /* Compute (extended) utf-8 length without codepoint encoding validation,
 279  * used for string interning.
 280  *
 281  * NOTE: This algorithm is performance critical, more so than string hashing
 282  * in some cases.  It is needed when interning a string and needs to scan
 283  * every byte of the string with no skipping.  Having an ASCII fast path
 284  * is useful if possible in the algorithm.  The current algorithms were
 285  * chosen from several variants, based on x64 gcc -O2 testing.  See:
 286  * https://github.com/svaarala/duktape/pull/422
 287  *
 288  * NOTE: must match src/dukutil.py:duk_unicode_unvalidated_utf8_length().
 289  */
 290
 291 #if defined(DUK_USE_PREFER_SIZE)
 292 /* Small variant; roughly 150 bytes smaller than the fast variant. */
 293 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
 294         const duk_uint8_t *p;
 295         const duk_uint8_t *p_end;
 296         duk_size_t ncont;
 297         duk_size_t clen;
 298
 299         p = data;
 300         p_end = data + blen;
 301         ncont = 0;
 302         while (p != p_end) {
 303                 duk_uint8_t x;
 304                 x = *p++;
 305                 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
 306                         ncont++;
 307                 }
 308         }
 309
 310         DUK_ASSERT(ncont <= blen);
 311         clen = blen - ncont;
 312         DUK_ASSERT(clen <= blen);
 313         return clen;
 314 }
 315 #else  /* DUK_USE_PREFER_SIZE */
 316 /* This seems like a good overall approach.  Fast path for ASCII in 4 byte
 317  * blocks.
 318  */
 319 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
 320         const duk_uint8_t *p;
 321         const duk_uint8_t *p_end;
 322         const duk_uint32_t *p32_end;
 323         const duk_uint32_t *p32;
 324         duk_size_t ncont;
 325         duk_size_t clen;
 326
 327         ncont = 0;  /* number of continuation (non-initial) bytes in [0x80,0xbf] */
 328         p = data;
 329         p_end = data + blen;
 330         if (blen < 16) {
 331                 goto skip_fastpath;
 332         }
 333
 334         /* Align 'p' to 4; the input data may have arbitrary alignment.
 335          * End of string check not needed because blen >= 16.
 336          */
 337         while (((duk_size_t) (const void *) p) & 0x03U) {
 338                 duk_uint8_t x;
 339                 x = *p++;
 340                 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
 341                         ncont++;
 342                 }
 343         }
 344
 345         /* Full, aligned 4-byte reads. */
 346         p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03)));
 347         p32 = (const duk_uint32_t *) (const void *) p;
 348         while (p32 != (const duk_uint32_t *) p32_end) {
 349                 duk_uint32_t x;
 350                 x = *p32++;
 351                 if (DUK_LIKELY((x & 0x80808080UL) == 0)) {
 352                         ;  /* ASCII fast path */
 353                 } else {
 354                         /* Flip highest bit of each byte which changes
 355                          * the bit pattern 10xxxxxx into 00xxxxxx which
 356                          * allows an easy bit mask test.
 357                          */
 358                         x ^= 0x80808080UL;
 359                         if (DUK_UNLIKELY(!(x & 0xc0000000UL))) {
 360                                 ncont++;
 361                         }
 362                         if (DUK_UNLIKELY(!(x & 0x00c00000UL))) {
 363                                 ncont++;
 364                         }
 365                         if (DUK_UNLIKELY(!(x & 0x0000c000UL))) {
 366                                 ncont++;
 367                         }
 368                         if (DUK_UNLIKELY(!(x & 0x000000c0UL))) {
 369                                 ncont++;
 370                         }
 371                 }
 372         }
 373         p = (const duk_uint8_t *) p32;
 374         /* Fall through to handle the rest. */
 375
 376  skip_fastpath:
 377         while (p != p_end) {
 378                 duk_uint8_t x;
 379                 x = *p++;
 380                 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
 381                         ncont++;
 382                 }
 383         }
 384
 385         DUK_ASSERT(ncont <= blen);
 386         clen = blen - ncont;
 387         DUK_ASSERT(clen <= blen);
 388         return clen;
 389 }
 390 #endif  /* DUK_USE_PREFER_SIZE */
 391
 392 /*
 393  *  Unicode range matcher
 394  *
 395  *  Matches a codepoint against a packed bitstream of character ranges.
 396  *  Used for slow path Unicode matching.
 397  */
 398
 399 /* Must match src/extract_chars.py, generate_match_table3(). */
 400 DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
 401         duk_uint32_t t;
 402
 403         t = (duk_uint32_t) duk_bd_decode(bd_ctx, 4);
 404         if (t <= 0x0eU) {
 405                 return t;
 406         }
 407         t = (duk_uint32_t) duk_bd_decode(bd_ctx, 8);
 408         if (t <= 0xfdU) {
 409                 return t + 0x0f;
 410         }
 411         if (t == 0xfeU) {
 412                 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 12);
 413                 return t + 0x0fU + 0xfeU;
 414         } else {
 415                 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 24);
 416                 return t + 0x0fU + 0xfeU + 0x1000UL;
 417         }
 418 }
 419
 420 DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) {
 421         duk_bitdecoder_ctx bd_ctx;
 422         duk_codepoint_t prev_re;
 423
 424         DUK_MEMZERO(&bd_ctx, sizeof(bd_ctx));
 425         bd_ctx.data = (const duk_uint8_t *) unitab;
 426         bd_ctx.length = (duk_size_t) unilen;
 427
 428         prev_re = 0;
 429         for (;;) {
 430                 duk_codepoint_t r1, r2;
 431                 r1 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
 432                 if (r1 == 0) {
 433                         break;
 434                 }
 435                 r2 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
 436
 437                 r1 = prev_re + r1;
 438                 r2 = r1 + r2;
 439                 prev_re = r2;
 440
 441                 /* [r1,r2] is the range */
 442
 443                 DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]",
 444                                      (unsigned long) cp, (unsigned long) r1, (unsigned long) r2));
 445                 if (cp >= r1 && cp <= r2) {
 446                         return 1;
 447                 }
 448         }
 449
 450         return 0;
 451 }
 452
 453 /*
 454  *  "WhiteSpace" production check.
 455  */
 456
 457 DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
 458         /*
 459          *  E5 Section 7.2 specifies six characters specifically as
 460          *  white space:
 461          *
 462          *    0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
 463          *    000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;;
 464          *    000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;;
 465          *    0020;SPACE;Zs;0;WS;;;;;N;;;;;
 466          *    00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
 467          *    FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
 468          *
 469          *  It also specifies any Unicode category 'Zs' characters as white
 470          *  space.  These can be extracted with the "src/extract_chars.py" script.
 471          *  Current result:
 472          *
 473          *    RAW OUTPUT:
 474          *    ===========
 475          *    0020;SPACE;Zs;0;WS;;;;;N;;;;;
 476          *    00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
 477          *    1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
 478          *    180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
 479          *    2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;;
 480          *    2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;;
 481          *    2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 482          *    2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 483          *    2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 484          *    2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 485          *    2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 486          *    2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
 487          *    2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 488          *    2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 489          *    200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 490          *    202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
 491          *    205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
 492          *    3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
 493          *
 494          *    RANGES:
 495          *    =======
 496          *    0x0020
 497          *    0x00a0
 498          *    0x1680
 499          *    0x180e
 500          *    0x2000 ... 0x200a
 501          *    0x202f
 502          *    0x205f
 503          *    0x3000
 504          *
 505          *  A manual decoder (below) is probably most compact for this.
 506          */
 507
 508         duk_uint_fast8_t lo;
 509         duk_uint_fast32_t hi;
 510
 511         /* cp == -1 (EOF) never matches and causes return value 0 */
 512
 513         lo = (duk_uint_fast8_t) (cp & 0xff);
 514         hi = (duk_uint_fast32_t) (cp >> 8);  /* does not fit into an uchar */
 515
 516         if (hi == 0x0000UL) {
 517                 if (lo == 0x09U || lo == 0x0bU || lo == 0x0cU ||
 518                     lo == 0x20U || lo == 0xa0U) {
 519                         return 1;
 520                 }
 521         } else if (hi == 0x0020UL) {
 522                 if (lo <= 0x0aU || lo == 0x2fU || lo == 0x5fU) {
 523                         return 1;
 524                 }
 525         } else if (cp == 0x1680L || cp == 0x180eL || cp == 0x3000L ||
 526                    cp == 0xfeffL) {
 527                 return 1;
 528         }
 529
 530         return 0;
 531 }
 532
 533 /*
 534  *  "LineTerminator" production check.
 535  */
 536
 537 DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
 538         /*
 539          *  E5 Section 7.3
 540          *
 541          *  A LineTerminatorSequence essentially merges <CR> <LF> sequences
 542          *  into a single line terminator.  This must be handled by the caller.
 543          */
 544
 545         if (cp == 0x000aL || cp == 0x000dL || cp == 0x2028L ||
 546             cp == 0x2029L) {
 547                 return 1;
 548         }
 549
 550         return 0;
 551 }
 552
 553 /*
 554  *  "IdentifierStart" production check.
 555  */
 556
 557 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
 558         /*
 559          *  E5 Section 7.6:
 560          *
 561          *    IdentifierStart:
 562          *      UnicodeLetter
 563          *      $
 564          *      _
 565          *      \ UnicodeEscapeSequence
 566          *
 567          *  IdentifierStart production has one multi-character production:
 568          *
 569          *    \ UnicodeEscapeSequence
 570          *
 571          *  The '\' character is -not- matched by this function.  Rather, the caller
 572          *  should decode the escape and then call this function to check whether the
 573          *  decoded character is acceptable (see discussion in E5 Section 7.6).
 574          *
 575          *  The "UnicodeLetter" alternative of the production allows letters
 576          *  from various Unicode categories.  These can be extracted with the
 577          *  "src/extract_chars.py" script.
 578          *
 579          *  Because the result has hundreds of Unicode codepoint ranges, matching
 580          *  for any values >= 0x80 are done using a very slow range-by-range scan
 581          *  and a packed range format.
 582          *
 583          *  The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because
 584          *  it matters the most.  The ASCII related ranges of IdentifierStart are:
 585          *
 586          *    0x0041 ... 0x005a     ['A' ... 'Z']
 587          *    0x0061 ... 0x007a     ['a' ... 'z']
 588          *    0x0024                ['$']
 589          *    0x005f                ['_']
 590          */
 591
 592         /* ASCII (and EOF) fast path -- quick accept and reject */
 593         if (cp <= 0x7fL) {
 594 #if defined(DUK_USE_IDCHAR_FASTPATH)
 595                 return (cp >= 0) && (duk_is_idchar_tab[cp] > 0);
 596 #else
 597                 if ((cp >= 'a' && cp <= 'z') ||
 598                     (cp >= 'A' && cp <= 'Z') ||
 599                     cp == '_' || cp == '$') {
 600                         return 1;
 601                 }
 602                 return 0;
 603 #endif
 604         }
 605
 606         /* Non-ASCII slow path (range-by-range linear comparison), very slow */
 607
 608 #ifdef DUK_USE_SOURCE_NONBMP
 609         if (duk__uni_range_match(duk_unicode_ids_noa,
 610                                  (duk_size_t) sizeof(duk_unicode_ids_noa),
 611                                  (duk_codepoint_t) cp)) {
 612                 return 1;
 613         }
 614         return 0;
 615 #else
 616         if (cp < 0x10000L) {
 617                 if (duk__uni_range_match(duk_unicode_ids_noabmp,
 618                                          sizeof(duk_unicode_ids_noabmp),
 619                                          (duk_codepoint_t) cp)) {
 620                         return 1;
 621                 }
 622                 return 0;
 623         } else {
 624                 /* without explicit non-BMP support, assume non-BMP characters
 625                  * are always accepted as identifier characters.
 626                  */
 627                 return 1;
 628         }
 629 #endif
 630 }
 631
 632 /*
 633  *  "IdentifierPart" production check.
 634  */
 635
 636 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
 637         /*
 638          *  E5 Section 7.6:
 639          *
 640          *    IdentifierPart:
 641          *      IdentifierStart
 642          *      UnicodeCombiningMark
 643          *      UnicodeDigit
 644          *      UnicodeConnectorPunctuation
 645          *      <ZWNJ>  [U+200C]
 646          *      <ZWJ>   [U+200D]
 647          *
 648          *  IdentifierPart production has one multi-character production
 649          *  as part of its IdentifierStart alternative.  The '\' character
 650          *  of an escape sequence is not matched here, see discussion in
 651          *  duk_unicode_is_identifier_start().
 652          *
 653          *  To match non-ASCII characters (codepoints >= 0x80), a very slow
 654          *  linear range-by-range scan is used.  The codepoint is first compared
 655          *  to the IdentifierStart ranges, and if it doesn't match, then to a
 656          *  set consisting of code points in IdentifierPart but not in
 657          *  IdentifierStart.  This is done to keep the unicode range data small,
 658          *  at the expense of speed.
 659          *
 660          *  The ASCII fast path consists of:
 661          *
 662          *    0x0030 ... 0x0039     ['0' ... '9', UnicodeDigit]
 663          *    0x0041 ... 0x005a     ['A' ... 'Z', IdentifierStart]
 664          *    0x0061 ... 0x007a     ['a' ... 'z', IdentifierStart]
 665          *    0x0024                ['$', IdentifierStart]
 666          *    0x005f                ['_', IdentifierStart and
 667          *                                UnicodeConnectorPunctuation]
 668          *
 669          *  UnicodeCombiningMark has no code points <= 0x7f.
 670          *
 671          *  The matching code reuses the "identifier start" tables, and then
 672          *  consults a separate range set for characters in "identifier part"
 673          *  but not in "identifier start".  These can be extracted with the
 674          *  "src/extract_chars.py" script.
 675          *
 676          *  UnicodeCombiningMark -> categories Mn, Mc
 677          *  UnicodeDigit -> categories Nd
 678          *  UnicodeConnectorPunctuation -> categories Pc
 679          */
 680
 681         /* ASCII (and EOF) fast path -- quick accept and reject */
 682         if (cp <= 0x7fL) {
 683 #if defined(DUK_USE_IDCHAR_FASTPATH)
 684                 return (cp >= 0) && (duk_is_idchar_tab[cp] != 0);
 685 #else
 686                 if ((cp >= 'a' && cp <= 'z') ||
 687                     (cp >= 'A' && cp <= 'Z') ||
 688                     (cp >= '0' && cp <= '9') ||
 689                     cp == '_' || cp == '$') {
 690                         return 1;
 691                 }
 692                 return 0;
 693 #endif
 694         }
 695
 696         /* Non-ASCII slow path (range-by-range linear comparison), very slow */
 697
 698 #ifdef DUK_USE_SOURCE_NONBMP
 699         if (duk__uni_range_match(duk_unicode_ids_noa,
 700                                  sizeof(duk_unicode_ids_noa),
 701                                  (duk_codepoint_t) cp) ||
 702             duk__uni_range_match(duk_unicode_idp_m_ids_noa,
 703                                  sizeof(duk_unicode_idp_m_ids_noa),
 704                                  (duk_codepoint_t) cp)) {
 705                 return 1;
 706         }
 707         return 0;
 708 #else
 709         if (cp < 0x10000L) {
 710                 if (duk__uni_range_match(duk_unicode_ids_noabmp,
 711                                          sizeof(duk_unicode_ids_noabmp),
 712                                          (duk_codepoint_t) cp) ||
 713                     duk__uni_range_match(duk_unicode_idp_m_ids_noabmp,
 714                                          sizeof(duk_unicode_idp_m_ids_noabmp),
 715                                          (duk_codepoint_t) cp)) {
 716                         return 1;
 717                 }
 718                 return 0;
 719         } else {
 720                 /* without explicit non-BMP support, assume non-BMP characters
 721                  * are always accepted as identifier characters.
 722                  */
 723                 return 1;
 724         }
 725 #endif
 726 }
 727
 728 /*
 729  *  Unicode letter check.
 730  */
 731
 732 DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {
 733         /*
 734          *  Unicode letter is now taken to be the categories:
 735          *
 736          *    Lu, Ll, Lt, Lm, Lo
 737          *
 738          *  (Not sure if this is exactly correct.)
 739          *
 740          *  The ASCII fast path consists of:
 741          *
 742          *    0x0041 ... 0x005a     ['A' ... 'Z']
 743          *    0x0061 ... 0x007a     ['a' ... 'z']
 744          */
 745
 746         /* ASCII (and EOF) fast path -- quick accept and reject */
 747         if (cp <= 0x7fL) {
 748                 if ((cp >= 'a' && cp <= 'z') ||
 749                     (cp >= 'A' && cp <= 'Z')) {
 750                         return 1;
 751                 }
 752                 return 0;
 753         }
 754
 755         /* Non-ASCII slow path (range-by-range linear comparison), very slow */
 756
 757 #ifdef DUK_USE_SOURCE_NONBMP
 758         if (duk__uni_range_match(duk_unicode_ids_noa,
 759                                  sizeof(duk_unicode_ids_noa),
 760                                  (duk_codepoint_t) cp) &&
 761             !duk__uni_range_match(duk_unicode_ids_m_let_noa,
 762                                   sizeof(duk_unicode_ids_m_let_noa),
 763                                   (duk_codepoint_t) cp)) {
 764                 return 1;
 765         }
 766         return 0;
 767 #else
 768         if (cp < 0x10000L) {
 769                 if (duk__uni_range_match(duk_unicode_ids_noabmp,
 770                                          sizeof(duk_unicode_ids_noabmp),
 771                                          (duk_codepoint_t) cp) &&
 772                     !duk__uni_range_match(duk_unicode_ids_m_let_noabmp,
 773                                           sizeof(duk_unicode_ids_m_let_noabmp),
 774                                           (duk_codepoint_t) cp)) {
 775                         return 1;
 776                 }
 777                 return 0;
 778         } else {
 779                 /* without explicit non-BMP support, assume non-BMP characters
 780                  * are always accepted as letters.
 781                  */
 782                 return 1;
 783         }
 784 #endif
 785 }
 786
 787 /*
 788  *  Complex case conversion helper which decodes a bit-packed conversion
 789  *  control stream generated by unicode/extract_caseconv.py.  The conversion
 790  *  is very slow because it runs through the conversion data in a linear
 791  *  fashion to save space (which is why ASCII characters have a special
 792  *  fast path before arriving here).
 793  *
 794  *  The particular bit counts etc have been determined experimentally to
 795  *  be small but still sufficient, and must match the Python script
 796  *  (src/extract_caseconv.py).
 797  *
 798  *  The return value is the case converted codepoint or -1 if the conversion
 799  *  results in multiple characters (this is useful for regexp Canonicalization
 800  *  operation).  If 'buf' is not NULL, the result codepoint(s) are also
 801  *  appended to the hbuffer.
 802  *
 803  *  Context and locale specific rules must be checked before consulting
 804  *  this function.
 805  */
 806
 807 DUK_LOCAL
 808 duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
 809                                           duk_bufwriter_ctx *bw,
 810                                           duk_codepoint_t cp,
 811                                           duk_bitdecoder_ctx *bd_ctx) {
 812         duk_small_int_t skip = 0;
 813         duk_small_int_t n;
 814         duk_small_int_t t;
 815         duk_small_int_t count;
 816         duk_codepoint_t tmp_cp;
 817         duk_codepoint_t start_i;
 818         duk_codepoint_t start_o;
 819
 820         DUK_UNREF(thr);
 821         DUK_ASSERT(bd_ctx != NULL);
 822
 823         DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));
 824
 825         /* range conversion with a "skip" */
 826         DUK_DDD(DUK_DDDPRINT("checking ranges"));
 827         for (;;) {
 828                 skip++;
 829                 n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
 830                 if (n == 0x3f) {
 831                         /* end marker */
 832                         break;
 833                 }
 834                 DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip, (long) n));
 835
 836                 while (n--) {
 837                         start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 838                         start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 839                         count = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
 840                         DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
 841                                              (long) start_i, (long) start_o, (long) count, (long) skip));
 842
 843                         if (cp >= start_i) {
 844                                 tmp_cp = cp - start_i;  /* always >= 0 */
 845                                 if (tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip &&
 846                                     (tmp_cp % (duk_codepoint_t) skip) == 0) {
 847                                         DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
 848                                         cp = start_o + tmp_cp;
 849                                         goto single;
 850                                 }
 851                         }
 852                 }
 853         }
 854
 855         /* 1:1 conversion */
 856         n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
 857         DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n));
 858         while (n--) {
 859                 start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 860                 start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 861                 DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i, (long) start_o));
 862                 if (cp == start_i) {
 863                         DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
 864                         cp = start_o;
 865                         goto single;
 866                 }
 867         }
 868
 869         /* complex, multicharacter conversion */
 870         n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
 871         DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n));
 872         while (n--) {
 873                 start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 874                 t = (duk_small_int_t) duk_bd_decode(bd_ctx, 2);
 875                 DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
 876                 if (cp == start_i) {
 877                         DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
 878                         if (bw != NULL) {
 879                                 while (t--) {
 880                                         tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
 881                                         DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
 882                                 }
 883                         }
 884                         return -1;
 885                 } else {
 886                         while (t--) {
 887                                 (void) duk_bd_decode(bd_ctx, 16);
 888                         }
 889                 }
 890         }
 891
 892         /* default: no change */
 893         DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
 894         /* fall through */
 895
 896  single:
 897         if (bw != NULL) {
 898                 DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
 899         }
 900         return cp;
 901 }
 902
 903 /*
 904  *  Case conversion helper, with context/local sensitivity.
 905  *  For proper case conversion, one needs to know the character
 906  *  and the preceding and following characters, as well as
 907  *  locale/language.
 908  */
 909
 910 /* XXX: add 'language' argument when locale/language sensitive rule
 911  * support added.
 912  */
 913 DUK_LOCAL
 914 duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
 915                                            duk_bufwriter_ctx *bw,
 916                                            duk_codepoint_t cp,
 917                                            duk_codepoint_t prev,
 918                                            duk_codepoint_t next,
 919                                            duk_bool_t uppercase) {
 920         duk_bitdecoder_ctx bd_ctx;
 921
 922         /* fast path for ASCII */
 923         if (cp < 0x80L) {
 924                 /* XXX: there are language sensitive rules for the ASCII range.
 925                  * If/when language/locale support is implemented, they need to
 926                  * be implemented here for the fast path.  There are no context
 927                  * sensitive rules for ASCII range.
 928                  */
 929
 930                 if (uppercase) {
 931                         if (cp >= 'a' && cp <= 'z') {
 932                                 cp = cp - 'a' + 'A';
 933                         }
 934                 } else {
 935                         if (cp >= 'A' && cp <= 'Z') {
 936                                 cp = cp - 'A' + 'a';
 937                         }
 938                 }
 939
 940                 if (bw != NULL) {
 941                         DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
 942                 }
 943                 return cp;
 944         }
 945
 946         /* context and locale specific rules which cannot currently be represented
 947          * in the caseconv bitstream: hardcoded rules in C
 948          */
 949         if (uppercase) {
 950                 /* XXX: turkish / azeri */
 951         } else {
 952                 /*
 953                  *  Final sigma context specific rule.  This is a rather tricky
 954                  *  rule and this handling is probably not 100% correct now.
 955                  *  The rule is not locale/language specific so it is supported.
 956                  */
 957
 958                 if (cp == 0x03a3L &&    /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
 959                     duk_unicode_is_letter(prev) &&        /* prev exists and is not a letter */
 960                     !duk_unicode_is_letter(next)) {       /* next does not exist or next is not a letter */
 961                         /* Capital sigma occurred at "end of word", lowercase to
 962                          * U+03C2 = GREEK SMALL LETTER FINAL SIGMA.  Otherwise
 963                          * fall through and let the normal rules lowercase it to
 964                          * U+03C3 = GREEK SMALL LETTER SIGMA.
 965                          */
 966                         cp = 0x03c2L;
 967                         goto singlechar;
 968                 }
 969
 970                 /* XXX: lithuanian not implemented */
 971                 /* XXX: lithuanian, explicit dot rules */
 972                 /* XXX: turkish / azeri, lowercase rules */
 973         }
 974
 975         /* 1:1 or special conversions, but not locale/context specific: script generated rules */
 976         DUK_MEMZERO(&bd_ctx, sizeof(bd_ctx));
 977         if (uppercase) {
 978                 bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_uc;
 979                 bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_uc);
 980         } else {
 981                 bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_lc;
 982                 bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
 983         }
 984         return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);
 985
 986  singlechar:
 987         if (bw != NULL) {
 988                 DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
 989         }
 990         return cp;
 991
 992  /* unused now, not needed until Turkish/Azeri */
 993 #if 0
 994  nochar:
 995         return -1;
 996 #endif
 997 }
 998
 999 /*
1000  *  Replace valstack top with case converted version.
1001  */
1002
1003 DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) {
1004         duk_context *ctx = (duk_context *) thr;
1005         duk_hstring *h_input;
1006         duk_bufwriter_ctx bw_alloc;
1007         duk_bufwriter_ctx *bw;
1008         const duk_uint8_t *p, *p_start, *p_end;
1009         duk_codepoint_t prev, curr, next;
1010
1011         h_input = duk_require_hstring(ctx, -1);
1012         DUK_ASSERT(h_input != NULL);
1013
1014         bw = &bw_alloc;
1015         DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));
1016
1017         /* [ ... input buffer ] */
1018
1019         p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
1020         p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
1021         p = p_start;
1022
1023         prev = -1; DUK_UNREF(prev);
1024         curr = -1;
1025         next = -1;
1026         for (;;) {
1027                 prev = curr;
1028                 curr = next;
1029                 next = -1;
1030                 if (p < p_end) {
1031                         next = (int) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
1032                 } else {
1033                         /* end of input and last char has been processed */
1034                         if (curr < 0) {
1035                                 break;
1036                         }
1037                 }
1038
1039                 /* on first round, skip */
1040                 if (curr >= 0) {
1041                         /* XXX: could add a fast path to process chunks of input codepoints,
1042                          * but relative benefit would be quite small.
1043                          */
1044
1045                         /* Ensure space for maximum multi-character result; estimate is overkill. */
1046                         DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);
1047
1048                         duk__case_transform_helper(thr,
1049                                                    bw,
1050                                                    (duk_codepoint_t) curr,
1051                                                    prev,
1052                                                    next,
1053                                                    uppercase);
1054                 }
1055         }
1056
1057         DUK_BW_COMPACT(thr, bw);
1058         duk_to_string(ctx, -1);  /* invalidates h_buf pointer */
1059         duk_remove(ctx, -2);
1060 }
1061
1062 #ifdef DUK_USE_REGEXP_SUPPORT
1063
1064 /*
1065  *  Canonicalize() abstract operation needed for canonicalization of individual
1066  *  codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
1067  *  Note that codepoints are canonicalized one character at a time, so no context
1068  *  specific rules can apply.  Locale specific rules can apply, though.
1069  */
1070
1071 DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
1072 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
1073         /* Fast canonicalization lookup at the cost of 128kB footprint. */
1074         DUK_ASSERT(cp >= 0);
1075         DUK_UNREF(thr);
1076         if (DUK_LIKELY(cp < 0x10000L)) {
1077                 return (duk_codepoint_t) duk_unicode_re_canon_lookup[cp];
1078         }
1079         return cp;
1080 #else  /* DUK_USE_REGEXP_CANON_WORKAROUND */
1081         duk_codepoint_t y;
1082
1083         y = duk__case_transform_helper(thr,
1084                                        NULL,    /* NULL is allowed, no output */
1085                                        cp,      /* curr char */
1086                                        -1,      /* prev char */
1087                                        -1,      /* next char */
1088                                        1);      /* uppercase */
1089
1090         if ((y < 0) || (cp >= 0x80 && y < 0x80)) {
1091                 /* multiple codepoint conversion or non-ASCII mapped to ASCII
1092                  * --> leave as is.
1093                  */
1094                 return cp;
1095         }
1096
1097         return y;
1098 #endif  /* DUK_USE_REGEXP_CANON_WORKAROUND */
1099 }
1100
1101 /*
1102  *  E5 Section 15.10.2.6 "IsWordChar" abstract operation.  Assume
1103  *  x < 0 for characters read outside the string.
1104  */
1105
1106 DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
1107         /*
1108          *  Note: the description in E5 Section 15.10.2.6 has a typo, it
1109          *  contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
1110          */
1111         if ((x >= '0' && x <= '9') ||
1112             (x >= 'a' && x <= 'z') ||
1113             (x >= 'A' && x <= 'Z') ||
1114             (x == '_')) {
1115                 return 1;
1116         }
1117         return 0;
1118 }
1119
1120 /*
1121  *  Regexp range tables
1122  */
1123
/* exposed because lexer needs these too */

/* Each table below is a flat list of inclusive [start, end] codepoint
 * pairs in ascending order, so the array length is twice the number of
 * ranges.
 */

/* Digits: a single range, 0x30...0x39 ('0'...'9'). */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = {
        (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
};
/* White space: 11 inclusive [start, end] codepoint pairs.
 * NOTE(review): presumably the E5 WhiteSpace and LineTerminator
 * characters as used for regexp \s -- confirm against the table generator.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = {
        (duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL,
        (duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL,
        (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL,
        (duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL,
        (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL,
        (duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL,
        (duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL,
        (duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL,
        (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL,
        (duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL,
        (duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL,
};
/* Word characters as inclusive [start, end] pairs: 0x30...0x39 ('0'...'9'),
 * 0x41...0x5a ('A'...'Z'), 0x5f ('_') and 0x61...0x7a ('a'...'z'), i.e. the
 * same set accepted by duk_unicode_re_is_wordchar().
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
        (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
        (duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL,
        (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL,
        (duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL,
};
/* Complement of duk_unicode_re_ranges_digit within 0x0000...0xffff,
 * as inclusive [start, end] pairs.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
        (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
        (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL,
};
/* Complement of duk_unicode_re_ranges_white within 0x0000...0xffff,
 * as inclusive [start, end] pairs: each range fills the gap between two
 * consecutive white space ranges.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
        (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL,
        (duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL,
        (duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL,
        (duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL,
        (duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL,
        (duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL,
        (duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL,
        (duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL,
        (duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL,
        (duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL,
        (duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL,
        (duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL,
};
/* Complement of duk_unicode_re_ranges_wordchar within 0x0000...0xffff,
 * as inclusive [start, end] pairs.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
        (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
        (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL,
        (duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL,
        (duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL,
        (duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL,
};
1172
1173 #endif  /* DUK_USE_REGEXP_SUPPORT */