2 * Various Unicode help functions for character classification predicates,
3 * case conversion, decoding, etc.
6 #include "duk_internal.h"
12 #if defined(DUK_USE_IDCHAR_FASTPATH)
13 DUK_INTERNAL
const duk_int8_t duk_is_idchar_tab
[128] = {
14 /* 0: not IdentifierStart or IdentifierPart
15 * 1: IdentifierStart and IdentifierPart
16 * -1: IdentifierPart only
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00...0x0f */
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10...0x1f */
20 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20...0x2f */
21 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, /* 0x30...0x3f */
22 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40...0x4f */
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x50...0x5f */
24 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60...0x6f */
25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 /* 0x70...0x7f */
30 * XUTF-8 and CESU-8 encoding/decoding
33 DUK_INTERNAL duk_small_int_t
duk_unicode_get_xutf8_length(duk_ucodepoint_t cp
) {
34 duk_uint_fast32_t x
= (duk_uint_fast32_t
) cp
;
38 } else if (x
< 0x800UL
) {
41 } else if (x
< 0x10000UL
) {
44 } else if (x
< 0x200000UL
) {
47 } else if (x
< 0x4000000UL
) {
50 } else if (x
< (duk_ucodepoint_t
) 0x80000000UL
) {
59 #if defined(DUK_USE_ASSERTIONS)
60 DUK_INTERNAL duk_small_int_t
duk_unicode_get_cesu8_length(duk_ucodepoint_t cp
) {
61 duk_uint_fast32_t x
= (duk_uint_fast32_t
) cp
;
65 } else if (x
< 0x800UL
) {
68 } else if (x
< 0x10000UL
) {
72 /* Encoded as surrogate pair, each encoding to 3 bytes for
73 * 6 bytes total. Codepoints above U+10FFFF encode as 6 bytes
74 * too, see duk_unicode_encode_cesu8().
79 #endif /* DUK_USE_ASSERTIONS */
81 DUK_INTERNAL
const duk_uint8_t duk_unicode_xutf8_markers
[7] = {
82 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
85 /* Encode to extended UTF-8; 'out' must have space for at least
86 * DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any
87 * 32-bit (unsigned) codepoint.
89 DUK_INTERNAL duk_small_int_t
duk_unicode_encode_xutf8(duk_ucodepoint_t cp
, duk_uint8_t
*out
) {
90 duk_uint_fast32_t x
= (duk_uint_fast32_t
) cp
;
95 len
= duk_unicode_get_xutf8_length(cp
);
98 marker
= duk_unicode_xutf8_markers
[len
- 1]; /* 64-bit OK because always >= 0 */
105 out
[i
] = (duk_uint8_t
) (0x80 + (x
& 0x3f));
108 /* Note: masking of 'x' is not necessary because of
109 * range check and shifting -> no bits overlapping
110 * the marker should be set.
112 out
[0] = (duk_uint8_t
) (marker
+ x
);
119 /* Encode to CESU-8; 'out' must have space for at least
120 * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
121 * will encode to garbage but won't overwrite the output buffer.
123 DUK_INTERNAL duk_small_int_t
duk_unicode_encode_cesu8(duk_ucodepoint_t cp
, duk_uint8_t
*out
) {
124 duk_uint_fast32_t x
= (duk_uint_fast32_t
) cp
;
128 out
[0] = (duk_uint8_t
) x
;
130 } else if (x
< 0x800UL
) {
131 out
[0] = (duk_uint8_t
) (0xc0 + ((x
>> 6) & 0x1f));
132 out
[1] = (duk_uint8_t
) (0x80 + (x
& 0x3f));
134 } else if (x
< 0x10000UL
) {
135 /* surrogate pairs get encoded here */
136 out
[0] = (duk_uint8_t
) (0xe0 + ((x
>> 12) & 0x0f));
137 out
[1] = (duk_uint8_t
) (0x80 + ((x
>> 6) & 0x3f));
138 out
[2] = (duk_uint8_t
) (0x80 + (x
& 0x3f));
142 * Unicode codepoints above U+FFFF are encoded as surrogate
143 * pairs here. This ensures that all CESU-8 codepoints are
144 * 16-bit values as expected in Ecmascript. The surrogate
145 * pairs always get a 3-byte encoding (each) in CESU-8.
146 * See: http://en.wikipedia.org/wiki/Surrogate_pair
148 * 20-bit codepoint, 10 bits (A and B) per surrogate pair:
150 * x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
151 * sp1 = 0b110110AA AAAAAAAA (0xd800 + ((x >> 10) & 0x3ff))
152 * sp2 = 0b110111BB BBBBBBBB (0xdc00 + (x & 0x3ff))
154 * Encoded into CESU-8:
156 * sp1 -> 0b11101101 (0xe0 + ((sp1 >> 12) & 0x0f))
157 * -> 0b1010AAAA (0x80 + ((sp1 >> 6) & 0x3f))
158 * -> 0b10AAAAAA (0x80 + (sp1 & 0x3f))
159 * sp2 -> 0b11101101 (0xe0 + ((sp2 >> 12) & 0x0f))
160 * -> 0b1011BBBB (0x80 + ((sp2 >> 6) & 0x3f))
161 * -> 0b10BBBBBB (0x80 + (sp2 & 0x3f))
163 * Note that 0x10000 must be subtracted first. The code below
164 * avoids the sp1, sp2 temporaries which saves around 20 bytes
170 out
[0] = (duk_uint8_t
) (0xed);
171 out
[1] = (duk_uint8_t
) (0xa0 + ((x
>> 16) & 0x0f));
172 out
[2] = (duk_uint8_t
) (0x80 + ((x
>> 10) & 0x3f));
173 out
[3] = (duk_uint8_t
) (0xed);
174 out
[4] = (duk_uint8_t
) (0xb0 + ((x
>> 6) & 0x0f));
175 out
[5] = (duk_uint8_t
) (0x80 + (x
& 0x3f));
182 /* Decode helper. Return zero on error. */
183 DUK_INTERNAL duk_small_int_t
duk_unicode_decode_xutf8(duk_hthread
*thr
, const duk_uint8_t
**ptr
, const duk_uint8_t
*ptr_start
, const duk_uint8_t
*ptr_end
, duk_ucodepoint_t
*out_cp
) {
184 const duk_uint8_t
*p
;
192 if (p
< ptr_start
|| p
>= ptr_end
) {
197 * UTF-8 decoder which accepts longer than standard byte sequences.
198 * This allows full 32-bit code points to be used.
201 ch
= (duk_uint_fast8_t
) (*p
++);
203 /* 0xxx xxxx [7 bits] */
204 res
= (duk_uint32_t
) (ch
& 0x7f);
206 } else if (ch
< 0xc0) {
207 /* 10xx xxxx -> invalid */
209 } else if (ch
< 0xe0) {
210 /* 110x xxxx 10xx xxxx [11 bits] */
211 res
= (duk_uint32_t
) (ch
& 0x1f);
213 } else if (ch
< 0xf0) {
214 /* 1110 xxxx 10xx xxxx 10xx xxxx [16 bits] */
215 res
= (duk_uint32_t
) (ch
& 0x0f);
217 } else if (ch
< 0xf8) {
218 /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx [21 bits] */
219 res
= (duk_uint32_t
) (ch
& 0x07);
221 } else if (ch
< 0xfc) {
222 /* 1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [26 bits] */
223 res
= (duk_uint32_t
) (ch
& 0x03);
225 } else if (ch
< 0xfe) {
226 /* 1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [31 bits] */
227 res
= (duk_uint32_t
) (ch
& 0x01);
229 } else if (ch
< 0xff) {
230 /* 1111 1110 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [36 bits] */
231 res
= (duk_uint32_t
) (0);
234 /* 8-byte format could be:
235 * 1111 1111 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [41 bits]
237 * However, this format would not have a zero bit following the
238 * leading one bits and would not allow 0xFF to be used as an
239 * "invalid xutf-8" marker for internal keys. Further, 8-byte
240 * encodings (up to 41 bit code points) are not currently needed.
245 DUK_ASSERT(p
>= ptr_start
); /* verified at beginning */
246 if (p
+ n
> ptr_end
) {
247 /* check pointer at end */
252 DUK_ASSERT(p
>= ptr_start
&& p
< ptr_end
);
254 res
+= (duk_uint32_t
) ((*p
++) & 0x3f);
266 /* used by e.g. duk_regexp_executor.c, string built-ins */
267 DUK_INTERNAL duk_ucodepoint_t
duk_unicode_decode_xutf8_checked(duk_hthread
*thr
, const duk_uint8_t
**ptr
, const duk_uint8_t
*ptr_start
, const duk_uint8_t
*ptr_end
) {
270 if (duk_unicode_decode_xutf8(thr
, ptr
, ptr_start
, ptr_end
, &cp
)) {
273 DUK_ERROR_INTERNAL(thr
, "utf-8 decode failed"); /* XXX: 'internal error' is a bit of a misnomer */
278 /* Compute (extended) utf-8 length without codepoint encoding validation,
279 * used for string interning.
281 * NOTE: This algorithm is performance critical, more so than string hashing
282 * in some cases. It is needed when interning a string and needs to scan
283 * every byte of the string with no skipping. Having an ASCII fast path
284 * is useful if possible in the algorithm. The current algorithms were
285 * chosen from several variants, based on x64 gcc -O2 testing. See:
286 * https://github.com/svaarala/duktape/pull/422
288 * NOTE: must match src/dukutil.py:duk_unicode_unvalidated_utf8_length().
291 #if defined(DUK_USE_PREFER_SIZE)
292 /* Small variant; roughly 150 bytes smaller than the fast variant. */
293 DUK_INTERNAL duk_size_t
duk_unicode_unvalidated_utf8_length(const duk_uint8_t
*data
, duk_size_t blen
) {
294 const duk_uint8_t
*p
;
295 const duk_uint8_t
*p_end
;
305 if (DUK_UNLIKELY(x
>= 0x80 && x
<= 0xbf)) {
310 DUK_ASSERT(ncont
<= blen
);
312 DUK_ASSERT(clen
<= blen
);
315 #else /* DUK_USE_PREFER_SIZE */
316 /* This seems like a good overall approach. Fast path for ASCII in 4 byte
319 DUK_INTERNAL duk_size_t
duk_unicode_unvalidated_utf8_length(const duk_uint8_t
*data
, duk_size_t blen
) {
320 const duk_uint8_t
*p
;
321 const duk_uint8_t
*p_end
;
322 const duk_uint32_t
*p32_end
;
323 const duk_uint32_t
*p32
;
327 ncont
= 0; /* number of continuation (non-initial) bytes in [0x80,0xbf] */
334 /* Align 'p' to 4; the input data may have arbitrary alignment.
335 * End of string check not needed because blen >= 16.
337 while (((duk_size_t
) (const void *) p
) & 0x03U
) {
340 if (DUK_UNLIKELY(x
>= 0x80 && x
<= 0xbf)) {
345 /* Full, aligned 4-byte reads. */
346 p32_end
= (const duk_uint32_t
*) (const void *) (p
+ ((duk_size_t
) (p_end
- p
) & (duk_size_t
) (~0x03)));
347 p32
= (const duk_uint32_t
*) (const void *) p
;
348 while (p32
!= (const duk_uint32_t
*) p32_end
) {
351 if (DUK_LIKELY((x
& 0x80808080UL
) == 0)) {
352 ; /* ASCII fast path */
354 /* Flip highest bit of each byte which changes
355 * the bit pattern 10xxxxxx into 00xxxxxx which
356 * allows an easy bit mask test.
359 if (DUK_UNLIKELY(!(x
& 0xc0000000UL
))) {
362 if (DUK_UNLIKELY(!(x
& 0x00c00000UL
))) {
365 if (DUK_UNLIKELY(!(x
& 0x0000c000UL
))) {
368 if (DUK_UNLIKELY(!(x
& 0x000000c0UL
))) {
373 p
= (const duk_uint8_t
*) p32
;
374 /* Fall through to handle the rest. */
380 if (DUK_UNLIKELY(x
>= 0x80 && x
<= 0xbf)) {
385 DUK_ASSERT(ncont
<= blen
);
387 DUK_ASSERT(clen
<= blen
);
390 #endif /* DUK_USE_PREFER_SIZE */
393 * Unicode range matcher
395 * Matches a codepoint against a packed bitstream of character ranges.
396 * Used for slow path Unicode matching.
399 /* Must match src/extract_chars.py, generate_match_table3(). */
400 DUK_LOCAL duk_uint32_t
duk__uni_decode_value(duk_bitdecoder_ctx
*bd_ctx
) {
403 t
= (duk_uint32_t
) duk_bd_decode(bd_ctx
, 4);
407 t
= (duk_uint32_t
) duk_bd_decode(bd_ctx
, 8);
412 t
= (duk_uint32_t
) duk_bd_decode(bd_ctx
, 12);
413 return t
+ 0x0fU
+ 0xfeU
;
415 t
= (duk_uint32_t
) duk_bd_decode(bd_ctx
, 24);
416 return t
+ 0x0fU
+ 0xfeU
+ 0x1000UL
;
420 DUK_LOCAL duk_small_int_t
duk__uni_range_match(const duk_uint8_t
*unitab
, duk_size_t unilen
, duk_codepoint_t cp
) {
421 duk_bitdecoder_ctx bd_ctx
;
422 duk_codepoint_t prev_re
;
424 DUK_MEMZERO(&bd_ctx
, sizeof(bd_ctx
));
425 bd_ctx
.data
= (const duk_uint8_t
*) unitab
;
426 bd_ctx
.length
= (duk_size_t
) unilen
;
430 duk_codepoint_t r1
, r2
;
431 r1
= (duk_codepoint_t
) duk__uni_decode_value(&bd_ctx
);
435 r2
= (duk_codepoint_t
) duk__uni_decode_value(&bd_ctx
);
441 /* [r1,r2] is the range */
443 DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]",
444 (unsigned long) cp
, (unsigned long) r1
, (unsigned long) r2
));
445 if (cp
>= r1
&& cp
<= r2
) {
454 * "WhiteSpace" production check.
457 DUK_INTERNAL duk_small_int_t
duk_unicode_is_whitespace(duk_codepoint_t cp
) {
459 * E5 Section 7.2 specifies six characters specifically as
462 * 0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
463 * 000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;;
464 * 000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;;
465 * 0020;SPACE;Zs;0;WS;;;;;N;;;;;
466 * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
467 * FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
469 * It also specifies any Unicode category 'Zs' characters as white
470 * space. These can be extracted with the "src/extract_chars.py" script.
475 * 0020;SPACE;Zs;0;WS;;;;;N;;;;;
476 * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
477 * 1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
478 * 180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
479 * 2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;;
480 * 2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;;
481 * 2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
482 * 2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
483 * 2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
484 * 2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
485 * 2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
486 * 2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
487 * 2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
488 * 2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
489 * 200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
490 * 202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
491 * 205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
492 * 3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
505 * A manual decoder (below) is probably most compact for this.
509 duk_uint_fast32_t hi
;
511 /* cp == -1 (EOF) never matches and causes return value 0 */
513 lo
= (duk_uint_fast8_t
) (cp
& 0xff);
514 hi
= (duk_uint_fast32_t
) (cp
>> 8); /* does not fit into an uchar */
516 if (hi
== 0x0000UL
) {
517 if (lo
== 0x09U
|| lo
== 0x0bU
|| lo
== 0x0cU
||
518 lo
== 0x20U
|| lo
== 0xa0U
) {
521 } else if (hi
== 0x0020UL
) {
522 if (lo
<= 0x0aU
|| lo
== 0x2fU
|| lo
== 0x5fU
) {
525 } else if (cp
== 0x1680L
|| cp
== 0x180eL
|| cp
== 0x3000L
||
534 * "LineTerminator" production check.
537 DUK_INTERNAL duk_small_int_t
duk_unicode_is_line_terminator(duk_codepoint_t cp
) {
541 * A LineTerminatorSequence essentially merges <CR> <LF> sequences
542 * into a single line terminator. This must be handled by the caller.
545 if (cp
== 0x000aL
|| cp
== 0x000dL
|| cp
== 0x2028L
||
554 * "IdentifierStart" production check.
557 DUK_INTERNAL duk_small_int_t
duk_unicode_is_identifier_start(duk_codepoint_t cp
) {
565 * \ UnicodeEscapeSequence
567 * IdentifierStart production has one multi-character production:
569 * \ UnicodeEscapeSequence
571 * The '\' character is -not- matched by this function. Rather, the caller
572 * should decode the escape and then call this function to check whether the
573 * decoded character is acceptable (see discussion in E5 Section 7.6).
575 * The "UnicodeLetter" alternative of the production allows letters
576 * from various Unicode categories. These can be extracted with the
577 * "src/extract_chars.py" script.
579 * Because the result has hundreds of Unicode codepoint ranges, matching
580 * for any values >= 0x80 is done using a very slow range-by-range scan
581 * and a packed range format.
583 * The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because
584 * it matters the most. The ASCII related ranges of IdentifierStart are:
586 * 0x0041 ... 0x005a ['A' ... 'Z']
587 * 0x0061 ... 0x007a ['a' ... 'z']
592 /* ASCII (and EOF) fast path -- quick accept and reject */
594 #if defined(DUK_USE_IDCHAR_FASTPATH)
595 return (cp
>= 0) && (duk_is_idchar_tab
[cp
] > 0);
597 if ((cp
>= 'a' && cp
<= 'z') ||
598 (cp
>= 'A' && cp
<= 'Z') ||
599 cp
== '_' || cp
== '$') {
606 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
608 #ifdef DUK_USE_SOURCE_NONBMP
609 if (duk__uni_range_match(duk_unicode_ids_noa
,
610 (duk_size_t
) sizeof(duk_unicode_ids_noa
),
611 (duk_codepoint_t
) cp
)) {
617 if (duk__uni_range_match(duk_unicode_ids_noabmp
,
618 sizeof(duk_unicode_ids_noabmp
),
619 (duk_codepoint_t
) cp
)) {
624 /* without explicit non-BMP support, assume non-BMP characters
625 * are always accepted as identifier characters.
633 * "IdentifierPart" production check.
636 DUK_INTERNAL duk_small_int_t
duk_unicode_is_identifier_part(duk_codepoint_t cp
) {
642 * UnicodeCombiningMark
644 * UnicodeConnectorPunctuation
648 * IdentifierPart production has one multi-character production
649 * as part of its IdentifierStart alternative. The '\' character
650 * of an escape sequence is not matched here, see discussion in
651 * duk_unicode_is_identifier_start().
653 * To match non-ASCII characters (codepoints >= 0x80), a very slow
654 * linear range-by-range scan is used. The codepoint is first compared
655 * to the IdentifierStart ranges, and if it doesn't match, then to a
656 * set consisting of code points in IdentifierPart but not in
657 * IdentifierStart. This is done to keep the unicode range data small,
658 * at the expense of speed.
660 * The ASCII fast path consists of:
662 * 0x0030 ... 0x0039 ['0' ... '9', UnicodeDigit]
663 * 0x0041 ... 0x005a ['A' ... 'Z', IdentifierStart]
664 * 0x0061 ... 0x007a ['a' ... 'z', IdentifierStart]
665 * 0x0024 ['$', IdentifierStart]
666 * 0x005f ['_', IdentifierStart and
667 * UnicodeConnectorPunctuation]
669 * UnicodeCombiningMark has no code points <= 0x7f.
671 * The matching code reuses the "identifier start" tables, and then
672 * consults a separate range set for characters in "identifier part"
673 * but not in "identifier start". These can be extracted with the
674 * "src/extract_chars.py" script.
676 * UnicodeCombiningMark -> categories Mn, Mc
677 * UnicodeDigit -> categories Nd
678 * UnicodeConnectorPunctuation -> categories Pc
681 /* ASCII (and EOF) fast path -- quick accept and reject */
683 #if defined(DUK_USE_IDCHAR_FASTPATH)
684 return (cp
>= 0) && (duk_is_idchar_tab
[cp
] != 0);
686 if ((cp
>= 'a' && cp
<= 'z') ||
687 (cp
>= 'A' && cp
<= 'Z') ||
688 (cp
>= '0' && cp
<= '9') ||
689 cp
== '_' || cp
== '$') {
696 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
698 #ifdef DUK_USE_SOURCE_NONBMP
699 if (duk__uni_range_match(duk_unicode_ids_noa
,
700 sizeof(duk_unicode_ids_noa
),
701 (duk_codepoint_t
) cp
) ||
702 duk__uni_range_match(duk_unicode_idp_m_ids_noa
,
703 sizeof(duk_unicode_idp_m_ids_noa
),
704 (duk_codepoint_t
) cp
)) {
710 if (duk__uni_range_match(duk_unicode_ids_noabmp
,
711 sizeof(duk_unicode_ids_noabmp
),
712 (duk_codepoint_t
) cp
) ||
713 duk__uni_range_match(duk_unicode_idp_m_ids_noabmp
,
714 sizeof(duk_unicode_idp_m_ids_noabmp
),
715 (duk_codepoint_t
) cp
)) {
720 /* without explicit non-BMP support, assume non-BMP characters
721 * are always accepted as identifier characters.
729 * Unicode letter check.
732 DUK_INTERNAL duk_small_int_t
duk_unicode_is_letter(duk_codepoint_t cp
) {
734 * Unicode letter is now taken to be the categories:
738 * (Not sure if this is exactly correct.)
740 * The ASCII fast path consists of:
742 * 0x0041 ... 0x005a ['A' ... 'Z']
743 * 0x0061 ... 0x007a ['a' ... 'z']
746 /* ASCII (and EOF) fast path -- quick accept and reject */
748 if ((cp
>= 'a' && cp
<= 'z') ||
749 (cp
>= 'A' && cp
<= 'Z')) {
755 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
757 #ifdef DUK_USE_SOURCE_NONBMP
758 if (duk__uni_range_match(duk_unicode_ids_noa
,
759 sizeof(duk_unicode_ids_noa
),
760 (duk_codepoint_t
) cp
) &&
761 !duk__uni_range_match(duk_unicode_ids_m_let_noa
,
762 sizeof(duk_unicode_ids_m_let_noa
),
763 (duk_codepoint_t
) cp
)) {
769 if (duk__uni_range_match(duk_unicode_ids_noabmp
,
770 sizeof(duk_unicode_ids_noabmp
),
771 (duk_codepoint_t
) cp
) &&
772 !duk__uni_range_match(duk_unicode_ids_m_let_noabmp
,
773 sizeof(duk_unicode_ids_m_let_noabmp
),
774 (duk_codepoint_t
) cp
)) {
779 /* without explicit non-BMP support, assume non-BMP characters
780 * are always accepted as letters.
788 * Complex case conversion helper which decodes a bit-packed conversion
789 * control stream generated by src/extract_caseconv.py. The conversion
790 * is very slow because it runs through the conversion data in a linear
791 * fashion to save space (which is why ASCII characters have a special
792 * fast path before arriving here).
794 * The particular bit counts etc have been determined experimentally to
795 * be small but still sufficient, and must match the Python script
796 * (src/extract_caseconv.py).
798 * The return value is the case converted codepoint or -1 if the conversion
799 * results in multiple characters (this is useful for regexp Canonicalization
800 * operation). If 'bw' is not NULL, the result codepoint(s) are also
801 * written through the bufwriter.
803 * Context and locale specific rules must be checked before consulting
808 duk_codepoint_t
duk__slow_case_conversion(duk_hthread
*thr
,
809 duk_bufwriter_ctx
*bw
,
811 duk_bitdecoder_ctx
*bd_ctx
) {
812 duk_small_int_t skip
= 0;
815 duk_small_int_t count
;
816 duk_codepoint_t tmp_cp
;
817 duk_codepoint_t start_i
;
818 duk_codepoint_t start_o
;
821 DUK_ASSERT(bd_ctx
!= NULL
);
823 DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp
));
825 /* range conversion with a "skip" */
826 DUK_DDD(DUK_DDDPRINT("checking ranges"));
829 n
= (duk_small_int_t
) duk_bd_decode(bd_ctx
, 6);
834 DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip
, (long) n
));
837 start_i
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
838 start_o
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
839 count
= (duk_small_int_t
) duk_bd_decode(bd_ctx
, 7);
840 DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
841 (long) start_i
, (long) start_o
, (long) count
, (long) skip
));
844 tmp_cp
= cp
- start_i
; /* always >= 0 */
845 if (tmp_cp
< (duk_codepoint_t
) count
* (duk_codepoint_t
) skip
&&
846 (tmp_cp
% (duk_codepoint_t
) skip
) == 0) {
847 DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
848 cp
= start_o
+ tmp_cp
;
856 n
= (duk_small_int_t
) duk_bd_decode(bd_ctx
, 6);
857 DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n
));
859 start_i
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
860 start_o
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
861 DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i
, (long) start_o
));
863 DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
869 /* complex, multicharacter conversion */
870 n
= (duk_small_int_t
) duk_bd_decode(bd_ctx
, 7);
871 DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n
));
873 start_i
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
874 t
= (duk_small_int_t
) duk_bd_decode(bd_ctx
, 2);
875 DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i
, (long) t
));
877 DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
880 tmp_cp
= (duk_codepoint_t
) duk_bd_decode(bd_ctx
, 16);
881 DUK_BW_WRITE_RAW_XUTF8(thr
, bw
, (duk_ucodepoint_t
) tmp_cp
);
887 (void) duk_bd_decode(bd_ctx
, 16);
892 /* default: no change */
893 DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
898 DUK_BW_WRITE_RAW_XUTF8(thr
, bw
, (duk_ucodepoint_t
) cp
);
904 * Case conversion helper, with context/locale sensitivity.
905 * For proper case conversion, one needs to know the character
906 * and the preceding and following characters, as well as
910 /* XXX: add 'language' argument when locale/language sensitive rule
/* Case-convert one codepoint ('cp') using its neighbours for context.
 *
 * Steps visible in this block: an ASCII fast path that tests 'a'..'z' and
 * 'A'..'Z' and emits a single byte with DUK_BW_WRITE_RAW_U8(); a hardcoded
 * final sigma rule for U+03A3 that consults 'prev' and 'next'; and otherwise
 * a bit-packed rule stream (duk_unicode_caseconv_uc or
 * duk_unicode_caseconv_lc, presumably chosen by 'uppercase' -- the selecting
 * condition is not visible here) which is handed to
 * duk__slow_case_conversion(), whose result is returned.
 */
914 duk_codepoint_t
duk__case_transform_helper(duk_hthread
*thr
,
915 duk_bufwriter_ctx
*bw
,
917 duk_codepoint_t prev
,
918 duk_codepoint_t next
,
919 duk_bool_t uppercase
) {
920 duk_bitdecoder_ctx bd_ctx
;
922 /* fast path for ASCII */
924 /* XXX: there are language sensitive rules for the ASCII range.
925 * If/when language/locale support is implemented, they need to
926 * be implemented here for the fast path. There are no context
927 * sensitive rules for ASCII range.
931 if (cp
>= 'a' && cp
<= 'z') {
935 if (cp
>= 'A' && cp
<= 'Z') {
941 DUK_BW_WRITE_RAW_U8(thr
, bw
, (duk_uint8_t
) cp
);
946 /* context and locale specific rules which cannot currently be represented
947 * in the caseconv bitstream: hardcoded rules in C
950 /* XXX: turkish / azeri */
953 * Final sigma context specific rule. This is a rather tricky
954 * rule and this handling is probably not 100% correct now.
955 * The rule is not locale/language specific so it is supported.
958 if (cp
== 0x03a3L
&& /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
959 duk_unicode_is_letter(prev
) && /* prev exists and is a letter */
960 !duk_unicode_is_letter(next
)) { /* next does not exist or next is not a letter */
961 /* Capital sigma occurred at "end of word", lowercase to
962 * U+03C2 = GREEK SMALL LETTER FINAL SIGMA. Otherwise
963 * fall through and let the normal rules lowercase it to
964 * U+03C3 = GREEK SMALL LETTER SIGMA.
970 /* XXX: lithuanian not implemented */
971 /* XXX: lithuanian, explicit dot rules */
972 /* XXX: turkish / azeri, lowercase rules */
975 /* 1:1 or special conversions, but not locale/context specific: script generated rules */
976 DUK_MEMZERO(&bd_ctx
, sizeof(bd_ctx
));
978 bd_ctx
.data
= (const duk_uint8_t
*) duk_unicode_caseconv_uc
;
979 bd_ctx
.length
= (duk_size_t
) sizeof(duk_unicode_caseconv_uc
);
981 bd_ctx
.data
= (const duk_uint8_t
*) duk_unicode_caseconv_lc
;
982 bd_ctx
.length
= (duk_size_t
) sizeof(duk_unicode_caseconv_lc
);
984 return duk__slow_case_conversion(thr
, bw
, cp
, &bd_ctx
);
988 DUK_BW_WRITE_RAW_XUTF8(thr
, bw
, (duk_ucodepoint_t
) cp
);
992 /* unused now, not needed until Turkish/Azeri */
1000 * Replace valstack top with case converted version.
/* Case-convert the string required at value stack index -1.
 *
 * Visible steps: the input is fetched with duk_require_hstring(), an output
 * buffer is pushed with DUK_BW_INIT_PUSHBUF() sized to the input byte length,
 * codepoints are decoded with duk_unicode_decode_xutf8_checked() and passed
 * to duk__case_transform_helper().  At the end the bufwriter is compacted,
 * the buffer at the stack top is coerced with duk_to_string(), and the value
 * at index -2 (the input, per the stack layout comment below) is removed.
 *
 * NOTE(review): 'next' is declared duk_codepoint_t but the decoded value is
 * cast with (int); a (duk_codepoint_t) cast would match the declaration --
 * confirm the two types are interchangeable on all targets.
 */
1003 DUK_INTERNAL
void duk_unicode_case_convert_string(duk_hthread
*thr
, duk_small_int_t uppercase
) {
1004 duk_context
*ctx
= (duk_context
*) thr
;
1005 duk_hstring
*h_input
;
1006 duk_bufwriter_ctx bw_alloc
;
1007 duk_bufwriter_ctx
*bw
;
1008 const duk_uint8_t
*p
, *p_start
, *p_end
;
1009 duk_codepoint_t prev
, curr
, next
;
1011 h_input
= duk_require_hstring(ctx
, -1);
1012 DUK_ASSERT(h_input
!= NULL
);
1015 DUK_BW_INIT_PUSHBUF(thr
, bw
, DUK_HSTRING_GET_BYTELEN(h_input
));
1017 /* [ ... input buffer ] */
1019 p_start
= (const duk_uint8_t
*) DUK_HSTRING_GET_DATA(h_input
);
1020 p_end
= p_start
+ DUK_HSTRING_GET_BYTELEN(h_input
);
1023 prev
= -1; DUK_UNREF(prev
);
1031 next
= (int) duk_unicode_decode_xutf8_checked(thr
, &p
, p_start
, p_end
);
1033 /* end of input and last char has been processed */
1039 /* on first round, skip */
1041 /* XXX: could add a fast path to process chunks of input codepoints,
1042 * but relative benefit would be quite small.
1045 /* Ensure space for maximum multi-character result; estimate is overkill. */
1046 DUK_BW_ENSURE(thr
, bw
, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH
);
1048 duk__case_transform_helper(thr
,
1050 (duk_codepoint_t
) curr
,
1057 DUK_BW_COMPACT(thr
, bw
);
1058 duk_to_string(ctx
, -1); /* buffer at stack top becomes a string; NOTE(review): the old comment named 'h_buf', which this function does not declare */
1059 duk_remove(ctx
, -2);
1062 #ifdef DUK_USE_REGEXP_SUPPORT
1065 * Canonicalize() abstract operation needed for canonicalization of individual
1066 * codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
1067 * Note that codepoints are canonicalized one character at a time, so no context
1068 * specific rules can apply. Locale specific rules can apply, though.
1071 DUK_INTERNAL duk_codepoint_t
duk_unicode_re_canonicalize_char(duk_hthread
*thr
, duk_codepoint_t cp
) {
1072 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
1073 /* Fast canonicalization lookup at the cost of 128kB footprint. */
1074 DUK_ASSERT(cp
>= 0);
1076 if (DUK_LIKELY(cp
< 0x10000L
)) {
1077 return (duk_codepoint_t
) duk_unicode_re_canon_lookup
[cp
];
1080 #else /* DUK_USE_REGEXP_CANON_WORKAROUND */
1083 y
= duk__case_transform_helper(thr
,
1084 NULL
, /* NULL is allowed, no output */
1090 if ((y
< 0) || (cp
>= 0x80 && y
< 0x80)) {
1091 /* multiple codepoint conversion or non-ASCII mapped to ASCII
1098 #endif /* DUK_USE_REGEXP_CANON_WORKAROUND */
1102 * E5 Section 15.10.2.6 "IsWordChar" abstract operation. Assume
1103 * x < 0 for characters read outside the string.
1106 DUK_INTERNAL duk_small_int_t
duk_unicode_re_is_wordchar(duk_codepoint_t x
) {
1108 * Note: the description in E5 Section 15.10.2.6 has a typo, it
1109 * contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
1111 if ((x
>= '0' && x
<= '9') ||
1112 (x
>= 'a' && x
<= 'z') ||
1113 (x
>= 'A' && x
<= 'Z') ||
1121 * Regexp range tables
1124 /* exposed because lexer needs these too */
1125 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_digit
[2] = {
1126 (duk_uint16_t
) 0x0030UL
, (duk_uint16_t
) 0x0039UL
,
1128 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_white
[22] = {
1129 (duk_uint16_t
) 0x0009UL
, (duk_uint16_t
) 0x000DUL
,
1130 (duk_uint16_t
) 0x0020UL
, (duk_uint16_t
) 0x0020UL
,
1131 (duk_uint16_t
) 0x00A0UL
, (duk_uint16_t
) 0x00A0UL
,
1132 (duk_uint16_t
) 0x1680UL
, (duk_uint16_t
) 0x1680UL
,
1133 (duk_uint16_t
) 0x180EUL
, (duk_uint16_t
) 0x180EUL
,
1134 (duk_uint16_t
) 0x2000UL
, (duk_uint16_t
) 0x200AUL
,
1135 (duk_uint16_t
) 0x2028UL
, (duk_uint16_t
) 0x2029UL
,
1136 (duk_uint16_t
) 0x202FUL
, (duk_uint16_t
) 0x202FUL
,
1137 (duk_uint16_t
) 0x205FUL
, (duk_uint16_t
) 0x205FUL
,
1138 (duk_uint16_t
) 0x3000UL
, (duk_uint16_t
) 0x3000UL
,
1139 (duk_uint16_t
) 0xFEFFUL
, (duk_uint16_t
) 0xFEFFUL
,
1141 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_wordchar
[8] = {
1142 (duk_uint16_t
) 0x0030UL
, (duk_uint16_t
) 0x0039UL
,
1143 (duk_uint16_t
) 0x0041UL
, (duk_uint16_t
) 0x005AUL
,
1144 (duk_uint16_t
) 0x005FUL
, (duk_uint16_t
) 0x005FUL
,
1145 (duk_uint16_t
) 0x0061UL
, (duk_uint16_t
) 0x007AUL
,
1147 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_not_digit
[4] = {
1148 (duk_uint16_t
) 0x0000UL
, (duk_uint16_t
) 0x002FUL
,
1149 (duk_uint16_t
) 0x003AUL
, (duk_uint16_t
) 0xFFFFUL
,
1151 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_not_white
[24] = {
1152 (duk_uint16_t
) 0x0000UL
, (duk_uint16_t
) 0x0008UL
,
1153 (duk_uint16_t
) 0x000EUL
, (duk_uint16_t
) 0x001FUL
,
1154 (duk_uint16_t
) 0x0021UL
, (duk_uint16_t
) 0x009FUL
,
1155 (duk_uint16_t
) 0x00A1UL
, (duk_uint16_t
) 0x167FUL
,
1156 (duk_uint16_t
) 0x1681UL
, (duk_uint16_t
) 0x180DUL
,
1157 (duk_uint16_t
) 0x180FUL
, (duk_uint16_t
) 0x1FFFUL
,
1158 (duk_uint16_t
) 0x200BUL
, (duk_uint16_t
) 0x2027UL
,
1159 (duk_uint16_t
) 0x202AUL
, (duk_uint16_t
) 0x202EUL
,
1160 (duk_uint16_t
) 0x2030UL
, (duk_uint16_t
) 0x205EUL
,
1161 (duk_uint16_t
) 0x2060UL
, (duk_uint16_t
) 0x2FFFUL
,
1162 (duk_uint16_t
) 0x3001UL
, (duk_uint16_t
) 0xFEFEUL
,
1163 (duk_uint16_t
) 0xFF00UL
, (duk_uint16_t
) 0xFFFFUL
,
1165 DUK_INTERNAL
const duk_uint16_t duk_unicode_re_ranges_not_wordchar
[10] = {
1166 (duk_uint16_t
) 0x0000UL
, (duk_uint16_t
) 0x002FUL
,
1167 (duk_uint16_t
) 0x003AUL
, (duk_uint16_t
) 0x0040UL
,
1168 (duk_uint16_t
) 0x005BUL
, (duk_uint16_t
) 0x005EUL
,
1169 (duk_uint16_t
) 0x0060UL
, (duk_uint16_t
) 0x0060UL
,
1170 (duk_uint16_t
) 0x007BUL
, (duk_uint16_t
) 0xFFFFUL
,
1173 #endif /* DUK_USE_REGEXP_SUPPORT */