]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | /* |
2 | * Various Unicode help functions for character classification predicates, | |
3 | * case conversion, decoding, etc. | |
4 | */ | |
5 | ||
6 | #include "duk_internal.h" | |
7 | ||
8 | /* | |
9 | * Fast path tables | |
10 | */ | |
11 | ||
#if defined(DUK_USE_IDCHAR_FASTPATH)
/* ASCII identifier character classes, indexed by codepoint 0x00...0x7f:
 *
 *    0: neither IdentifierStart nor IdentifierPart
 *    1: IdentifierStart and IdentifierPart ('$', 'A'-'Z', '_', 'a'-'z')
 *   -1: IdentifierPart only ('0'-'9')
 */
DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
	/* 0x00...0x0f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x10...0x1f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x20...0x2f */  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x30...0x3f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,
	/* 0x40...0x4f */  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0x50...0x5f */  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,
	/* 0x60...0x6f */  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0x70...0x7f */  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0
};
#endif  /* DUK_USE_IDCHAR_FASTPATH */
28 | ||
29 | /* | |
30 | * XUTF-8 and CESU-8 encoding/decoding | |
31 | */ | |
32 | ||
33 | DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) { | |
34 | duk_uint_fast32_t x = (duk_uint_fast32_t) cp; | |
35 | if (x < 0x80UL) { | |
36 | /* 7 bits */ | |
37 | return 1; | |
38 | } else if (x < 0x800UL) { | |
39 | /* 11 bits */ | |
40 | return 2; | |
41 | } else if (x < 0x10000UL) { | |
42 | /* 16 bits */ | |
43 | return 3; | |
44 | } else if (x < 0x200000UL) { | |
45 | /* 21 bits */ | |
46 | return 4; | |
47 | } else if (x < 0x4000000UL) { | |
48 | /* 26 bits */ | |
49 | return 5; | |
50 | } else if (x < (duk_ucodepoint_t) 0x80000000UL) { | |
51 | /* 31 bits */ | |
52 | return 6; | |
53 | } else { | |
54 | /* 36 bits */ | |
55 | return 7; | |
56 | } | |
57 | } | |
58 | ||
#if defined(DUK_USE_ASSERTIONS)
/* Return the CESU-8 encoded length of codepoint 'cp': 1, 2, or 3 bytes for
 * values below 0x10000, and 6 bytes otherwise.
 */
DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
	duk_uint_fast32_t val = (duk_uint_fast32_t) cp;

	if (val < 0x80UL) {
		return 1;  /* 7 bits */
	}
	if (val < 0x800UL) {
		return 2;  /* 11 bits */
	}
	if (val < 0x10000UL) {
		return 3;  /* 16 bits */
	}

	/* Encoded as a surrogate pair, each half taking 3 bytes.  Codepoints
	 * above U+10FFFF also take 6 bytes, see duk_unicode_encode_cesu8().
	 */
	return 3 + 3;
}
#endif  /* DUK_USE_ASSERTIONS */
80 | ||
81 | DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = { | |
82 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe | |
83 | }; | |
84 | ||
85 | /* Encode to extended UTF-8; 'out' must have space for at least | |
86 | * DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any | |
87 | * 32-bit (unsigned) codepoint. | |
88 | */ | |
89 | DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) { | |
90 | duk_uint_fast32_t x = (duk_uint_fast32_t) cp; | |
91 | duk_small_int_t len; | |
92 | duk_uint8_t marker; | |
93 | duk_small_int_t i; | |
94 | ||
95 | len = duk_unicode_get_xutf8_length(cp); | |
96 | DUK_ASSERT(len > 0); | |
97 | ||
98 | marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */ | |
99 | ||
100 | i = len; | |
101 | DUK_ASSERT(i > 0); | |
102 | do { | |
103 | i--; | |
104 | if (i > 0) { | |
105 | out[i] = (duk_uint8_t) (0x80 + (x & 0x3f)); | |
106 | x >>= 6; | |
107 | } else { | |
108 | /* Note: masking of 'x' is not necessary because of | |
109 | * range check and shifting -> no bits overlapping | |
110 | * the marker should be set. | |
111 | */ | |
112 | out[0] = (duk_uint8_t) (marker + x); | |
113 | } | |
114 | } while (i > 0); | |
115 | ||
116 | return len; | |
117 | } | |
118 | ||
119 | /* Encode to CESU-8; 'out' must have space for at least | |
120 | * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF | |
121 | * will encode to garbage but won't overwrite the output buffer. | |
122 | */ | |
123 | DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) { | |
124 | duk_uint_fast32_t x = (duk_uint_fast32_t) cp; | |
125 | duk_small_int_t len; | |
126 | ||
127 | if (x < 0x80UL) { | |
128 | out[0] = (duk_uint8_t) x; | |
129 | len = 1; | |
130 | } else if (x < 0x800UL) { | |
131 | out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f)); | |
132 | out[1] = (duk_uint8_t) (0x80 + (x & 0x3f)); | |
133 | len = 2; | |
134 | } else if (x < 0x10000UL) { | |
135 | /* surrogate pairs get encoded here */ | |
136 | out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f)); | |
137 | out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f)); | |
138 | out[2] = (duk_uint8_t) (0x80 + (x & 0x3f)); | |
139 | len = 3; | |
140 | } else { | |
141 | /* | |
142 | * Unicode codepoints above U+FFFF are encoded as surrogate | |
143 | * pairs here. This ensures that all CESU-8 codepoints are | |
144 | * 16-bit values as expected in Ecmascript. The surrogate | |
145 | * pairs always get a 3-byte encoding (each) in CESU-8. | |
146 | * See: http://en.wikipedia.org/wiki/Surrogate_pair | |
147 | * | |
148 | * 20-bit codepoint, 10 bits (A and B) per surrogate pair: | |
149 | * | |
150 | * x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB | |
151 | * sp1 = 0b110110AA AAAAAAAA (0xd800 + ((x >> 10) & 0x3ff)) | |
152 | * sp2 = 0b110111BB BBBBBBBB (0xdc00 + (x & 0x3ff)) | |
153 | * | |
154 | * Encoded into CESU-8: | |
155 | * | |
156 | * sp1 -> 0b11101101 (0xe0 + ((sp1 >> 12) & 0x0f)) | |
157 | * -> 0b1010AAAA (0x80 + ((sp1 >> 6) & 0x3f)) | |
158 | * -> 0b10AAAAAA (0x80 + (sp1 & 0x3f)) | |
159 | * sp2 -> 0b11101101 (0xe0 + ((sp2 >> 12) & 0x0f)) | |
160 | * -> 0b1011BBBB (0x80 + ((sp2 >> 6) & 0x3f)) | |
161 | * -> 0b10BBBBBB (0x80 + (sp2 & 0x3f)) | |
162 | * | |
163 | * Note that 0x10000 must be subtracted first. The code below | |
164 | * avoids the sp1, sp2 temporaries which saves around 20 bytes | |
165 | * of code. | |
166 | */ | |
167 | ||
168 | x -= 0x10000UL; | |
169 | ||
170 | out[0] = (duk_uint8_t) (0xed); | |
171 | out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f)); | |
172 | out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f)); | |
173 | out[3] = (duk_uint8_t) (0xed); | |
174 | out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f)); | |
175 | out[5] = (duk_uint8_t) (0x80 + (x & 0x3f)); | |
176 | len = 6; | |
177 | } | |
178 | ||
179 | return len; | |
180 | } | |
181 | ||
182 | /* Decode helper. Return zero on error. */ | |
183 | DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) { | |
184 | const duk_uint8_t *p; | |
185 | duk_uint32_t res; | |
186 | duk_uint_fast8_t ch; | |
187 | duk_small_int_t n; | |
188 | ||
189 | DUK_UNREF(thr); | |
190 | ||
191 | p = *ptr; | |
192 | if (p < ptr_start || p >= ptr_end) { | |
193 | goto fail; | |
194 | } | |
195 | ||
196 | /* | |
197 | * UTF-8 decoder which accepts longer than standard byte sequences. | |
198 | * This allows full 32-bit code points to be used. | |
199 | */ | |
200 | ||
201 | ch = (duk_uint_fast8_t) (*p++); | |
202 | if (ch < 0x80) { | |
203 | /* 0xxx xxxx [7 bits] */ | |
204 | res = (duk_uint32_t) (ch & 0x7f); | |
205 | n = 0; | |
206 | } else if (ch < 0xc0) { | |
207 | /* 10xx xxxx -> invalid */ | |
208 | goto fail; | |
209 | } else if (ch < 0xe0) { | |
210 | /* 110x xxxx 10xx xxxx [11 bits] */ | |
211 | res = (duk_uint32_t) (ch & 0x1f); | |
212 | n = 1; | |
213 | } else if (ch < 0xf0) { | |
214 | /* 1110 xxxx 10xx xxxx 10xx xxxx [16 bits] */ | |
215 | res = (duk_uint32_t) (ch & 0x0f); | |
216 | n = 2; | |
217 | } else if (ch < 0xf8) { | |
218 | /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx [21 bits] */ | |
219 | res = (duk_uint32_t) (ch & 0x07); | |
220 | n = 3; | |
221 | } else if (ch < 0xfc) { | |
222 | /* 1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [26 bits] */ | |
223 | res = (duk_uint32_t) (ch & 0x03); | |
224 | n = 4; | |
225 | } else if (ch < 0xfe) { | |
226 | /* 1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [31 bits] */ | |
227 | res = (duk_uint32_t) (ch & 0x01); | |
228 | n = 5; | |
229 | } else if (ch < 0xff) { | |
230 | /* 1111 1110 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [36 bits] */ | |
231 | res = (duk_uint32_t) (0); | |
232 | n = 6; | |
233 | } else { | |
234 | /* 8-byte format could be: | |
235 | * 1111 1111 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [41 bits] | |
236 | * | |
237 | * However, this format would not have a zero bit following the | |
238 | * leading one bits and would not allow 0xFF to be used as an | |
239 | * "invalid xutf-8" marker for internal keys. Further, 8-byte | |
240 | * encodings (up to 41 bit code points) are not currently needed. | |
241 | */ | |
242 | goto fail; | |
243 | } | |
244 | ||
245 | DUK_ASSERT(p >= ptr_start); /* verified at beginning */ | |
246 | if (p + n > ptr_end) { | |
247 | /* check pointer at end */ | |
248 | goto fail; | |
249 | } | |
250 | ||
251 | while (n > 0) { | |
252 | DUK_ASSERT(p >= ptr_start && p < ptr_end); | |
253 | res = res << 6; | |
254 | res += (duk_uint32_t) ((*p++) & 0x3f); | |
255 | n--; | |
256 | } | |
257 | ||
258 | *ptr = p; | |
259 | *out_cp = res; | |
260 | return 1; | |
261 | ||
262 | fail: | |
263 | return 0; | |
264 | } | |
265 | ||
266 | /* used by e.g. duk_regexp_executor.c, string built-ins */ | |
267 | DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end) { | |
268 | duk_ucodepoint_t cp; | |
269 | ||
270 | if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) { | |
271 | return cp; | |
272 | } | |
273 | DUK_ERROR_INTERNAL(thr, "utf-8 decode failed"); /* XXX: 'internal error' is a bit of a misnomer */ | |
274 | DUK_UNREACHABLE(); | |
275 | return 0; | |
276 | } | |
277 | ||
278 | /* Compute (extended) utf-8 length without codepoint encoding validation, | |
279 | * used for string interning. | |
280 | * | |
281 | * NOTE: This algorithm is performance critical, more so than string hashing | |
282 | * in some cases. It is needed when interning a string and needs to scan | |
283 | * every byte of the string with no skipping. Having an ASCII fast path | |
284 | * is useful if possible in the algorithm. The current algorithms were | |
285 | * chosen from several variants, based on x64 gcc -O2 testing. See: | |
286 | * https://github.com/svaarala/duktape/pull/422 | |
287 | * | |
288 | * NOTE: must match src/dukutil.py:duk_unicode_unvalidated_utf8_length(). | |
289 | */ | |
290 | ||
291 | #if defined(DUK_USE_PREFER_SIZE) | |
292 | /* Small variant; roughly 150 bytes smaller than the fast variant. */ | |
293 | DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) { | |
294 | const duk_uint8_t *p; | |
295 | const duk_uint8_t *p_end; | |
296 | duk_size_t ncont; | |
297 | duk_size_t clen; | |
298 | ||
299 | p = data; | |
300 | p_end = data + blen; | |
301 | ncont = 0; | |
302 | while (p != p_end) { | |
303 | duk_uint8_t x; | |
304 | x = *p++; | |
305 | if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { | |
306 | ncont++; | |
307 | } | |
308 | } | |
309 | ||
310 | DUK_ASSERT(ncont <= blen); | |
311 | clen = blen - ncont; | |
312 | DUK_ASSERT(clen <= blen); | |
313 | return clen; | |
314 | } | |
315 | #else /* DUK_USE_PREFER_SIZE */ | |
316 | /* This seems like a good overall approach. Fast path for ASCII in 4 byte | |
317 | * blocks. | |
318 | */ | |
319 | DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) { | |
320 | const duk_uint8_t *p; | |
321 | const duk_uint8_t *p_end; | |
322 | const duk_uint32_t *p32_end; | |
323 | const duk_uint32_t *p32; | |
324 | duk_size_t ncont; | |
325 | duk_size_t clen; | |
326 | ||
327 | ncont = 0; /* number of continuation (non-initial) bytes in [0x80,0xbf] */ | |
328 | p = data; | |
329 | p_end = data + blen; | |
330 | if (blen < 16) { | |
331 | goto skip_fastpath; | |
332 | } | |
333 | ||
334 | /* Align 'p' to 4; the input data may have arbitrary alignment. | |
335 | * End of string check not needed because blen >= 16. | |
336 | */ | |
337 | while (((duk_size_t) (const void *) p) & 0x03U) { | |
338 | duk_uint8_t x; | |
339 | x = *p++; | |
340 | if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { | |
341 | ncont++; | |
342 | } | |
343 | } | |
344 | ||
345 | /* Full, aligned 4-byte reads. */ | |
346 | p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03))); | |
347 | p32 = (const duk_uint32_t *) (const void *) p; | |
348 | while (p32 != (const duk_uint32_t *) p32_end) { | |
349 | duk_uint32_t x; | |
350 | x = *p32++; | |
351 | if (DUK_LIKELY((x & 0x80808080UL) == 0)) { | |
352 | ; /* ASCII fast path */ | |
353 | } else { | |
354 | /* Flip highest bit of each byte which changes | |
355 | * the bit pattern 10xxxxxx into 00xxxxxx which | |
356 | * allows an easy bit mask test. | |
357 | */ | |
358 | x ^= 0x80808080UL; | |
359 | if (DUK_UNLIKELY(!(x & 0xc0000000UL))) { | |
360 | ncont++; | |
361 | } | |
362 | if (DUK_UNLIKELY(!(x & 0x00c00000UL))) { | |
363 | ncont++; | |
364 | } | |
365 | if (DUK_UNLIKELY(!(x & 0x0000c000UL))) { | |
366 | ncont++; | |
367 | } | |
368 | if (DUK_UNLIKELY(!(x & 0x000000c0UL))) { | |
369 | ncont++; | |
370 | } | |
371 | } | |
372 | } | |
373 | p = (const duk_uint8_t *) p32; | |
374 | /* Fall through to handle the rest. */ | |
375 | ||
376 | skip_fastpath: | |
377 | while (p != p_end) { | |
378 | duk_uint8_t x; | |
379 | x = *p++; | |
380 | if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { | |
381 | ncont++; | |
382 | } | |
383 | } | |
384 | ||
385 | DUK_ASSERT(ncont <= blen); | |
386 | clen = blen - ncont; | |
387 | DUK_ASSERT(clen <= blen); | |
388 | return clen; | |
389 | } | |
390 | #endif /* DUK_USE_PREFER_SIZE */ | |
391 | ||
392 | /* | |
393 | * Unicode range matcher | |
394 | * | |
395 | * Matches a codepoint against a packed bitstream of character ranges. | |
396 | * Used for slow path Unicode matching. | |
397 | */ | |
398 | ||
399 | /* Must match src/extract_chars.py, generate_match_table3(). */ | |
400 | DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) { | |
401 | duk_uint32_t t; | |
402 | ||
403 | t = (duk_uint32_t) duk_bd_decode(bd_ctx, 4); | |
404 | if (t <= 0x0eU) { | |
405 | return t; | |
406 | } | |
407 | t = (duk_uint32_t) duk_bd_decode(bd_ctx, 8); | |
408 | if (t <= 0xfdU) { | |
409 | return t + 0x0f; | |
410 | } | |
411 | if (t == 0xfeU) { | |
412 | t = (duk_uint32_t) duk_bd_decode(bd_ctx, 12); | |
413 | return t + 0x0fU + 0xfeU; | |
414 | } else { | |
415 | t = (duk_uint32_t) duk_bd_decode(bd_ctx, 24); | |
416 | return t + 0x0fU + 0xfeU + 0x1000UL; | |
417 | } | |
418 | } | |
419 | ||
420 | DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) { | |
421 | duk_bitdecoder_ctx bd_ctx; | |
422 | duk_codepoint_t prev_re; | |
423 | ||
424 | DUK_MEMZERO(&bd_ctx, sizeof(bd_ctx)); | |
425 | bd_ctx.data = (const duk_uint8_t *) unitab; | |
426 | bd_ctx.length = (duk_size_t) unilen; | |
427 | ||
428 | prev_re = 0; | |
429 | for (;;) { | |
430 | duk_codepoint_t r1, r2; | |
431 | r1 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx); | |
432 | if (r1 == 0) { | |
433 | break; | |
434 | } | |
435 | r2 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx); | |
436 | ||
437 | r1 = prev_re + r1; | |
438 | r2 = r1 + r2; | |
439 | prev_re = r2; | |
440 | ||
441 | /* [r1,r2] is the range */ | |
442 | ||
443 | DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]", | |
444 | (unsigned long) cp, (unsigned long) r1, (unsigned long) r2)); | |
445 | if (cp >= r1 && cp <= r2) { | |
446 | return 1; | |
447 | } | |
448 | } | |
449 | ||
450 | return 0; | |
451 | } | |
452 | ||
453 | /* | |
454 | * "WhiteSpace" production check. | |
455 | */ | |
456 | ||
457 | DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) { | |
458 | /* | |
459 | * E5 Section 7.2 specifies six characters specifically as | |
460 | * white space: | |
461 | * | |
462 | * 0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;; | |
463 | * 000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;; | |
464 | * 000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;; | |
465 | * 0020;SPACE;Zs;0;WS;;;;;N;;;;; | |
466 | * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;; | |
467 | * FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; | |
468 | * | |
469 | * It also specifies any Unicode category 'Zs' characters as white | |
470 | * space. These can be extracted with the "src/extract_chars.py" script. | |
471 | * Current result: | |
472 | * | |
473 | * RAW OUTPUT: | |
474 | * =========== | |
475 | * 0020;SPACE;Zs;0;WS;;;;;N;;;;; | |
476 | * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;; | |
477 | * 1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;; | |
478 | * 180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;; | |
479 | * 2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;; | |
480 | * 2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;; | |
481 | * 2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
482 | * 2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
483 | * 2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
484 | * 2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
485 | * 2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
486 | * 2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;; | |
487 | * 2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
488 | * 2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
489 | * 200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
490 | * 202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;; | |
491 | * 205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;; | |
492 | * 3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;; | |
493 | * | |
494 | * RANGES: | |
495 | * ======= | |
496 | * 0x0020 | |
497 | * 0x00a0 | |
498 | * 0x1680 | |
499 | * 0x180e | |
500 | * 0x2000 ... 0x200a | |
501 | * 0x202f | |
502 | * 0x205f | |
503 | * 0x3000 | |
504 | * | |
505 | * A manual decoder (below) is probably most compact for this. | |
506 | */ | |
507 | ||
508 | duk_uint_fast8_t lo; | |
509 | duk_uint_fast32_t hi; | |
510 | ||
511 | /* cp == -1 (EOF) never matches and causes return value 0 */ | |
512 | ||
513 | lo = (duk_uint_fast8_t) (cp & 0xff); | |
514 | hi = (duk_uint_fast32_t) (cp >> 8); /* does not fit into an uchar */ | |
515 | ||
516 | if (hi == 0x0000UL) { | |
517 | if (lo == 0x09U || lo == 0x0bU || lo == 0x0cU || | |
518 | lo == 0x20U || lo == 0xa0U) { | |
519 | return 1; | |
520 | } | |
521 | } else if (hi == 0x0020UL) { | |
522 | if (lo <= 0x0aU || lo == 0x2fU || lo == 0x5fU) { | |
523 | return 1; | |
524 | } | |
525 | } else if (cp == 0x1680L || cp == 0x180eL || cp == 0x3000L || | |
526 | cp == 0xfeffL) { | |
527 | return 1; | |
528 | } | |
529 | ||
530 | return 0; | |
531 | } | |
532 | ||
533 | /* | |
534 | * "LineTerminator" production check. | |
535 | */ | |
536 | ||
537 | DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) { | |
538 | /* | |
539 | * E5 Section 7.3 | |
540 | * | |
541 | * A LineTerminatorSequence essentially merges <CR> <LF> sequences | |
542 | * into a single line terminator. This must be handled by the caller. | |
543 | */ | |
544 | ||
545 | if (cp == 0x000aL || cp == 0x000dL || cp == 0x2028L || | |
546 | cp == 0x2029L) { | |
547 | return 1; | |
548 | } | |
549 | ||
550 | return 0; | |
551 | } | |
552 | ||
553 | /* | |
554 | * "IdentifierStart" production check. | |
555 | */ | |
556 | ||
557 | DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) { | |
558 | /* | |
559 | * E5 Section 7.6: | |
560 | * | |
561 | * IdentifierStart: | |
562 | * UnicodeLetter | |
563 | * $ | |
564 | * _ | |
565 | * \ UnicodeEscapeSequence | |
566 | * | |
567 | * IdentifierStart production has one multi-character production: | |
568 | * | |
569 | * \ UnicodeEscapeSequence | |
570 | * | |
571 | * The '\' character is -not- matched by this function. Rather, the caller | |
572 | * should decode the escape and then call this function to check whether the | |
573 | * decoded character is acceptable (see discussion in E5 Section 7.6). | |
574 | * | |
575 | * The "UnicodeLetter" alternative of the production allows letters | |
576 | * from various Unicode categories. These can be extracted with the | |
577 | * "src/extract_chars.py" script. | |
578 | * | |
579 | * Because the result has hundreds of Unicode codepoint ranges, matching | |
580 | * for any values >= 0x80 are done using a very slow range-by-range scan | |
581 | * and a packed range format. | |
582 | * | |
583 | * The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because | |
584 | * it matters the most. The ASCII related ranges of IdentifierStart are: | |
585 | * | |
586 | * 0x0041 ... 0x005a ['A' ... 'Z'] | |
587 | * 0x0061 ... 0x007a ['a' ... 'z'] | |
588 | * 0x0024 ['$'] | |
589 | * 0x005f ['_'] | |
590 | */ | |
591 | ||
592 | /* ASCII (and EOF) fast path -- quick accept and reject */ | |
593 | if (cp <= 0x7fL) { | |
594 | #if defined(DUK_USE_IDCHAR_FASTPATH) | |
595 | return (cp >= 0) && (duk_is_idchar_tab[cp] > 0); | |
596 | #else | |
597 | if ((cp >= 'a' && cp <= 'z') || | |
598 | (cp >= 'A' && cp <= 'Z') || | |
599 | cp == '_' || cp == '$') { | |
600 | return 1; | |
601 | } | |
602 | return 0; | |
603 | #endif | |
604 | } | |
605 | ||
606 | /* Non-ASCII slow path (range-by-range linear comparison), very slow */ | |
607 | ||
608 | #ifdef DUK_USE_SOURCE_NONBMP | |
609 | if (duk__uni_range_match(duk_unicode_ids_noa, | |
610 | (duk_size_t) sizeof(duk_unicode_ids_noa), | |
611 | (duk_codepoint_t) cp)) { | |
612 | return 1; | |
613 | } | |
614 | return 0; | |
615 | #else | |
616 | if (cp < 0x10000L) { | |
617 | if (duk__uni_range_match(duk_unicode_ids_noabmp, | |
618 | sizeof(duk_unicode_ids_noabmp), | |
619 | (duk_codepoint_t) cp)) { | |
620 | return 1; | |
621 | } | |
622 | return 0; | |
623 | } else { | |
624 | /* without explicit non-BMP support, assume non-BMP characters | |
625 | * are always accepted as identifier characters. | |
626 | */ | |
627 | return 1; | |
628 | } | |
629 | #endif | |
630 | } | |
631 | ||
632 | /* | |
633 | * "IdentifierPart" production check. | |
634 | */ | |
635 | ||
636 | DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) { | |
637 | /* | |
638 | * E5 Section 7.6: | |
639 | * | |
640 | * IdentifierPart: | |
641 | * IdentifierStart | |
642 | * UnicodeCombiningMark | |
643 | * UnicodeDigit | |
644 | * UnicodeConnectorPunctuation | |
645 | * <ZWNJ> [U+200C] | |
646 | * <ZWJ> [U+200D] | |
647 | * | |
648 | * IdentifierPart production has one multi-character production | |
649 | * as part of its IdentifierStart alternative. The '\' character | |
650 | * of an escape sequence is not matched here, see discussion in | |
651 | * duk_unicode_is_identifier_start(). | |
652 | * | |
653 | * To match non-ASCII characters (codepoints >= 0x80), a very slow | |
654 | * linear range-by-range scan is used. The codepoint is first compared | |
655 | * to the IdentifierStart ranges, and if it doesn't match, then to a | |
656 | * set consisting of code points in IdentifierPart but not in | |
657 | * IdentifierStart. This is done to keep the unicode range data small, | |
658 | * at the expense of speed. | |
659 | * | |
660 | * The ASCII fast path consists of: | |
661 | * | |
662 | * 0x0030 ... 0x0039 ['0' ... '9', UnicodeDigit] | |
663 | * 0x0041 ... 0x005a ['A' ... 'Z', IdentifierStart] | |
664 | * 0x0061 ... 0x007a ['a' ... 'z', IdentifierStart] | |
665 | * 0x0024 ['$', IdentifierStart] | |
666 | * 0x005f ['_', IdentifierStart and | |
667 | * UnicodeConnectorPunctuation] | |
668 | * | |
669 | * UnicodeCombiningMark has no code points <= 0x7f. | |
670 | * | |
671 | * The matching code reuses the "identifier start" tables, and then | |
672 | * consults a separate range set for characters in "identifier part" | |
673 | * but not in "identifier start". These can be extracted with the | |
674 | * "src/extract_chars.py" script. | |
675 | * | |
676 | * UnicodeCombiningMark -> categories Mn, Mc | |
677 | * UnicodeDigit -> categories Nd | |
678 | * UnicodeConnectorPunctuation -> categories Pc | |
679 | */ | |
680 | ||
681 | /* ASCII (and EOF) fast path -- quick accept and reject */ | |
682 | if (cp <= 0x7fL) { | |
683 | #if defined(DUK_USE_IDCHAR_FASTPATH) | |
684 | return (cp >= 0) && (duk_is_idchar_tab[cp] != 0); | |
685 | #else | |
686 | if ((cp >= 'a' && cp <= 'z') || | |
687 | (cp >= 'A' && cp <= 'Z') || | |
688 | (cp >= '0' && cp <= '9') || | |
689 | cp == '_' || cp == '$') { | |
690 | return 1; | |
691 | } | |
692 | return 0; | |
693 | #endif | |
694 | } | |
695 | ||
696 | /* Non-ASCII slow path (range-by-range linear comparison), very slow */ | |
697 | ||
698 | #ifdef DUK_USE_SOURCE_NONBMP | |
699 | if (duk__uni_range_match(duk_unicode_ids_noa, | |
700 | sizeof(duk_unicode_ids_noa), | |
701 | (duk_codepoint_t) cp) || | |
702 | duk__uni_range_match(duk_unicode_idp_m_ids_noa, | |
703 | sizeof(duk_unicode_idp_m_ids_noa), | |
704 | (duk_codepoint_t) cp)) { | |
705 | return 1; | |
706 | } | |
707 | return 0; | |
708 | #else | |
709 | if (cp < 0x10000L) { | |
710 | if (duk__uni_range_match(duk_unicode_ids_noabmp, | |
711 | sizeof(duk_unicode_ids_noabmp), | |
712 | (duk_codepoint_t) cp) || | |
713 | duk__uni_range_match(duk_unicode_idp_m_ids_noabmp, | |
714 | sizeof(duk_unicode_idp_m_ids_noabmp), | |
715 | (duk_codepoint_t) cp)) { | |
716 | return 1; | |
717 | } | |
718 | return 0; | |
719 | } else { | |
720 | /* without explicit non-BMP support, assume non-BMP characters | |
721 | * are always accepted as identifier characters. | |
722 | */ | |
723 | return 1; | |
724 | } | |
725 | #endif | |
726 | } | |
727 | ||
728 | /* | |
729 | * Unicode letter check. | |
730 | */ | |
731 | ||
732 | DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) { | |
733 | /* | |
734 | * Unicode letter is now taken to be the categories: | |
735 | * | |
736 | * Lu, Ll, Lt, Lm, Lo | |
737 | * | |
738 | * (Not sure if this is exactly correct.) | |
739 | * | |
740 | * The ASCII fast path consists of: | |
741 | * | |
742 | * 0x0041 ... 0x005a ['A' ... 'Z'] | |
743 | * 0x0061 ... 0x007a ['a' ... 'z'] | |
744 | */ | |
745 | ||
746 | /* ASCII (and EOF) fast path -- quick accept and reject */ | |
747 | if (cp <= 0x7fL) { | |
748 | if ((cp >= 'a' && cp <= 'z') || | |
749 | (cp >= 'A' && cp <= 'Z')) { | |
750 | return 1; | |
751 | } | |
752 | return 0; | |
753 | } | |
754 | ||
755 | /* Non-ASCII slow path (range-by-range linear comparison), very slow */ | |
756 | ||
757 | #ifdef DUK_USE_SOURCE_NONBMP | |
758 | if (duk__uni_range_match(duk_unicode_ids_noa, | |
759 | sizeof(duk_unicode_ids_noa), | |
760 | (duk_codepoint_t) cp) && | |
761 | !duk__uni_range_match(duk_unicode_ids_m_let_noa, | |
762 | sizeof(duk_unicode_ids_m_let_noa), | |
763 | (duk_codepoint_t) cp)) { | |
764 | return 1; | |
765 | } | |
766 | return 0; | |
767 | #else | |
768 | if (cp < 0x10000L) { | |
769 | if (duk__uni_range_match(duk_unicode_ids_noabmp, | |
770 | sizeof(duk_unicode_ids_noabmp), | |
771 | (duk_codepoint_t) cp) && | |
772 | !duk__uni_range_match(duk_unicode_ids_m_let_noabmp, | |
773 | sizeof(duk_unicode_ids_m_let_noabmp), | |
774 | (duk_codepoint_t) cp)) { | |
775 | return 1; | |
776 | } | |
777 | return 0; | |
778 | } else { | |
779 | /* without explicit non-BMP support, assume non-BMP characters | |
780 | * are always accepted as letters. | |
781 | */ | |
782 | return 1; | |
783 | } | |
784 | #endif | |
785 | } | |
786 | ||
787 | /* | |
788 | * Complex case conversion helper which decodes a bit-packed conversion | |
789 | * control stream generated by unicode/extract_caseconv.py. The conversion | |
790 | * is very slow because it runs through the conversion data in a linear | |
791 | * fashion to save space (which is why ASCII characters have a special | |
792 | * fast path before arriving here). | |
793 | * | |
794 | * The particular bit counts etc have been determined experimentally to | |
795 | * be small but still sufficient, and must match the Python script | |
796 | * (src/extract_caseconv.py). | |
797 | * | |
798 | * The return value is the case converted codepoint or -1 if the conversion | |
799 | * results in multiple characters (this is useful for regexp Canonicalization | |
800 | * operation). If 'buf' is not NULL, the result codepoint(s) are also | |
801 | * appended to the hbuffer. | |
802 | * | |
803 | * Context and locale specific rules must be checked before consulting | |
804 | * this function. | |
805 | */ | |
806 | ||
807 | DUK_LOCAL | |
808 | duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr, | |
809 | duk_bufwriter_ctx *bw, | |
810 | duk_codepoint_t cp, | |
811 | duk_bitdecoder_ctx *bd_ctx) { | |
812 | duk_small_int_t skip = 0; | |
813 | duk_small_int_t n; | |
814 | duk_small_int_t t; | |
815 | duk_small_int_t count; | |
816 | duk_codepoint_t tmp_cp; | |
817 | duk_codepoint_t start_i; | |
818 | duk_codepoint_t start_o; | |
819 | ||
820 | DUK_UNREF(thr); | |
821 | DUK_ASSERT(bd_ctx != NULL); | |
822 | ||
823 | DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp)); | |
824 | ||
825 | /* range conversion with a "skip" */ | |
826 | DUK_DDD(DUK_DDDPRINT("checking ranges")); | |
827 | for (;;) { | |
828 | skip++; | |
829 | n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6); | |
830 | if (n == 0x3f) { | |
831 | /* end marker */ | |
832 | break; | |
833 | } | |
834 | DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip, (long) n)); | |
835 | ||
836 | while (n--) { | |
837 | start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
838 | start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
839 | count = (duk_small_int_t) duk_bd_decode(bd_ctx, 7); | |
840 | DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld", | |
841 | (long) start_i, (long) start_o, (long) count, (long) skip)); | |
842 | ||
843 | if (cp >= start_i) { | |
844 | tmp_cp = cp - start_i; /* always >= 0 */ | |
845 | if (tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip && | |
846 | (tmp_cp % (duk_codepoint_t) skip) == 0) { | |
847 | DUK_DDD(DUK_DDDPRINT("range matches input codepoint")); | |
848 | cp = start_o + tmp_cp; | |
849 | goto single; | |
850 | } | |
851 | } | |
852 | } | |
853 | } | |
854 | ||
855 | /* 1:1 conversion */ | |
856 | n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6); | |
857 | DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n)); | |
858 | while (n--) { | |
859 | start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
860 | start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
861 | DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i, (long) start_o)); | |
862 | if (cp == start_i) { | |
863 | DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint")); | |
864 | cp = start_o; | |
865 | goto single; | |
866 | } | |
867 | } | |
868 | ||
869 | /* complex, multicharacter conversion */ | |
870 | n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7); | |
871 | DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n)); | |
872 | while (n--) { | |
873 | start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
874 | t = (duk_small_int_t) duk_bd_decode(bd_ctx, 2); | |
875 | DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t)); | |
876 | if (cp == start_i) { | |
877 | DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint")); | |
878 | if (bw != NULL) { | |
879 | while (t--) { | |
880 | tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); | |
881 | DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp); | |
882 | } | |
883 | } | |
884 | return -1; | |
885 | } else { | |
886 | while (t--) { | |
887 | (void) duk_bd_decode(bd_ctx, 16); | |
888 | } | |
889 | } | |
890 | } | |
891 | ||
892 | /* default: no change */ | |
893 | DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input")); | |
894 | /* fall through */ | |
895 | ||
896 | single: | |
897 | if (bw != NULL) { | |
898 | DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp); | |
899 | } | |
900 | return cp; | |
901 | } | |
902 | ||
903 | /* | |
904 | * Case conversion helper, with context/local sensitivity. | |
905 | * For proper case conversion, one needs to know the character | |
906 | * and the preceding and following characters, as well as | |
907 | * locale/language. | |
908 | */ | |
909 | ||
910 | /* XXX: add 'language' argument when locale/language sensitive rule | |
911 | * support added. | |
912 | */ | |
913 | DUK_LOCAL | |
914 | duk_codepoint_t duk__case_transform_helper(duk_hthread *thr, | |
915 | duk_bufwriter_ctx *bw, | |
916 | duk_codepoint_t cp, | |
917 | duk_codepoint_t prev, | |
918 | duk_codepoint_t next, | |
919 | duk_bool_t uppercase) { | |
920 | duk_bitdecoder_ctx bd_ctx; | |
921 | ||
922 | /* fast path for ASCII */ | |
923 | if (cp < 0x80L) { | |
924 | /* XXX: there are language sensitive rules for the ASCII range. | |
925 | * If/when language/locale support is implemented, they need to | |
926 | * be implemented here for the fast path. There are no context | |
927 | * sensitive rules for ASCII range. | |
928 | */ | |
929 | ||
930 | if (uppercase) { | |
931 | if (cp >= 'a' && cp <= 'z') { | |
932 | cp = cp - 'a' + 'A'; | |
933 | } | |
934 | } else { | |
935 | if (cp >= 'A' && cp <= 'Z') { | |
936 | cp = cp - 'A' + 'a'; | |
937 | } | |
938 | } | |
939 | ||
940 | if (bw != NULL) { | |
941 | DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp); | |
942 | } | |
943 | return cp; | |
944 | } | |
945 | ||
946 | /* context and locale specific rules which cannot currently be represented | |
947 | * in the caseconv bitstream: hardcoded rules in C | |
948 | */ | |
949 | if (uppercase) { | |
950 | /* XXX: turkish / azeri */ | |
951 | } else { | |
952 | /* | |
953 | * Final sigma context specific rule. This is a rather tricky | |
954 | * rule and this handling is probably not 100% correct now. | |
955 | * The rule is not locale/language specific so it is supported. | |
956 | */ | |
957 | ||
958 | if (cp == 0x03a3L && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */ | |
959 | duk_unicode_is_letter(prev) && /* prev exists and is not a letter */ | |
960 | !duk_unicode_is_letter(next)) { /* next does not exist or next is not a letter */ | |
961 | /* Capital sigma occurred at "end of word", lowercase to | |
962 | * U+03C2 = GREEK SMALL LETTER FINAL SIGMA. Otherwise | |
963 | * fall through and let the normal rules lowercase it to | |
964 | * U+03C3 = GREEK SMALL LETTER SIGMA. | |
965 | */ | |
966 | cp = 0x03c2L; | |
967 | goto singlechar; | |
968 | } | |
969 | ||
970 | /* XXX: lithuanian not implemented */ | |
971 | /* XXX: lithuanian, explicit dot rules */ | |
972 | /* XXX: turkish / azeri, lowercase rules */ | |
973 | } | |
974 | ||
975 | /* 1:1 or special conversions, but not locale/context specific: script generated rules */ | |
976 | DUK_MEMZERO(&bd_ctx, sizeof(bd_ctx)); | |
977 | if (uppercase) { | |
978 | bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_uc; | |
979 | bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_uc); | |
980 | } else { | |
981 | bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_lc; | |
982 | bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc); | |
983 | } | |
984 | return duk__slow_case_conversion(thr, bw, cp, &bd_ctx); | |
985 | ||
986 | singlechar: | |
987 | if (bw != NULL) { | |
988 | DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp); | |
989 | } | |
990 | return cp; | |
991 | ||
992 | /* unused now, not needed until Turkish/Azeri */ | |
993 | #if 0 | |
994 | nochar: | |
995 | return -1; | |
996 | #endif | |
997 | } | |
998 | ||
999 | /* | |
1000 | * Replace valstack top with case converted version. | |
1001 | */ | |
1002 | ||
1003 | DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) { | |
1004 | duk_context *ctx = (duk_context *) thr; | |
1005 | duk_hstring *h_input; | |
1006 | duk_bufwriter_ctx bw_alloc; | |
1007 | duk_bufwriter_ctx *bw; | |
1008 | const duk_uint8_t *p, *p_start, *p_end; | |
1009 | duk_codepoint_t prev, curr, next; | |
1010 | ||
1011 | h_input = duk_require_hstring(ctx, -1); | |
1012 | DUK_ASSERT(h_input != NULL); | |
1013 | ||
1014 | bw = &bw_alloc; | |
1015 | DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input)); | |
1016 | ||
1017 | /* [ ... input buffer ] */ | |
1018 | ||
1019 | p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input); | |
1020 | p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input); | |
1021 | p = p_start; | |
1022 | ||
1023 | prev = -1; DUK_UNREF(prev); | |
1024 | curr = -1; | |
1025 | next = -1; | |
1026 | for (;;) { | |
1027 | prev = curr; | |
1028 | curr = next; | |
1029 | next = -1; | |
1030 | if (p < p_end) { | |
1031 | next = (int) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end); | |
1032 | } else { | |
1033 | /* end of input and last char has been processed */ | |
1034 | if (curr < 0) { | |
1035 | break; | |
1036 | } | |
1037 | } | |
1038 | ||
1039 | /* on first round, skip */ | |
1040 | if (curr >= 0) { | |
1041 | /* XXX: could add a fast path to process chunks of input codepoints, | |
1042 | * but relative benefit would be quite small. | |
1043 | */ | |
1044 | ||
1045 | /* Ensure space for maximum multi-character result; estimate is overkill. */ | |
1046 | DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH); | |
1047 | ||
1048 | duk__case_transform_helper(thr, | |
1049 | bw, | |
1050 | (duk_codepoint_t) curr, | |
1051 | prev, | |
1052 | next, | |
1053 | uppercase); | |
1054 | } | |
1055 | } | |
1056 | ||
1057 | DUK_BW_COMPACT(thr, bw); | |
1058 | duk_to_string(ctx, -1); /* invalidates h_buf pointer */ | |
1059 | duk_remove(ctx, -2); | |
1060 | } | |
1061 | ||
1062 | #ifdef DUK_USE_REGEXP_SUPPORT | |
1063 | ||
1064 | /* | |
1065 | * Canonicalize() abstract operation needed for canonicalization of individual | |
1066 | * codepoints during regexp compilation and execution, see E5 Section 15.10.2.8. | |
1067 | * Note that codepoints are canonicalized one character at a time, so no context | |
1068 | * specific rules can apply. Locale specific rules can apply, though. | |
1069 | */ | |
1070 | ||
1071 | DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) { | |
1072 | #if defined(DUK_USE_REGEXP_CANON_WORKAROUND) | |
1073 | /* Fast canonicalization lookup at the cost of 128kB footprint. */ | |
1074 | DUK_ASSERT(cp >= 0); | |
1075 | DUK_UNREF(thr); | |
1076 | if (DUK_LIKELY(cp < 0x10000L)) { | |
1077 | return (duk_codepoint_t) duk_unicode_re_canon_lookup[cp]; | |
1078 | } | |
1079 | return cp; | |
1080 | #else /* DUK_USE_REGEXP_CANON_WORKAROUND */ | |
1081 | duk_codepoint_t y; | |
1082 | ||
1083 | y = duk__case_transform_helper(thr, | |
1084 | NULL, /* NULL is allowed, no output */ | |
1085 | cp, /* curr char */ | |
1086 | -1, /* prev char */ | |
1087 | -1, /* next char */ | |
1088 | 1); /* uppercase */ | |
1089 | ||
1090 | if ((y < 0) || (cp >= 0x80 && y < 0x80)) { | |
1091 | /* multiple codepoint conversion or non-ASCII mapped to ASCII | |
1092 | * --> leave as is. | |
1093 | */ | |
1094 | return cp; | |
1095 | } | |
1096 | ||
1097 | return y; | |
1098 | #endif /* DUK_USE_REGEXP_CANON_WORKAROUND */ | |
1099 | } | |
1100 | ||
1101 | /* | |
1102 | * E5 Section 15.10.2.6 "IsWordChar" abstract operation. Assume | |
1103 | * x < 0 for characters read outside the string. | |
1104 | */ | |
1105 | ||
1106 | DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) { | |
1107 | /* | |
1108 | * Note: the description in E5 Section 15.10.2.6 has a typo, it | |
1109 | * contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_]. | |
1110 | */ | |
1111 | if ((x >= '0' && x <= '9') || | |
1112 | (x >= 'a' && x <= 'z') || | |
1113 | (x >= 'A' && x <= 'Z') || | |
1114 | (x == '_')) { | |
1115 | return 1; | |
1116 | } | |
1117 | return 0; | |
1118 | } | |
1119 | ||
1120 | /* | |
1121 | * Regexp range tables | |
1122 | */ | |
1123 | ||
1124 | /* exposed because lexer needs these too */ | |
1125 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = { | |
1126 | (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL, | |
1127 | }; | |
1128 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = { | |
1129 | (duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL, | |
1130 | (duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL, | |
1131 | (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL, | |
1132 | (duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL, | |
1133 | (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL, | |
1134 | (duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL, | |
1135 | (duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL, | |
1136 | (duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL, | |
1137 | (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL, | |
1138 | (duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL, | |
1139 | (duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL, | |
1140 | }; | |
1141 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = { | |
1142 | (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL, | |
1143 | (duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL, | |
1144 | (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL, | |
1145 | (duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL, | |
1146 | }; | |
1147 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = { | |
1148 | (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL, | |
1149 | (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL, | |
1150 | }; | |
1151 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = { | |
1152 | (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL, | |
1153 | (duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL, | |
1154 | (duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL, | |
1155 | (duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL, | |
1156 | (duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL, | |
1157 | (duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL, | |
1158 | (duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL, | |
1159 | (duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL, | |
1160 | (duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL, | |
1161 | (duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL, | |
1162 | (duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL, | |
1163 | (duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL, | |
1164 | }; | |
1165 | DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = { | |
1166 | (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL, | |
1167 | (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL, | |
1168 | (duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL, | |
1169 | (duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL, | |
1170 | (duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL, | |
1171 | }; | |
1172 | ||
1173 | #endif /* DUK_USE_REGEXP_SUPPORT */ |