]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Unicode helpers | |
3 | */ | |
4 | ||
5 | #ifndef DUK_UNICODE_H_INCLUDED | |
6 | #define DUK_UNICODE_H_INCLUDED | |
7 | ||
8 | /* | |
9 | * UTF-8 / XUTF-8 / CESU-8 constants: maximum encoded length, in bytes, of a single codepoint | |
10 | */ | |
11 | ||
12 | #define DUK_UNICODE_MAX_XUTF8_LENGTH 7 /* up to 36-bit codepoints */ | |
13 | #define DUK_UNICODE_MAX_XUTF8_BMP_LENGTH 3 /* all codepoints up to U+FFFF */ | |
14 | #define DUK_UNICODE_MAX_CESU8_LENGTH 6 /* all codepoints up to U+10FFFF */ | |
15 | #define DUK_UNICODE_MAX_CESU8_BMP_LENGTH 3 /* all codepoints up to U+FFFF */ | |
16 | ||
17 | /* | |
18 | * Useful Unicode codepoints | |
19 | * | |
20 | * Integer constants must be signed to avoid unexpected coercions | |
21 | * in comparisons; hence the 'L' suffix (signed long) rather than 'UL'. | |
22 | */ | |
23 | ||
24 | #define DUK_UNICODE_CP_ZWNJ 0x200cL /* zero-width non-joiner */ | |
25 | #define DUK_UNICODE_CP_ZWJ 0x200dL /* zero-width joiner */ | |
26 | #define DUK_UNICODE_CP_REPLACEMENT_CHARACTER 0xfffdL /* U+FFFD; see http://en.wikipedia.org/wiki/Replacement_character#Replacement_character */ | |
27 | ||
28 | /* | |
29 | * ASCII character constants | |
30 | * | |
31 | * C character literals like 'x' have a platform-specific value and do | |
32 | * not match ASCII (UTF-8) values on e.g. EBCDIC platforms. So, use | |
33 | * these (admittedly awkward) constants instead. These constants must | |
34 | * also have signed values (unsuffixed hex literals that fit in an int) to avoid unexpected coercions in comparisons. | |
35 | * | |
36 | * http://en.wikipedia.org/wiki/ASCII | |
37 | */ | |
38 | ||
39 | #define DUK_ASC_NUL 0x00 /* null */ | |
40 | #define DUK_ASC_SOH 0x01 /* start of heading */ | |
41 | #define DUK_ASC_STX 0x02 /* start of text */ | |
42 | #define DUK_ASC_ETX 0x03 /* end of text */ | |
43 | #define DUK_ASC_EOT 0x04 /* end of transmission */ | |
44 | #define DUK_ASC_ENQ 0x05 /* enquiry */ | |
45 | #define DUK_ASC_ACK 0x06 /* acknowledge */ | |
46 | #define DUK_ASC_BEL 0x07 /* bell */ | |
47 | #define DUK_ASC_BS 0x08 /* backspace */ | |
48 | #define DUK_ASC_HT 0x09 /* horizontal tab */ | |
49 | #define DUK_ASC_LF 0x0a /* line feed */ | |
50 | #define DUK_ASC_VT 0x0b /* vertical tab */ | |
51 | #define DUK_ASC_FF 0x0c /* form feed */ | |
52 | #define DUK_ASC_CR 0x0d /* carriage return */ | |
53 | #define DUK_ASC_SO 0x0e /* shift out */ | |
54 | #define DUK_ASC_SI 0x0f /* shift in */ | |
55 | #define DUK_ASC_DLE 0x10 /* data link escape */ | |
56 | #define DUK_ASC_DC1 0x11 /* device control 1 */ | |
57 | #define DUK_ASC_DC2 0x12 /* device control 2 */ | |
58 | #define DUK_ASC_DC3 0x13 /* device control 3 */ | |
59 | #define DUK_ASC_DC4 0x14 /* device control 4 */ | |
60 | #define DUK_ASC_NAK 0x15 /* negative acknowledge */ | |
61 | #define DUK_ASC_SYN 0x16 /* synchronous idle */ | |
62 | #define DUK_ASC_ETB 0x17 /* end of transmission block */ | |
63 | #define DUK_ASC_CAN 0x18 /* cancel */ | |
64 | #define DUK_ASC_EM 0x19 /* end of medium */ | |
65 | #define DUK_ASC_SUB 0x1a /* substitute */ | |
66 | #define DUK_ASC_ESC 0x1b /* escape */ | |
67 | #define DUK_ASC_FS 0x1c /* file separator */ | |
68 | #define DUK_ASC_GS 0x1d /* group separator */ | |
69 | #define DUK_ASC_RS 0x1e /* record separator */ | |
70 | #define DUK_ASC_US 0x1f /* unit separator */ | |
71 | #define DUK_ASC_SPACE 0x20 | |
72 | #define DUK_ASC_EXCLAMATION 0x21 | |
73 | #define DUK_ASC_DOUBLEQUOTE 0x22 | |
74 | #define DUK_ASC_HASH 0x23 | |
75 | #define DUK_ASC_DOLLAR 0x24 | |
76 | #define DUK_ASC_PERCENT 0x25 | |
77 | #define DUK_ASC_AMP 0x26 | |
78 | #define DUK_ASC_SINGLEQUOTE 0x27 | |
79 | #define DUK_ASC_LPAREN 0x28 | |
80 | #define DUK_ASC_RPAREN 0x29 | |
81 | #define DUK_ASC_STAR 0x2a | |
82 | #define DUK_ASC_PLUS 0x2b | |
83 | #define DUK_ASC_COMMA 0x2c | |
84 | #define DUK_ASC_MINUS 0x2d | |
85 | #define DUK_ASC_PERIOD 0x2e | |
86 | #define DUK_ASC_SLASH 0x2f | |
87 | #define DUK_ASC_0 0x30 | |
88 | #define DUK_ASC_1 0x31 | |
89 | #define DUK_ASC_2 0x32 | |
90 | #define DUK_ASC_3 0x33 | |
91 | #define DUK_ASC_4 0x34 | |
92 | #define DUK_ASC_5 0x35 | |
93 | #define DUK_ASC_6 0x36 | |
94 | #define DUK_ASC_7 0x37 | |
95 | #define DUK_ASC_8 0x38 | |
96 | #define DUK_ASC_9 0x39 | |
97 | #define DUK_ASC_COLON 0x3a | |
98 | #define DUK_ASC_SEMICOLON 0x3b | |
99 | #define DUK_ASC_LANGLE 0x3c | |
100 | #define DUK_ASC_EQUALS 0x3d | |
101 | #define DUK_ASC_RANGLE 0x3e | |
102 | #define DUK_ASC_QUESTION 0x3f | |
103 | #define DUK_ASC_ATSIGN 0x40 | |
104 | #define DUK_ASC_UC_A 0x41 | |
105 | #define DUK_ASC_UC_B 0x42 | |
106 | #define DUK_ASC_UC_C 0x43 | |
107 | #define DUK_ASC_UC_D 0x44 | |
108 | #define DUK_ASC_UC_E 0x45 | |
109 | #define DUK_ASC_UC_F 0x46 | |
110 | #define DUK_ASC_UC_G 0x47 | |
111 | #define DUK_ASC_UC_H 0x48 | |
112 | #define DUK_ASC_UC_I 0x49 | |
113 | #define DUK_ASC_UC_J 0x4a | |
114 | #define DUK_ASC_UC_K 0x4b | |
115 | #define DUK_ASC_UC_L 0x4c | |
116 | #define DUK_ASC_UC_M 0x4d | |
117 | #define DUK_ASC_UC_N 0x4e | |
118 | #define DUK_ASC_UC_O 0x4f | |
119 | #define DUK_ASC_UC_P 0x50 | |
120 | #define DUK_ASC_UC_Q 0x51 | |
121 | #define DUK_ASC_UC_R 0x52 | |
122 | #define DUK_ASC_UC_S 0x53 | |
123 | #define DUK_ASC_UC_T 0x54 | |
124 | #define DUK_ASC_UC_U 0x55 | |
125 | #define DUK_ASC_UC_V 0x56 | |
126 | #define DUK_ASC_UC_W 0x57 | |
127 | #define DUK_ASC_UC_X 0x58 | |
128 | #define DUK_ASC_UC_Y 0x59 | |
129 | #define DUK_ASC_UC_Z 0x5a | |
130 | #define DUK_ASC_LBRACKET 0x5b | |
131 | #define DUK_ASC_BACKSLASH 0x5c | |
132 | #define DUK_ASC_RBRACKET 0x5d | |
133 | #define DUK_ASC_CARET 0x5e | |
134 | #define DUK_ASC_UNDERSCORE 0x5f | |
135 | #define DUK_ASC_GRAVE 0x60 | |
136 | #define DUK_ASC_LC_A 0x61 | |
137 | #define DUK_ASC_LC_B 0x62 | |
138 | #define DUK_ASC_LC_C 0x63 | |
139 | #define DUK_ASC_LC_D 0x64 | |
140 | #define DUK_ASC_LC_E 0x65 | |
141 | #define DUK_ASC_LC_F 0x66 | |
142 | #define DUK_ASC_LC_G 0x67 | |
143 | #define DUK_ASC_LC_H 0x68 | |
144 | #define DUK_ASC_LC_I 0x69 | |
145 | #define DUK_ASC_LC_J 0x6a | |
146 | #define DUK_ASC_LC_K 0x6b | |
147 | #define DUK_ASC_LC_L 0x6c | |
148 | #define DUK_ASC_LC_M 0x6d | |
149 | #define DUK_ASC_LC_N 0x6e | |
150 | #define DUK_ASC_LC_O 0x6f | |
151 | #define DUK_ASC_LC_P 0x70 | |
152 | #define DUK_ASC_LC_Q 0x71 | |
153 | #define DUK_ASC_LC_R 0x72 | |
154 | #define DUK_ASC_LC_S 0x73 | |
155 | #define DUK_ASC_LC_T 0x74 | |
156 | #define DUK_ASC_LC_U 0x75 | |
157 | #define DUK_ASC_LC_V 0x76 | |
158 | #define DUK_ASC_LC_W 0x77 | |
159 | #define DUK_ASC_LC_X 0x78 | |
160 | #define DUK_ASC_LC_Y 0x79 | |
161 | #define DUK_ASC_LC_Z 0x7a | |
162 | #define DUK_ASC_LCURLY 0x7b | |
163 | #define DUK_ASC_PIPE 0x7c | |
164 | #define DUK_ASC_RCURLY 0x7d | |
165 | #define DUK_ASC_TILDE 0x7e | |
166 | #define DUK_ASC_DEL 0x7f /* delete */ | |
167 | ||
168 | /* | |
169 | * Unicode tables: sized extern declarations only. When DUK_USE_SOURCE_NONBMP is defined the '_noa' tables are declared, otherwise the smaller '_noabmp' tables are declared instead. | |
170 | */ | |
171 | ||
172 | #ifdef DUK_USE_SOURCE_NONBMP | |
173 | /* | |
174 | * Automatically generated by extract_chars.py, do not edit! | |
175 | */ | |
176 | ||
177 | extern const duk_uint8_t duk_unicode_ids_noa[791]; | |
178 | #else /* !DUK_USE_SOURCE_NONBMP */ | |
179 | /* | |
180 | * Automatically generated by extract_chars.py, do not edit! | |
181 | */ | |
182 | ||
183 | extern const duk_uint8_t duk_unicode_ids_noabmp[611]; | |
184 | #endif /* DUK_USE_SOURCE_NONBMP */ | |
185 | ||
186 | #ifdef DUK_USE_SOURCE_NONBMP | |
187 | /* | |
188 | * Automatically generated by extract_chars.py, do not edit! | |
189 | */ | |
190 | ||
191 | extern const duk_uint8_t duk_unicode_ids_m_let_noa[42]; | |
192 | #else /* !DUK_USE_SOURCE_NONBMP */ | |
193 | /* | |
194 | * Automatically generated by extract_chars.py, do not edit! | |
195 | */ | |
196 | ||
197 | extern const duk_uint8_t duk_unicode_ids_m_let_noabmp[24]; | |
198 | #endif /* DUK_USE_SOURCE_NONBMP */ | |
199 | ||
200 | #ifdef DUK_USE_SOURCE_NONBMP | |
201 | /* | |
202 | * Automatically generated by extract_chars.py, do not edit! | |
203 | */ | |
204 | ||
205 | extern const duk_uint8_t duk_unicode_idp_m_ids_noa[397]; | |
206 | #else /* !DUK_USE_SOURCE_NONBMP */ | |
207 | /* | |
208 | * Automatically generated by extract_chars.py, do not edit! | |
209 | */ | |
210 | ||
211 | extern const duk_uint8_t duk_unicode_idp_m_ids_noabmp[348]; | |
212 | #endif /* DUK_USE_SOURCE_NONBMP */ | |
213 | ||
214 | /* | |
215 | * Automatically generated by extract_caseconv.py, do not edit! | |
216 | */ | |
217 | ||
218 | extern const duk_uint8_t duk_unicode_caseconv_uc[1288]; | |
219 | extern const duk_uint8_t duk_unicode_caseconv_lc[616]; | |
220 | ||
221 | /* | |
222 | * Extern declarations for shared lookup tables; emitted only when DUK_SINGLE_FILE is not defined | |
223 | */ | |
224 | ||
225 | /* duk_unicode_support.c (presumably where these tables are defined -- confirm) */ | |
226 | #if !defined(DUK_SINGLE_FILE) | |
227 | DUK_INTERNAL_DECL duk_uint8_t duk_unicode_xutf8_markers[7]; | |
228 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_digit[2]; | |
229 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_white[22]; | |
230 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_wordchar[8]; | |
231 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_not_digit[4]; | |
232 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_not_white[24]; | |
233 | DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_not_wordchar[10]; | |
234 | #endif /* !DUK_SINGLE_FILE */ | |
235 | ||
236 | /* | |
237 | * Prototypes (duk_unicode_get_cesu8_length() is declared only when DUK_USE_ASSERTIONS is defined) | |
238 | */ | |
239 | ||
240 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp); | |
241 | #if defined(DUK_USE_ASSERTIONS) | |
242 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp); | |
243 | #endif /* DUK_USE_ASSERTIONS */ | |
244 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out); | |
245 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out); | |
246 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp); | |
247 | DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end); | |
248 | DUK_INTERNAL_DECL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen); | |
249 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp); | |
250 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp); | |
251 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp); | |
252 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp); | |
253 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp); | |
254 | DUK_INTERNAL_DECL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase); | |
255 | DUK_INTERNAL_DECL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp); | |
256 | DUK_INTERNAL_DECL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp); | |
257 | ||
258 | #endif /* DUK_UNICODE_H_INCLUDED */ |