]>
Commit | Line | Data |
---|---|---|
14b0e578 CS |
1 | /**********************************************************************\r |
2 | utf16_le.c - Oniguruma (regular expression library)\r | |
3 | **********************************************************************/\r | |
4 | /*-\r | |
b26691c4 | 5 | * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r |
14b0e578 CS |
6 | * All rights reserved.\r |
7 | *\r | |
8 | * Redistribution and use in source and binary forms, with or without\r | |
9 | * modification, are permitted provided that the following conditions\r | |
10 | * are met:\r | |
11 | * 1. Redistributions of source code must retain the above copyright\r | |
12 | * notice, this list of conditions and the following disclaimer.\r | |
13 | * 2. Redistributions in binary form must reproduce the above copyright\r | |
14 | * notice, this list of conditions and the following disclaimer in the\r | |
15 | * documentation and/or other materials provided with the distribution.\r | |
16 | *\r | |
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND\r | |
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE\r | |
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r | |
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS\r | |
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)\r | |
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\r | |
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\r | |
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r | |
27 | * SUCH DAMAGE.\r | |
28 | */\r | |
b602265d | 29 | #include "regint.h" /* for USE_CALLOUT */\r |
14b0e578 | 30 | \r |
b602265d DG |
31 | static int\r |
32 | init(void)\r | |
33 | {\r | |
34 | #ifdef USE_CALLOUT\r | |
35 | \r | |
36 | int id;\r | |
37 | OnigEncoding enc;\r | |
38 | char* name;\r | |
39 | unsigned int args[4];\r | |
40 | OnigValue opts[4];\r | |
41 | \r | |
42 | enc = ONIG_ENCODING_UTF16_LE;\r | |
43 | \r | |
44 | name = "F\000A\000I\000L\000\000\000"; BC0_P(name, fail);\r | |
45 | name = "M\000I\000S\000M\000A\000T\000C\000H\000\000\000"; BC0_P(name, mismatch);\r | |
46 | \r | |
47 | name = "M\000A\000X\000\000\000";\r | |
48 | args[0] = ONIG_TYPE_TAG | ONIG_TYPE_LONG;\r | |
49 | args[1] = ONIG_TYPE_CHAR;\r | |
50 | opts[0].c = 'X';\r | |
51 | BC_B_O(name, max, 2, args, 1, opts);\r | |
52 | \r | |
53 | name = "E\000R\000R\000O\000R\000\000\000";\r | |
54 | args[0] = ONIG_TYPE_LONG; opts[0].l = ONIG_ABORT;\r | |
55 | BC_P_O(name, error, 1, args, 1, opts);\r | |
56 | \r | |
57 | name = "C\000O\000U\000N\000T\000\000\000";\r | |
58 | args[0] = ONIG_TYPE_CHAR; opts[0].c = '>';\r | |
59 | BC_B_O(name, count, 1, args, 1, opts);\r | |
60 | \r | |
61 | name = "T\000O\000T\000A\000L\000_\000C\000O\000U\000N\000T\000\000\000";\r | |
62 | args[0] = ONIG_TYPE_CHAR; opts[0].c = '>';\r | |
63 | BC_B_O(name, total_count, 1, args, 1, opts);\r | |
64 | \r | |
65 | name = "C\000M\000P\000\000\000";\r | |
66 | args[0] = ONIG_TYPE_TAG | ONIG_TYPE_LONG;\r | |
67 | args[1] = ONIG_TYPE_STRING;\r | |
68 | args[2] = ONIG_TYPE_TAG | ONIG_TYPE_LONG;\r | |
69 | BC_P(name, cmp, 3, args);\r | |
70 | \r | |
71 | #endif /* USE_CALLOUT */\r | |
72 | \r | |
73 | return ONIG_NORMAL;\r | |
74 | }\r | |
14b0e578 CS |
75 | \r |
/*
 * Encoded length in bytes, indexed by one byte value (0x00-0xff).
 * Entries 0xd8-0xdb hold 4; every other entry holds 2.
 */
static const int EncLen_UTF16[] = {
  /* 0x00-0x0f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10-0x1f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20-0x2f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30-0x3f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40-0x4f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50-0x5f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60-0x6f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70-0x7f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80-0x8f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90-0x9f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0-0xaf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0-0xbf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0-0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0-0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0-0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0-0xff */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
94 | \r | |
95 | static int\r | |
96 | utf16le_code_to_mbclen(OnigCodePoint code)\r | |
97 | {\r | |
b26691c4 LG |
98 | if (code > 0xffff) {\r |
99 | if (code > 0x10ffff)\r | |
100 | return ONIGERR_INVALID_CODE_POINT_VALUE;\r | |
101 | else\r | |
102 | return 4;\r | |
103 | }\r | |
104 | else {\r | |
105 | return 2;\r | |
106 | }\r | |
14b0e578 CS |
107 | }\r |
108 | \r | |
109 | static int\r | |
110 | utf16le_mbc_enc_len(const UChar* p)\r | |
111 | {\r | |
112 | return EncLen_UTF16[*(p+1)];\r | |
113 | }\r | |
114 | \r | |
b602265d DG |
115 | static int\r |
116 | is_valid_mbc_string(const UChar* p, const UChar* end)\r | |
117 | {\r | |
118 | const UChar* end1 = end - 1;\r | |
119 | \r | |
120 | while (p < end1) {\r | |
b26691c4 LG |
121 | int len = utf16le_mbc_enc_len(p);\r |
122 | if (len == 4) {\r | |
123 | if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3)))\r | |
124 | return FALSE;\r | |
125 | }\r | |
126 | else\r | |
127 | if (UTF16_IS_SURROGATE_SECOND(*(p + 1)))\r | |
128 | return FALSE;\r | |
129 | \r | |
130 | p += len;\r | |
b602265d DG |
131 | }\r |
132 | \r | |
133 | if (p != end)\r | |
134 | return FALSE;\r | |
135 | else\r | |
136 | return TRUE;\r | |
137 | }\r | |
138 | \r | |
14b0e578 CS |
139 | static int\r |
140 | utf16le_is_mbc_newline(const UChar* p, const UChar* end)\r | |
141 | {\r | |
142 | if (p + 1 < end) {\r | |
143 | if (*p == 0x0a && *(p+1) == 0x00)\r | |
144 | return 1;\r | |
145 | #ifdef USE_UNICODE_ALL_LINE_TERMINATORS\r | |
146 | if ((\r | |
147 | #ifndef USE_CRNL_AS_LINE_TERMINATOR\r | |
b602265d | 148 | *p == 0x0d ||\r |
14b0e578 | 149 | #endif\r |
b602265d | 150 | *p == 0x85) && *(p+1) == 0x00)\r |
14b0e578 | 151 | return 1;\r |
b602265d | 152 | \r |
14b0e578 CS |
153 | if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))\r |
154 | return 1;\r | |
155 | #endif\r | |
156 | }\r | |
157 | return 0;\r | |
158 | }\r | |
159 | \r | |
160 | static OnigCodePoint\r | |
161 | utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)\r | |
162 | {\r | |
163 | OnigCodePoint code;\r | |
164 | UChar c0 = *p;\r | |
165 | UChar c1 = *(p+1);\r | |
166 | \r | |
167 | if (UTF16_IS_SURROGATE_FIRST(c1)) {\r | |
168 | code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16)\r | |
169 | + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)\r | |
170 | + p[2];\r | |
171 | }\r | |
172 | else {\r | |
173 | code = c1 * 256 + p[0];\r | |
174 | }\r | |
175 | return code;\r | |
176 | }\r | |
177 | \r | |
178 | static int\r | |
179 | utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)\r | |
180 | {\r | |
181 | UChar* p = buf;\r | |
182 | \r | |
183 | if (code > 0xffff) {\r | |
184 | unsigned int plane, high;\r | |
185 | \r | |
186 | plane = (code >> 16) - 1;\r | |
187 | high = (code & 0xff00) >> 8;\r | |
188 | \r | |
b602265d DG |
189 | *p++ = ((plane & 0x03) << 6) + (high >> 2);\r |
190 | *p++ = (plane >> 2) + 0xd8;\r | |
14b0e578 CS |
191 | *p++ = (UChar )(code & 0xff);\r |
192 | *p = (high & 0x03) + 0xdc;\r | |
193 | return 4;\r | |
194 | }\r | |
195 | else {\r | |
196 | *p++ = (UChar )(code & 0xff);\r | |
197 | *p++ = (UChar )((code & 0xff00) >> 8);\r | |
198 | return 2;\r | |
199 | }\r | |
200 | }\r | |
201 | \r | |
202 | static int\r | |
203 | utf16le_mbc_case_fold(OnigCaseFoldType flag,\r | |
b26691c4 | 204 | const UChar** pp, const UChar* end, UChar* fold)\r |
14b0e578 CS |
205 | {\r |
206 | const UChar* p = *pp;\r | |
207 | \r | |
208 | if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {\r | |
209 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
210 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r | |
211 | if (*p == 0x49) {\r | |
b602265d DG |
212 | *fold++ = 0x31;\r |
213 | *fold = 0x01;\r | |
214 | (*pp) += 2;\r | |
215 | return 2;\r | |
14b0e578 CS |
216 | }\r |
217 | }\r | |
218 | #endif\r | |
219 | \r | |
220 | *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);\r | |
221 | *fold = 0;\r | |
222 | *pp += 2;\r | |
223 | return 2;\r | |
224 | }\r | |
225 | else\r | |
226 | return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE, flag, pp, end,\r | |
b26691c4 | 227 | fold);\r |
14b0e578 CS |
228 | }\r |
229 | \r | |
#if 0
/*
 * Disabled (never compiled).
 *
 * Advances *pp by the length of the character it points to and returns
 * TRUE or FALSE for that character.  Only characters whose second byte
 * is zero can yield TRUE.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;
  int c, v;

  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) != 0)
    return FALSE;

  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
    return TRUE;

  c = *p;
  v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                     (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

  /* NOTE(review): OR-ing with BIT_CTYPE_LOWER makes this test true
     whenever that constant is non-zero, which would leave the final
     return unreachable; `&` may have been intended -- confirm before
     re-enabling this function. */
  if ((v | BIT_CTYPE_LOWER) != 0) {
    /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    return (c >= 0xaa && c <= 0xba) ? FALSE : TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
262 | \r | |
263 | static UChar*\r | |
264 | utf16le_left_adjust_char_head(const UChar* start, const UChar* s)\r | |
265 | {\r | |
266 | if (s <= start) return (UChar* )s;\r | |
267 | \r | |
268 | if ((s - start) % 2 == 1) {\r | |
269 | s--;\r | |
270 | }\r | |
271 | \r | |
b26691c4 LG |
272 | if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 &&\r |
273 | UTF16_IS_SURROGATE_FIRST(*(s-1)))\r | |
14b0e578 CS |
274 | s -= 2;\r |
275 | \r | |
276 | return (UChar* )s;\r | |
277 | }\r | |
278 | \r | |
279 | static int\r | |
280 | utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,\r | |
281 | const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])\r | |
282 | {\r | |
283 | return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE,\r | |
b26691c4 | 284 | flag, p, end, items);\r |
14b0e578 CS |
285 | }\r |
286 | \r | |
287 | OnigEncodingType OnigEncodingUTF16_LE = {\r | |
288 | utf16le_mbc_enc_len,\r | |
289 | "UTF-16LE", /* name */\r | |
b602265d DG |
290 | 4, /* max enc length */\r |
291 | 2, /* min enc length */\r | |
14b0e578 CS |
292 | utf16le_is_mbc_newline,\r |
293 | utf16le_mbc_to_code,\r | |
294 | utf16le_code_to_mbclen,\r | |
295 | utf16le_code_to_mbc,\r | |
296 | utf16le_mbc_case_fold,\r | |
297 | onigenc_unicode_apply_all_case_fold,\r | |
298 | utf16le_get_case_fold_codes_by_str,\r | |
299 | onigenc_unicode_property_name_to_ctype,\r | |
300 | onigenc_unicode_is_code_ctype,\r | |
301 | onigenc_utf16_32_get_ctype_code_range,\r | |
302 | utf16le_left_adjust_char_head,\r | |
b602265d DG |
303 | onigenc_always_false_is_allowed_reverse_match,\r |
304 | init,\r | |
305 | 0, /* is_initialized */\r | |
306 | is_valid_mbc_string,\r | |
b26691c4 | 307 | ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1,\r |
b602265d | 308 | 0, 0\r |
14b0e578 | 309 | };\r |