]>
Commit | Line | Data |
---|---|---|
b602265d DG |
1 | /**********************************************************************\r |
2 | unicode.c - Oniguruma (regular expression library)\r | |
3 | **********************************************************************/\r | |
4 | /*-\r | |
b26691c4 | 5 | * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r |
b602265d DG |
6 | * All rights reserved.\r |
7 | *\r | |
8 | * Redistribution and use in source and binary forms, with or without\r | |
9 | * modification, are permitted provided that the following conditions\r | |
10 | * are met:\r | |
11 | * 1. Redistributions of source code must retain the above copyright\r | |
12 | * notice, this list of conditions and the following disclaimer.\r | |
13 | * 2. Redistributions in binary form must reproduce the above copyright\r | |
14 | * notice, this list of conditions and the following disclaimer in the\r | |
15 | * documentation and/or other materials provided with the distribution.\r | |
16 | *\r | |
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND\r | |
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE\r | |
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r | |
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS\r | |
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)\r | |
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\r | |
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\r | |
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r | |
27 | * SUCH DAMAGE.\r | |
28 | */\r | |
29 | \r | |
30 | #include "regint.h"\r | |
31 | \r | |
/*
 * One (name, ctype) record.  Both members are plain short integers.
 */
struct PoolPropertyNameCtype {
  short name;   /* presumably an offset into a property-name string pool -- TODO confirm against the generated property data */
  short ctype;  /* ctype value paired with that name */
};
36 | \r | |
/*
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype)
 *   Non-zero when the table entry for `code` has the bit CTYPE_TO_BIT(ctype)
 *   set.  `code` is used directly as an index into the 256-entry table below,
 *   so it must be in the range 0..255; the macro itself performs no range
 *   check.
 */
37 | #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \\r | |
38 | ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)\r | |
39 | \r | |
/*
 * Per-code ctype bit masks, one unsigned short for each of the 256 index
 * values (eight entries per row, so row k covers codes 8*k .. 8*k+7).
 * Each bit position corresponds to one ctype as mapped by CTYPE_TO_BIT().
 * NOTE(review): the name indicates the indices are ISO-8859-1 / Latin-1 code
 * points -- confirm against the callers.
 */
40 | static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {\r | |
41 | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r | |
42 | 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,\r | |
43 | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r | |
44 | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r | |
45 | 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r | |
46 | 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r | |
47 | 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,\r | |
48 | 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r | |
49 | 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,\r | |
50 | 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,\r | |
51 | 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,\r | |
52 | 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,\r | |
53 | 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,\r | |
54 | 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,\r | |
55 | 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,\r | |
56 | 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,\r | |
57 | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,\r | |
58 | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r | |
59 | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r | |
60 | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r | |
61 | 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,\r | |
62 | 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,\r | |
63 | 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,\r | |
64 | 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,\r | |
65 | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,\r | |
66 | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,\r | |
67 | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,\r | |
68 | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,\r | |
69 | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,\r | |
70 | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,\r | |
71 | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,\r | |
72 | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2\r | |
73 | };\r | |
74 | \r | |
75 | #include "st.h"\r | |
76 | \r | |
77 | #include "unicode_fold_data.c"\r | |
78 | \r | |
79 | extern int\r | |
80 | onigenc_unicode_mbc_case_fold(OnigEncoding enc,\r | |
81 | OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,\r | |
82 | UChar* fold)\r | |
83 | {\r | |
84 | const struct ByUnfoldKey* buk;\r | |
85 | \r | |
86 | OnigCodePoint code;\r | |
87 | int i, len, rlen;\r | |
88 | const UChar *p = *pp;\r | |
89 | \r | |
90 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
91 | len = enclen(enc, p);\r | |
92 | *pp += len;\r | |
93 | \r | |
94 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
95 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r | |
96 | if (code == 0x0130) {\r | |
97 | return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);\r | |
98 | }\r | |
99 | #if 0\r | |
100 | if (code == 0x0049) {\r | |
101 | return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);\r | |
102 | }\r | |
103 | #endif\r | |
104 | }\r | |
105 | #endif\r | |
106 | \r | |
107 | buk = onigenc_unicode_unfold_key(code);\r | |
108 | if (buk != 0) {\r | |
109 | if (buk->fold_len == 1) {\r | |
110 | return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);\r | |
111 | }\r | |
112 | else {\r | |
113 | OnigCodePoint* addr;\r | |
114 | \r | |
115 | FOLDS_FOLD_ADDR_BUK(buk, addr);\r | |
116 | rlen = 0;\r | |
117 | for (i = 0; i < buk->fold_len; i++) {\r | |
118 | OnigCodePoint c = addr[i];\r | |
119 | len = ONIGENC_CODE_TO_MBC(enc, c, fold);\r | |
120 | fold += len;\r | |
121 | rlen += len;\r | |
122 | }\r | |
123 | return rlen;\r | |
124 | }\r | |
125 | }\r | |
126 | \r | |
127 | for (i = 0; i < len; i++) {\r | |
128 | *fold++ = *p++;\r | |
129 | }\r | |
130 | return len;\r | |
131 | }\r | |
132 | \r | |
133 | static int\r | |
134 | apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r | |
135 | {\r | |
136 | int i, j, k, n, r;\r | |
137 | \r | |
138 | for (i = from; i < to; ) {\r | |
139 | OnigCodePoint fold = *FOLDS1_FOLD(i);\r | |
140 | n = FOLDS1_UNFOLDS_NUM(i);\r | |
141 | for (j = 0; j < n; j++) {\r | |
142 | OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];\r | |
143 | \r | |
144 | r = (*f)(fold, &unfold, 1, arg);\r | |
145 | if (r != 0) return r;\r | |
146 | r = (*f)(unfold, &fold, 1, arg);\r | |
147 | if (r != 0) return r;\r | |
148 | \r | |
149 | for (k = 0; k < j; k++) {\r | |
150 | OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];\r | |
151 | r = (*f)(unfold, &unfold2, 1, arg);\r | |
152 | if (r != 0) return r;\r | |
153 | r = (*f)(unfold2, &unfold, 1, arg);\r | |
154 | if (r != 0) return r;\r | |
155 | }\r | |
156 | }\r | |
157 | \r | |
158 | i = FOLDS1_NEXT_INDEX(i);\r | |
159 | }\r | |
160 | \r | |
161 | return 0;\r | |
162 | }\r | |
163 | \r | |
164 | static int\r | |
165 | apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r | |
166 | {\r | |
167 | int i, j, k, n, r;\r | |
168 | \r | |
169 | for (i = from; i < to; ) {\r | |
170 | OnigCodePoint* fold = FOLDS2_FOLD(i);\r | |
171 | n = FOLDS2_UNFOLDS_NUM(i);\r | |
172 | for (j = 0; j < n; j++) {\r | |
173 | OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];\r | |
174 | \r | |
175 | r = (*f)(unfold, fold, 2, arg);\r | |
176 | if (r != 0) return r;\r | |
177 | \r | |
178 | for (k = 0; k < j; k++) {\r | |
179 | OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];\r | |
180 | r = (*f)(unfold, &unfold2, 1, arg);\r | |
181 | if (r != 0) return r;\r | |
182 | r = (*f)(unfold2, &unfold, 1, arg);\r | |
183 | if (r != 0) return r;\r | |
184 | }\r | |
185 | }\r | |
186 | \r | |
187 | i = FOLDS2_NEXT_INDEX(i);\r | |
188 | }\r | |
189 | \r | |
190 | return 0;\r | |
191 | }\r | |
192 | \r | |
193 | static int\r | |
194 | apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r | |
195 | {\r | |
196 | int i, j, k, n, r;\r | |
197 | \r | |
198 | for (i = from; i < to; ) {\r | |
199 | OnigCodePoint* fold = FOLDS3_FOLD(i);\r | |
200 | n = FOLDS3_UNFOLDS_NUM(i);\r | |
201 | for (j = 0; j < n; j++) {\r | |
202 | OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];\r | |
203 | \r | |
204 | r = (*f)(unfold, fold, 3, arg);\r | |
205 | if (r != 0) return r;\r | |
206 | \r | |
207 | for (k = 0; k < j; k++) {\r | |
208 | OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];\r | |
209 | r = (*f)(unfold, &unfold2, 1, arg);\r | |
210 | if (r != 0) return r;\r | |
211 | r = (*f)(unfold2, &unfold, 1, arg);\r | |
212 | if (r != 0) return r;\r | |
213 | }\r | |
214 | }\r | |
215 | \r | |
216 | i = FOLDS3_NEXT_INDEX(i);\r | |
217 | }\r | |
218 | \r | |
219 | return 0;\r | |
220 | }\r | |
221 | \r | |
222 | extern int\r | |
223 | onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,\r | |
224 | OnigApplyAllCaseFoldFunc f, void* arg)\r | |
225 | {\r | |
226 | int r;\r | |
227 | \r | |
228 | r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);\r | |
229 | if (r != 0) return r;\r | |
230 | \r | |
231 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
232 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r | |
233 | code = 0x0131;\r | |
234 | r = (*f)(0x0049, &code, 1, arg);\r | |
235 | if (r != 0) return r;\r | |
236 | code = 0x0049;\r | |
237 | r = (*f)(0x0131, &code, 1, arg);\r | |
238 | if (r != 0) return r;\r | |
239 | \r | |
240 | code = 0x0130;\r | |
241 | r = (*f)(0x0069, &code, 1, arg);\r | |
242 | if (r != 0) return r;\r | |
243 | code = 0x0069;\r | |
244 | r = (*f)(0x0130, &code, 1, arg);\r | |
245 | if (r != 0) return r;\r | |
246 | }\r | |
247 | else {\r | |
248 | #endif\r | |
249 | r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);\r | |
250 | if (r != 0) return r;\r | |
251 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
252 | }\r | |
253 | #endif\r | |
254 | \r | |
255 | if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)\r | |
256 | return 0;\r | |
257 | \r | |
258 | r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);\r | |
259 | if (r != 0) return r;\r | |
260 | \r | |
261 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
262 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {\r | |
263 | #endif\r | |
264 | r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);\r | |
265 | if (r != 0) return r;\r | |
266 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
267 | }\r | |
268 | #endif\r | |
269 | \r | |
270 | r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);\r | |
271 | if (r != 0) return r;\r | |
272 | \r | |
273 | return 0;\r | |
274 | }\r | |
275 | \r | |
/*
 * Collect the case-fold alternatives for the text starting at p.
 *
 *   enc    encoding used to decode the input
 *   flag   case-fold options (INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR enables
 *          multi-code handling; ONIGENC_CASE_FOLD_TURKISH_AZERI is honoured
 *          only when USE_UNICODE_CASE_FOLD_TURKISH_AZERI is compiled in)
 *   p,end  input range; the character at p is always decoded, and up to two
 *          following characters are examined while they start before `end`
 *   items  output array.  Each entry written gets byte_len (number of input
 *          bytes the alternative stands for), code_len and code[0..code_len-1].
 *
 * Returns the number of entries written to items[].  The capacity of items[]
 * is not checked here; the caller must provide enough room.
 */
276 | extern int\r | |
277 | onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,\r | |
278 | OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,\r | |
279 | OnigCaseFoldCodeItem items[])\r | |
280 | {\r | |
281 | int n, m, i, j, k, len;\r | |
282 | OnigCodePoint code, codes[3];\r | |
283 | const struct ByUnfoldKey* buk;\r | |
284 | \r | |
/* n counts the entries written to items[] so far. */
285 | n = 0;\r | |
286 | \r | |
287 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
288 | len = enclen(enc, p);\r | |
289 | \r | |
/* Turkish/Azeri mode: each of U+0049, U+0130, U+0131 and U+0069 yields
   exactly one alternative and the function returns immediately. */
290 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
291 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r | |
292 | if (code == 0x0049) {\r | |
293 | items[0].byte_len = len;\r | |
294 | items[0].code_len = 1;\r | |
295 | items[0].code[0] = 0x0131;\r | |
296 | return 1;\r | |
297 | }\r | |
298 | else if (code == 0x0130) {\r | |
299 | items[0].byte_len = len;\r | |
300 | items[0].code_len = 1;\r | |
301 | items[0].code[0] = 0x0069;\r | |
302 | return 1;\r | |
303 | }\r | |
304 | else if (code == 0x0131) {\r | |
305 | items[0].byte_len = len;\r | |
306 | items[0].code_len = 1;\r | |
307 | items[0].code[0] = 0x0049;\r | |
308 | return 1;\r | |
309 | }\r | |
310 | else if (code == 0x0069) {\r | |
311 | items[0].byte_len = len;\r | |
312 | items[0].code_len = 1;\r | |
313 | items[0].code[0] = 0x0130;\r | |
314 | return 1;\r | |
315 | }\r | |
316 | }\r | |
317 | #endif\r | |
318 | \r | |
/* Look `code` up with onigenc_unicode_unfold_key(); a non-null result
   supplies the fold_len and table index used below. */
319 | buk = onigenc_unicode_unfold_key(code);\r | |
320 | if (buk != 0) {\r | |
/* Single-code fold: emit the folded code first, then every unfolded form
   listed for it other than `code` itself. */
321 | if (buk->fold_len == 1) {\r | |
322 | int un;\r | |
323 | items[0].byte_len = len;\r | |
324 | items[0].code_len = 1;\r | |
325 | items[0].code[0] = *FOLDS1_FOLD(buk->index);\r | |
326 | n++;\r | |
327 | \r | |
328 | un = FOLDS1_UNFOLDS_NUM(buk->index);\r | |
329 | for (i = 0; i < un; i++) {\r | |
330 | OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];\r | |
331 | if (unfold != code) {\r | |
332 | items[n].byte_len = len;\r | |
333 | items[n].code_len = 1;\r | |
334 | items[n].code[0] = unfold;\r | |
335 | n++;\r | |
336 | }\r | |
337 | }\r | |
/* From here on `code` holds the folded code point. */
338 | code = items[0].code[0]; /* for multi-code to unfold search. */\r | |
339 | }\r | |
/* Multi-code fold, expanded only when multi-char folding is enabled.  If the
   flag is clear, nothing is emitted for this key and control falls through to
   the flag check after this if/else, which returns n. */
340 | else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {\r | |
/* cs[fn][] holds, for position fn of the fold sequence, the folded code
   followed by its single-code unfolded forms; ncs[fn] is how many are set. */
341 | OnigCodePoint cs[3][4];\r | |
342 | int fn, ncs[3];\r | |
343 | \r | |
344 | if (buk->fold_len == 2) {\r | |
/* Other single code points sharing this 2-code fold. */
345 | m = FOLDS2_UNFOLDS_NUM(buk->index);\r | |
346 | for (i = 0; i < m; i++) {\r | |
347 | OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];\r | |
348 | if (unfold == code) continue;\r | |
349 | \r | |
350 | items[n].byte_len = len;\r | |
351 | items[n].code_len = 1;\r | |
352 | items[n].code[0] = unfold;\r | |
353 | n++;\r | |
354 | }\r | |
355 | \r | |
356 | for (fn = 0; fn < 2; fn++) {\r | |
357 | int index;\r | |
358 | cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];\r | |
359 | index = onigenc_unicode_fold1_key(&cs[fn][0]);\r | |
360 | if (index >= 0) {\r | |
/* NOTE: this inner `m` shadows the function-level `m`. */
361 | int m = FOLDS1_UNFOLDS_NUM(index);\r | |
362 | for (i = 0; i < m; i++) {\r | |
363 | cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];\r | |
364 | }\r | |
365 | ncs[fn] = m + 1;\r | |
366 | }\r | |
367 | else\r | |
368 | ncs[fn] = 1;\r | |
369 | }\r | |
370 | \r | |
/* Emit every combination of the per-position variants as a 2-code item. */
371 | for (i = 0; i < ncs[0]; i++) {\r | |
372 | for (j = 0; j < ncs[1]; j++) {\r | |
373 | items[n].byte_len = len;\r | |
374 | items[n].code_len = 2;\r | |
375 | items[n].code[0] = cs[0][i];\r | |
376 | items[n].code[1] = cs[1][j];\r | |
377 | n++;\r | |
378 | }\r | |
379 | }\r | |
380 | }\r | |
381 | else { /* fold_len == 3 */\r | |
/* Other single code points sharing this 3-code fold. */
382 | m = FOLDS3_UNFOLDS_NUM(buk->index);\r | |
383 | for (i = 0; i < m; i++) {\r | |
384 | OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];\r | |
385 | if (unfold == code) continue;\r | |
386 | \r | |
387 | items[n].byte_len = len;\r | |
388 | items[n].code_len = 1;\r | |
389 | items[n].code[0] = unfold;\r | |
390 | n++;\r | |
391 | }\r | |
392 | \r | |
393 | for (fn = 0; fn < 3; fn++) {\r | |
394 | int index;\r | |
395 | cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];\r | |
396 | index = onigenc_unicode_fold1_key(&cs[fn][0]);\r | |
397 | if (index >= 0) {\r | |
/* NOTE: this inner `m` shadows the function-level `m`. */
398 | int m = FOLDS1_UNFOLDS_NUM(index);\r | |
399 | for (i = 0; i < m; i++) {\r | |
400 | cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];\r | |
401 | }\r | |
402 | ncs[fn] = m + 1;\r | |
403 | }\r | |
404 | else\r | |
405 | ncs[fn] = 1;\r | |
406 | }\r | |
407 | \r | |
/* Emit every combination of the per-position variants as a 3-code item. */
408 | for (i = 0; i < ncs[0]; i++) {\r | |
409 | for (j = 0; j < ncs[1]; j++) {\r | |
410 | for (k = 0; k < ncs[2]; k++) {\r | |
411 | items[n].byte_len = len;\r | |
412 | items[n].code_len = 3;\r | |
413 | items[n].code[0] = cs[0][i];\r | |
414 | items[n].code[1] = cs[1][j];\r | |
415 | items[n].code[2] = cs[2][k];\r | |
416 | n++;\r | |
417 | }\r | |
418 | }\r | |
419 | }\r | |
420 | }\r | |
421 | \r | |
422 | /* multi char folded code is not head of another folded multi char */\r | |
423 | return n;\r | |
424 | }\r | |
425 | }\r | |
426 | else {\r | |
/* No unfold-key entry: look `code` up as a folded code instead and list the
   unfolded forms registered for it. */
427 | int index = onigenc_unicode_fold1_key(&code);\r | |
428 | if (index >= 0) {\r | |
/* NOTE: this inner `m` shadows the function-level `m`. */
429 | int m = FOLDS1_UNFOLDS_NUM(index);\r | |
430 | for (i = 0; i < m; i++) {\r | |
431 | items[n].byte_len = len;\r | |
432 | items[n].code_len = 1;\r | |
433 | items[n].code[0] = FOLDS1_UNFOLDS(index)[i];\r | |
434 | n++;\r | |
435 | }\r | |
436 | }\r | |
437 | }\r | |
438 | \r | |
/* Without multi-char folding, the single-character results are final. */
439 | if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)\r | |
440 | return n;\r | |
441 | \r | |
/* Look ahead: check whether this character plus the next one (and then the
   next two) match a 2-code or 3-code fold sequence.  Each following character
   is replaced by its single-code fold when it has one.  `len` accumulates the
   byte length of all characters consumed so far. */
442 | p += len;\r | |
443 | if (p < end) {\r | |
444 | int clen;\r | |
445 | int index;\r | |
446 | \r | |
447 | codes[0] = code;\r | |
448 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
449 | \r | |
450 | buk = onigenc_unicode_unfold_key(code);\r | |
451 | if (buk != 0 && buk->fold_len == 1) {\r | |
452 | codes[1] = *FOLDS1_FOLD(buk->index);\r | |
453 | }\r | |
454 | else\r | |
455 | codes[1] = code;\r | |
456 | \r | |
457 | clen = enclen(enc, p);\r | |
458 | len += clen;\r | |
459 | \r | |
/* Two-character sequence -> single code points that fold to it. */
460 | index = onigenc_unicode_fold2_key(codes);\r | |
461 | if (index >= 0) {\r | |
462 | m = FOLDS2_UNFOLDS_NUM(index);\r | |
463 | for (i = 0; i < m; i++) {\r | |
464 | items[n].byte_len = len;\r | |
465 | items[n].code_len = 1;\r | |
466 | items[n].code[0] = FOLDS2_UNFOLDS(index)[i];\r | |
467 | n++;\r | |
468 | }\r | |
469 | }\r | |
470 | \r | |
471 | p += clen;\r | |
472 | if (p < end) {\r | |
473 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
474 | buk = onigenc_unicode_unfold_key(code);\r | |
475 | if (buk != 0 && buk->fold_len == 1) {\r | |
476 | codes[2] = *FOLDS1_FOLD(buk->index);\r | |
477 | }\r | |
478 | else\r | |
479 | codes[2] = code;\r | |
480 | \r | |
481 | clen = enclen(enc, p);\r | |
482 | len += clen;\r | |
483 | \r | |
/* Three-character sequence -> single code points that fold to it. */
484 | index = onigenc_unicode_fold3_key(codes);\r | |
485 | if (index >= 0) {\r | |
486 | m = FOLDS3_UNFOLDS_NUM(index);\r | |
487 | for (i = 0; i < m; i++) {\r | |
488 | items[n].byte_len = len;\r | |
489 | items[n].code_len = 1;\r | |
490 | items[n].code[0] = FOLDS3_UNFOLDS(index)[i];\r | |
491 | n++;\r | |
492 | }\r | |
493 | }\r | |
494 | }\r | |
495 | }\r | |
496 | \r | |
497 | return n;\r | |
498 | }\r | |
499 | \r | |
500 | #ifdef USE_UNICODE_PROPERTIES\r | |
501 | #include "unicode_property_data.c"\r | |
502 | #else\r | |
503 | #include "unicode_property_data_posix.c"\r | |
504 | #endif\r | |
505 | \r | |
506 | \r | |
b26691c4 LG |
507 | #ifdef USE_UNICODE_WORD_BREAK\r |
508 | \r | |
/*
 * Word-break classes assigned to code points by the WB range table.
 * WB_Any must stay 0: the break-position code in this file compares types
 * against the literal 0 as a fast path, and uses WB_Any as the fallback for
 * code points that fall in no range.
 */
enum WB_TYPE {
  WB_Any                =  0,
  WB_ALetter            =  1,
  WB_CR                 =  2,
  WB_Double_Quote       =  3,
  WB_Extend             =  4,
  WB_ExtendNumLet       =  5,
  WB_Format             =  6,
  WB_Hebrew_Letter      =  7,
  WB_Katakana           =  8,
  WB_LF                 =  9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18
};
530 | \r | |
531 | typedef struct {\r | |
532 | OnigCodePoint start;\r | |
533 | OnigCodePoint end;\r | |
534 | enum WB_TYPE type;\r | |
535 | } WB_RANGE_TYPE;\r | |
536 | \r | |
537 | #include "unicode_wb_data.c"\r | |
538 | \r | |
539 | static enum WB_TYPE\r | |
540 | wb_get_type(OnigCodePoint code)\r | |
541 | {\r | |
542 | OnigCodePoint low, high, x;\r | |
543 | enum WB_TYPE type;\r | |
544 | \r | |
545 | for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {\r | |
546 | x = (low + high) >> 1;\r | |
547 | if (code > WB_RANGES[x].end)\r | |
548 | low = x + 1;\r | |
549 | else\r | |
550 | high = x;\r | |
551 | }\r | |
552 | \r | |
553 | type = (low < (OnigCodePoint )WB_RANGE_NUM &&\r | |
554 | code >= WB_RANGES[low].start) ?\r | |
555 | WB_RANGES[low].type : WB_Any;\r | |
556 | \r | |
557 | return type;\r | |
558 | }\r | |
559 | \r | |
560 | #define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)\r | |
561 | #define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)\r | |
562 | #define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)\r | |
563 | \r | |
564 | static int\r | |
565 | wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,\r | |
566 | OnigCodePoint* rcode, enum WB_TYPE* rtype)\r | |
567 | {\r | |
568 | OnigCodePoint code;\r | |
569 | enum WB_TYPE type;\r | |
570 | \r | |
571 | while (TRUE) {\r | |
572 | p += enclen(enc, p);\r | |
573 | if (p >= end) break;\r | |
574 | \r | |
575 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
576 | type = wb_get_type(code);\r | |
577 | if (! IS_WB_IGNORE_TAIL(type)) {\r | |
578 | *rcode = code;\r | |
579 | *rtype = type;\r | |
580 | return 1;\r | |
581 | }\r | |
582 | }\r | |
583 | \r | |
584 | return 0;\r | |
585 | }\r | |
586 | \r | |
/*
 * Decide whether there is a word boundary immediately before the character
 * at p, applying the WBn rules annotated inline in the order written.
 *
 *   enc    encoding used to decode characters
 *   p      position being tested
 *   prev   head of the character before p, or NULL to have it located here
 *          with onigenc_get_prev_char_head()
 *   start  start of the text (lower bound for backward scans)
 *   end    end of the text (upper bound for decoding and forward scans)
 *
 * Returns TRUE when a break is allowed at p, FALSE otherwise.
 *
 * `from`/`cfrom` describe the character before p and `to`/`cto` the character
 * at p.  Several rules walk the local `prev` pointer further back to find the
 * character two positions before the boundary (`from2`); `from` itself is not
 * changed by those scans.
 */
587 | extern int\r | |
588 | onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,\r | |
589 | const UChar* start, const UChar* end)\r | |
590 | {\r | |
591 | int r;\r | |
592 | UChar* pp;\r | |
593 | OnigCodePoint cfrom;\r | |
594 | OnigCodePoint cfrom2;\r | |
595 | OnigCodePoint cto;\r | |
596 | OnigCodePoint cto2;\r | |
597 | enum WB_TYPE from;\r | |
598 | enum WB_TYPE from2;\r | |
599 | enum WB_TYPE to;\r | |
600 | enum WB_TYPE to2;\r | |
601 | \r | |
602 | /* WB1: sot / Any */\r | |
603 | if (p == start) return TRUE;\r | |
604 | /* WB2: Any / eot */\r | |
605 | if (p == end) return TRUE;\r | |
606 | \r | |
/* Caller did not supply the previous character: locate it; if none can be
   found, report a break. */
607 | if (IS_NULL(prev)) {\r | |
608 | prev = onigenc_get_prev_char_head(enc, start, p);\r | |
609 | if (IS_NULL(prev)) return TRUE;\r | |
610 | }\r | |
611 | \r | |
612 | cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
613 | cto = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
614 | \r | |
615 | from = wb_get_type(cfrom);\r | |
616 | to = wb_get_type(cto);\r | |
617 | \r | |
/* Both classes are 0 (WB_Any): no rule below can apply, so break. */
618 | /* short cut */\r | |
619 | if (from == 0 && to == 0) goto WB999;\r | |
620 | \r | |
621 | /* WB3: CR + LF */\r | |
622 | if (from == WB_CR && to == WB_LF) return FALSE;\r | |
623 | \r | |
624 | /* WB3a: (Newline|CR|LF) / */\r | |
625 | if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;\r | |
626 | /* WB3b: / (Newline|CR|LF) */\r | |
627 | if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;\r | |
628 | \r | |
629 | /* WB3c: ZWJ + {Extended_Pictographic} */\r | |
630 | if (from == WB_ZWJ) {\r | |
631 | if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r | |
632 | return FALSE;\r | |
633 | }\r | |
634 | \r | |
635 | /* WB3d: WSegSpace + WSegSpace */\r | |
636 | if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;\r | |
637 | \r | |
/* Never break before Extend/Format/ZWJ.  If the previous character is one of
   those, walk back so that `from` describes the character it is attached to;
   if the start of text is reached first, `from` keeps the last class seen. */
638 | /* WB4: X (Extend|Format|ZWJ)* -> X */\r | |
639 | if (IS_WB_IGNORE_TAIL(to)) return FALSE;\r | |
640 | if (IS_WB_IGNORE_TAIL(from)) {\r | |
641 | while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
642 | prev = pp;\r | |
643 | cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
644 | from = wb_get_type(cfrom);\r | |
645 | if (! IS_WB_IGNORE_TAIL(from))\r | |
646 | break;\r | |
647 | }\r | |
648 | }\r | |
649 | \r | |
650 | if (IS_WB_AHLetter(from)) {\r | |
651 | /* WB5: AHLetter + AHLetter */\r | |
652 | if (IS_WB_AHLetter(to)) return FALSE;\r | |
653 | \r | |
/* Look ahead past p (skipping Extend/Format/ZWJ) for the following letter. */
654 | /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */\r | |
655 | if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {\r | |
656 | r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r | |
657 | if (r == 1) {\r | |
658 | if (IS_WB_AHLetter(to2)) return FALSE;\r | |
659 | }\r | |
660 | }\r | |
661 | }\r | |
662 | \r | |
/* Look behind `prev` (skipping Extend/Format/ZWJ) for the preceding letter;
   from2 stays WB_Any when there is no earlier character.  This moves `prev`. */
663 | /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */\r | |
664 | if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {\r | |
665 | if (IS_WB_AHLetter(to)) {\r | |
666 | from2 = WB_Any;\r | |
667 | while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
668 | prev = pp;\r | |
669 | cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
670 | from2 = wb_get_type(cfrom2);\r | |
671 | if (! IS_WB_IGNORE_TAIL(from2))\r | |
672 | break;\r | |
673 | }\r | |
674 | \r | |
675 | if (IS_WB_AHLetter(from2)) return FALSE;\r | |
676 | }\r | |
677 | }\r | |
678 | \r | |
679 | if (from == WB_Hebrew_Letter) {\r | |
680 | /* WB7a: Hebrew_Letter + Single_Quote */\r | |
681 | if (to == WB_Single_Quote) return FALSE;\r | |
682 | \r | |
683 | /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */\r | |
684 | if (to == WB_Double_Quote) {\r | |
685 | r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r | |
686 | if (r == 1) {\r | |
687 | if (to2 == WB_Hebrew_Letter) return FALSE;\r | |
688 | }\r | |
689 | }\r | |
690 | }\r | |
691 | \r | |
/* Same backward scan as WB7, looking for a Hebrew letter before the quote. */
692 | /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */\r | |
693 | if (from == WB_Double_Quote) {\r | |
694 | if (to == WB_Hebrew_Letter) {\r | |
695 | from2 = WB_Any;\r | |
696 | while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
697 | prev = pp;\r | |
698 | cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
699 | from2 = wb_get_type(cfrom2);\r | |
700 | if (! IS_WB_IGNORE_TAIL(from2))\r | |
701 | break;\r | |
702 | }\r | |
703 | \r | |
704 | if (from2 == WB_Hebrew_Letter) return FALSE;\r | |
705 | }\r | |
706 | }\r | |
707 | \r | |
708 | if (to == WB_Numeric) {\r | |
709 | /* WB8: Numeric + Numeric */\r | |
710 | if (from == WB_Numeric) return FALSE;\r | |
711 | \r | |
712 | /* WB9: AHLetter + Numeric */\r | |
713 | if (IS_WB_AHLetter(from)) return FALSE;\r | |
714 | \r | |
/* Same backward scan as WB7, looking for a digit before the separator. */
715 | /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */\r | |
716 | if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {\r | |
717 | from2 = WB_Any;\r | |
718 | while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
719 | prev = pp;\r | |
720 | cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
721 | from2 = wb_get_type(cfrom2);\r | |
722 | if (! IS_WB_IGNORE_TAIL(from2))\r | |
723 | break;\r | |
724 | }\r | |
725 | \r | |
726 | if (from2 == WB_Numeric) return FALSE;\r | |
727 | }\r | |
728 | }\r | |
729 | \r | |
730 | if (from == WB_Numeric) {\r | |
731 | /* WB10: Numeric + AHLetter */\r | |
732 | if (IS_WB_AHLetter(to)) return FALSE;\r | |
733 | \r | |
734 | /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */\r | |
735 | if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {\r | |
736 | r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r | |
737 | if (r == 1) {\r | |
738 | if (to2 == WB_Numeric) return FALSE;\r | |
739 | }\r | |
740 | }\r | |
741 | }\r | |
742 | \r | |
743 | /* WB13: Katakana + Katakana */\r | |
744 | if (from == WB_Katakana && to == WB_Katakana) return FALSE;\r | |
745 | \r | |
746 | /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */\r | |
747 | if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana\r | |
748 | || from == WB_ExtendNumLet) {\r | |
749 | if (to == WB_ExtendNumLet) return FALSE;\r | |
750 | }\r | |
751 | \r | |
752 | /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */\r | |
753 | if (from == WB_ExtendNumLet) {\r | |
754 | if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)\r | |
755 | return FALSE;\r | |
756 | }\r | |
757 | \r | |
758 | \r | |
/* n counts the Regional_Indicator characters immediately before `prev`.
   When n is even, `prev` and p form a pair and must not be split.
   NOTE(review): unlike the WB4 handling above, this backward scan does not
   skip Extend/Format/ZWJ between indicators -- confirm that is intended. */
759 | /* WB15: sot (RI RI)* RI + RI */\r | |
760 | /* WB16: [^RI] (RI RI)* RI + RI */\r | |
761 | if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {\r | |
762 | int n = 0;\r | |
763 | while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
764 | cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
765 | from2 = wb_get_type(cfrom2);\r | |
766 | if (from2 != WB_Regional_Indicator)\r | |
767 | break;\r | |
768 | \r | |
769 | n++;\r | |
770 | }\r | |
771 | if ((n % 2) == 0) return FALSE;\r | |
772 | }\r | |
773 | \r | |
774 | WB999:\r | |
775 | /* WB999: Any / Any */\r | |
776 | return TRUE;\r | |
777 | }\r | |
778 | \r | |
779 | #endif /* USE_UNICODE_WORD_BREAK */\r | |
780 | \r | |
781 | \r | |
b602265d DG |
782 | #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r |
783 | \r | |
/*
 * Result of comparing two adjacent code points for an extended grapheme
 * cluster boundary.  The two UNDEF values mean the pair alone is not enough
 * and the caller has to inspect earlier characters.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK = 0,      /* no boundary between the two code points */
  EGCB_BREAK,              /* 1: boundary */
  EGCB_BREAK_UNDEF_GB11,   /* 2: ZWJ followed by Extended_Pictographic */
  EGCB_BREAK_UNDEF_RI_RI   /* 3: two Regional_Indicator code points */
};
790 | \r | |
/*
 * Grapheme-cluster-break classes assigned by the EGCB range table.
 *
 * The numeric order matters to the helper macros in this file:
 *   - IS_CONTROL_CR_LF() tests the contiguous range EGCB_CR..EGCB_Control;
 *   - IS_HANGUL() tests for values >= EGCB_L, so the Hangul classes must
 *     remain the highest-numbered ones.
 * Values 9..12 are intentionally unused; they belonged to the obsolete
 * classes E_Base, E_Base_GAZ, E_Modifier and Glue_After_Zwj.
 */
enum EGCB_TYPE {
  EGCB_Other = 0,
  EGCB_CR,                   /*  1 */
  EGCB_LF,                   /*  2 */
  EGCB_Control,              /*  3 */
  EGCB_Extend,               /*  4 */
  EGCB_Prepend,              /*  5 */
  EGCB_Regional_Indicator,   /*  6 */
  EGCB_SpacingMark,          /*  7 */
  EGCB_ZWJ,                  /*  8 */

  EGCB_L = 13,
  EGCB_LV,                   /* 14 */
  EGCB_LVT,                  /* 15 */
  EGCB_T,                    /* 16 */
  EGCB_V                     /* 17 */
};
814 | \r | |
815 | typedef struct {\r | |
816 | OnigCodePoint start;\r | |
817 | OnigCodePoint end;\r | |
818 | enum EGCB_TYPE type;\r | |
819 | } EGCB_RANGE_TYPE;\r | |
820 | \r | |
821 | #include "unicode_egcb_data.c"\r | |
822 | \r | |
823 | static enum EGCB_TYPE\r | |
824 | egcb_get_type(OnigCodePoint code)\r | |
825 | {\r | |
826 | OnigCodePoint low, high, x;\r | |
827 | enum EGCB_TYPE type;\r | |
828 | \r | |
829 | for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {\r | |
830 | x = (low + high) >> 1;\r | |
831 | if (code > EGCB_RANGES[x].end)\r | |
832 | low = x + 1;\r | |
833 | else\r | |
834 | high = x;\r | |
835 | }\r | |
836 | \r | |
837 | type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&\r | |
838 | code >= EGCB_RANGES[low].start) ?\r | |
839 | EGCB_RANGES[low].type : EGCB_Other;\r | |
840 | \r | |
841 | return type;\r | |
842 | }\r | |
843 | \r | |
844 | #define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)\r | |
845 | #define IS_HANGUL(code) ((code) >= EGCB_L)\r | |
846 | \r | |
847 | /* GB1 and GB2 are outside of this function. */\r | |
848 | static enum EGCB_BREAK_TYPE\r | |
849 | unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)\r | |
850 | {\r | |
851 | enum EGCB_TYPE from;\r | |
852 | enum EGCB_TYPE to;\r | |
853 | \r | |
854 | from = egcb_get_type(from_code);\r | |
855 | to = egcb_get_type(to_code);\r | |
856 | \r | |
857 | /* short cut */\r | |
858 | if (from == 0 && to == 0) goto GB999;\r | |
859 | \r | |
860 | /* GB3 */\r | |
861 | if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;\r | |
862 | /* GB4 */\r | |
863 | if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;\r | |
864 | /* GB5 */\r | |
865 | if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;\r | |
866 | \r | |
867 | if (IS_HANGUL(from) && IS_HANGUL(to)) {\r | |
868 | /* GB6 */\r | |
869 | if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;\r | |
870 | /* GB7 */\r | |
871 | if ((from == EGCB_LV || from == EGCB_V)\r | |
872 | && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;\r | |
873 | \r | |
874 | /* GB8 */\r | |
875 | if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))\r | |
876 | return EGCB_NOT_BREAK;\r | |
877 | \r | |
878 | goto GB999;\r | |
879 | }\r | |
880 | \r | |
881 | /* GB9 */\r | |
882 | if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;\r | |
883 | \r | |
884 | /* GB9a */\r | |
885 | if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;\r | |
886 | /* GB9b */\r | |
887 | if (from == EGCB_Prepend) return EGCB_NOT_BREAK;\r | |
888 | \r | |
889 | /* GB10 removed */\r | |
890 | \r | |
891 | /* GB11 */\r | |
892 | if (from == EGCB_ZWJ) {\r | |
893 | if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r | |
894 | return EGCB_BREAK_UNDEF_GB11;\r | |
895 | \r | |
896 | goto GB999;\r | |
897 | }\r | |
898 | \r | |
899 | /* GB12, GB13 */\r | |
900 | if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {\r | |
901 | return EGCB_BREAK_UNDEF_RI_RI;\r | |
902 | }\r | |
903 | \r | |
904 | GB999:\r | |
905 | return EGCB_BREAK;\r | |
906 | }\r | |
907 | \r | |
908 | #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */\r | |
909 | \r | |
910 | extern int\r | |
911 | onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,\r | |
912 | const UChar* start, const UChar* end)\r | |
913 | {\r | |
914 | OnigCodePoint from;\r | |
915 | OnigCodePoint to;\r | |
916 | #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r | |
917 | enum EGCB_BREAK_TYPE btype;\r | |
918 | enum EGCB_TYPE type;\r | |
919 | #endif\r | |
920 | \r | |
921 | /* GB1 and GB2 */\r | |
922 | if (p == start) return 1;\r | |
923 | if (p == end) return 1;\r | |
924 | \r | |
925 | if (IS_NULL(prev)) {\r | |
926 | prev = onigenc_get_prev_char_head(enc, start, p);\r | |
927 | if (IS_NULL(prev)) return 1;\r | |
928 | }\r | |
929 | \r | |
930 | from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
931 | to = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
932 | \r | |
933 | #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r | |
934 | if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {\r | |
b26691c4 | 935 | return from != 0x000d || to != 0x000a;\r |
b602265d DG |
936 | }\r |
937 | \r | |
938 | btype = unicode_egcb_is_break_2code(from, to);\r | |
939 | switch (btype) {\r | |
940 | case EGCB_NOT_BREAK:\r | |
941 | return 0;\r | |
942 | break;\r | |
943 | case EGCB_BREAK:\r | |
944 | return 1;\r | |
945 | break;\r | |
946 | \r | |
947 | case EGCB_BREAK_UNDEF_GB11:\r | |
948 | while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
949 | from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
950 | if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r | |
951 | return 0;\r | |
952 | \r | |
953 | type = egcb_get_type(from);\r | |
954 | if (type != EGCB_Extend)\r | |
955 | break;\r | |
956 | }\r | |
957 | break;\r | |
958 | \r | |
959 | case EGCB_BREAK_UNDEF_RI_RI:\r | |
960 | {\r | |
961 | int n = 0;\r | |
962 | while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r | |
963 | from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r | |
964 | type = egcb_get_type(from);\r | |
965 | if (type != EGCB_Regional_Indicator)\r | |
966 | break;\r | |
967 | \r | |
968 | n++;\r | |
969 | }\r | |
970 | if ((n % 2) == 0) return 0;\r | |
971 | }\r | |
972 | break;\r | |
973 | }\r | |
974 | \r | |
975 | return 1;\r | |
976 | \r | |
977 | #else\r | |
b26691c4 | 978 | return from != 0x000d || to != 0x000a;\r |
b602265d DG |
979 | #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */\r |
980 | }\r | |
981 | \r | |
982 | \r | |
983 | #define USER_DEFINED_PROPERTY_MAX_NUM 20\r | |
984 | \r | |
985 | typedef struct {\r | |
986 | int ctype;\r | |
987 | OnigCodePoint* ranges;\r | |
988 | } UserDefinedPropertyValue;\r | |
989 | \r | |
990 | static int UserDefinedPropertyNum;\r | |
991 | static UserDefinedPropertyValue\r | |
992 | UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];\r | |
993 | static st_table* UserDefinedPropertyTable;\r | |
994 | \r | |
995 | extern int\r | |
996 | onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)\r | |
997 | {\r | |
998 | UserDefinedPropertyValue* e;\r | |
999 | int r;\r | |
1000 | int i;\r | |
1001 | int n;\r | |
1002 | int len;\r | |
1003 | int c;\r | |
1004 | char* s;\r | |
b26691c4 | 1005 | UChar* uname;\r |
b602265d DG |
1006 | \r |
1007 | if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)\r | |
1008 | return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;\r | |
1009 | \r | |
1010 | len = (int )strlen_s(name,MAX_STRING_SIZE);\r | |
1011 | if (len >= PROPERTY_NAME_MAX_SIZE)\r | |
1012 | return ONIGERR_TOO_LONG_PROPERTY_NAME;\r | |
1013 | \r | |
1014 | s = (char* )xmalloc(len + 1);\r | |
1015 | if (s == 0)\r | |
1016 | return ONIGERR_MEMORY;\r | |
1017 | \r | |
b26691c4 | 1018 | uname = (UChar* )name;\r |
b602265d DG |
1019 | n = 0;\r |
1020 | for (i = 0; i < len; i++) {\r | |
b26691c4 LG |
1021 | c = uname[i];\r |
1022 | if (c < 0x20 || c >= 0x80) {\r | |
b602265d DG |
1023 | xfree(s);\r |
1024 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r | |
1025 | }\r | |
1026 | \r | |
1027 | if (c != ' ' && c != '-' && c != '_') {\r | |
1028 | s[n] = c;\r | |
1029 | n++;\r | |
1030 | }\r | |
1031 | }\r | |
1032 | s[n] = '\0';\r | |
1033 | \r | |
1034 | if (UserDefinedPropertyTable == 0) {\r | |
1035 | UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);\r | |
a5def177 DG |
1036 | if (IS_NULL(UserDefinedPropertyTable)) {\r |
1037 | xfree(s);\r | |
1038 | return ONIGERR_MEMORY;\r | |
1039 | }\r | |
b602265d DG |
1040 | }\r |
1041 | \r | |
1042 | e = UserDefinedPropertyRanges + UserDefinedPropertyNum;\r | |
1043 | e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;\r | |
1044 | e->ranges = ranges;\r | |
1045 | r = onig_st_insert_strend(UserDefinedPropertyTable,\r | |
1046 | (const UChar* )s, (const UChar* )s + n,\r | |
1047 | (hash_data_type )((void* )e));\r | |
1048 | if (r < 0) return r;\r | |
1049 | \r | |
1050 | UserDefinedPropertyNum++;\r | |
1051 | return 0;\r | |
1052 | }\r | |
1053 | \r | |
1054 | extern int\r | |
1055 | onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)\r | |
1056 | {\r | |
1057 | if (\r | |
1058 | #ifdef USE_UNICODE_PROPERTIES\r | |
1059 | ctype <= ONIGENC_MAX_STD_CTYPE &&\r | |
1060 | #endif\r | |
1061 | code < 256) {\r | |
1062 | return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);\r | |
1063 | }\r | |
1064 | \r | |
1065 | if (ctype >= CODE_RANGES_NUM) {\r | |
1066 | int index = ctype - CODE_RANGES_NUM;\r | |
1067 | if (index < UserDefinedPropertyNum)\r | |
1068 | return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);\r | |
1069 | else\r | |
1070 | return ONIGERR_TYPE_BUG;\r | |
1071 | }\r | |
1072 | \r | |
1073 | return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);\r | |
1074 | }\r | |
1075 | \r | |
1076 | \r | |
1077 | extern int\r | |
1078 | onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])\r | |
1079 | {\r | |
1080 | if (ctype >= CODE_RANGES_NUM) {\r | |
1081 | int index = ctype - CODE_RANGES_NUM;\r | |
1082 | if (index < UserDefinedPropertyNum) {\r | |
1083 | *ranges = UserDefinedPropertyRanges[index].ranges;\r | |
1084 | return 0;\r | |
1085 | }\r | |
1086 | else\r | |
1087 | return ONIGERR_TYPE_BUG;\r | |
1088 | }\r | |
1089 | \r | |
1090 | *ranges = CodeRanges[ctype];\r | |
1091 | return 0;\r | |
1092 | }\r | |
1093 | \r | |
1094 | extern int\r | |
1095 | onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,\r | |
1096 | const OnigCodePoint* ranges[])\r | |
1097 | {\r | |
1098 | *sb_out = 0x00;\r | |
1099 | return onigenc_unicode_ctype_code_range(ctype, ranges);\r | |
1100 | }\r | |
1101 | \r | |
1102 | extern int\r | |
1103 | onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)\r | |
1104 | {\r | |
1105 | int len;\r | |
1106 | UChar *p;\r | |
1107 | OnigCodePoint code;\r | |
1108 | const struct PoolPropertyNameCtype* pc;\r | |
1109 | char buf[PROPERTY_NAME_MAX_SIZE];\r | |
1110 | \r | |
1111 | p = name;\r | |
1112 | len = 0;\r | |
1113 | while (p < end) {\r | |
1114 | code = ONIGENC_MBC_TO_CODE(enc, p, end);\r | |
1115 | if (code >= 0x80)\r | |
1116 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r | |
1117 | \r | |
1118 | if (code != ' ' && code != '-' && code != '_') {\r | |
1119 | buf[len++] = (char )code;\r | |
1120 | if (len >= PROPERTY_NAME_MAX_SIZE)\r | |
1121 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r | |
1122 | }\r | |
1123 | \r | |
1124 | p += enclen(enc, p);\r | |
1125 | }\r | |
1126 | \r | |
1127 | buf[len] = 0;\r | |
1128 | \r | |
1129 | if (UserDefinedPropertyTable != 0) {\r | |
1130 | UserDefinedPropertyValue* e;\r | |
1131 | e = (UserDefinedPropertyValue* )NULL;\r | |
1132 | onig_st_lookup_strend(UserDefinedPropertyTable,\r | |
1133 | (const UChar* )buf, (const UChar* )buf + len,\r | |
1134 | (hash_data_type* )((void* )(&e)));\r | |
1135 | if (e != 0) {\r | |
1136 | return e->ctype;\r | |
1137 | }\r | |
1138 | }\r | |
1139 | \r | |
1140 | pc = unicode_lookup_property_name(buf, len);\r | |
1141 | if (pc != 0) {\r | |
1142 | /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */\r | |
1143 | #ifndef USE_UNICODE_PROPERTIES\r | |
1144 | if (pc->ctype > ONIGENC_MAX_STD_CTYPE)\r | |
1145 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r | |
1146 | #endif\r | |
1147 | \r | |
1148 | return (int )pc->ctype;\r | |
1149 | }\r | |
1150 | \r | |
1151 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r | |
1152 | }\r |