1 /**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 struct PoolPropertyNameCtype
{
/* True when the table entry for `code` has the CTYPE_TO_BIT(ctype) bit set.
 * `code` indexes EncUNICODE_ISO_8859_1_CtypeTable directly, and that table is
 * declared with 256 entries, so callers must pass a code below 256. */
37 #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
38 ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
/* Ctype bit masks, one unsigned short per code, for codes 0x00 through 0xFF
 * (256 entries laid out eight per row).  An entry is tested against
 * CTYPE_TO_BIT(ctype) by ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE. */
40 static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable
[256] = {
41 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
42 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
43 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
44 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
45 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
46 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
47 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
48 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
49 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
50 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
51 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
52 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
53 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
54 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
55 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
56 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
57 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
58 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
59 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
60 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
61 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
62 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
63 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
64 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
65 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
66 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
67 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
68 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
69 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
70 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
71 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
72 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
77 #include "unicode_fold_data.c"
/*
 * Case-folds one character and encodes the result with ONIGENC_CODE_TO_MBC.
 *
 * enc  - encoding used to decode the input and to encode the folded codes.
 * flag - case-fold options; only consulted for the Turkish/Azeri mappings
 *        when USE_UNICODE_CASE_FOLD_TURKISH_AZERI is defined.
 * pp   - pointer to the input position.
 * end  - end of the input, passed to ONIGENC_MBC_TO_CODE as the bound.
 *
 * A single-code fold is encoded and its ONIGENC_CODE_TO_MBC result returned
 * directly; a multi-code fold encodes each code of the folded sequence.
 * NOTE(review): the remaining parameters, the local declarations and the
 * tail of the function are not visible here - confirm against the full file.
 */
80 onigenc_unicode_mbc_case_fold(OnigEncoding enc
,
81 OnigCaseFoldType flag ARG_UNUSED
, const UChar
** pp
, const UChar
* end
,
84 const struct ByUnfoldKey
* buk
;
/* Decode the code point at p, bounded by end. */
90 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Turkish/Azeri option: 0x0049 is emitted as 0x0131.
 * NOTE(review): the guard for the 0x0069 result is not visible here. */
95 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
97 return ONIGENC_CODE_TO_MBC(enc
, 0x0069, fold
);
100 if (code
== 0x0049) {
101 return ONIGENC_CODE_TO_MBC(enc
, 0x0131, fold
);
/* Fetch the ByUnfoldKey entry for the decoded code. */
107 buk
= onigenc_unicode_unfold_key(code
);
/* Single-code fold: encode it straight into fold. */
109 if (buk
->fold_len
== 1) {
110 return ONIGENC_CODE_TO_MBC(enc
, *FOLDS1_FOLD(buk
->index
), fold
);
/* Multi-code fold: resolve the folded sequence, then encode each code. */
115 FOLDS_FOLD_ADDR_BUK(buk
, addr
);
117 for (i
= 0; i
< buk
->fold_len
; i
++) {
118 OnigCodePoint c
= addr
[i
];
119 len
= ONIGENC_CODE_TO_MBC(enc
, c
, fold
);
127 for (i
= 0; i
< len
; i
++) {
/*
 * Walks the single-code fold entries with indexes in [from, to) and reports
 * every case-equivalent pair to the callback f.
 *
 * For each entry, every unfold code is reported against the fold code in both
 * directions, and every pair of distinct unfold codes is reported in both
 * directions as well.  Each callback receives a one-element target and arg.
 *
 * Returns the first non-zero value produced by f, which stops the walk.
 */
134 apply_case_fold1(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
/* The index is advanced by FOLDS1_NEXT_INDEX, not by a fixed step. */
138 for (i
= from
; i
< to
; ) {
139 OnigCodePoint fold
= *FOLDS1_FOLD(i
);
140 n
= FOLDS1_UNFOLDS_NUM(i
);
141 for (j
= 0; j
< n
; j
++) {
142 OnigCodePoint unfold
= FOLDS1_UNFOLDS(i
)[j
];
/* fold <-> unfold, both directions. */
144 r
= (*f
)(fold
, &unfold
, 1, arg
);
145 if (r
!= 0) return r
;
146 r
= (*f
)(unfold
, &fold
, 1, arg
);
147 if (r
!= 0) return r
;
/* Pair this unfold with each unfold that precedes it in the entry. */
149 for (k
= 0; k
< j
; k
++) {
150 OnigCodePoint unfold2
= FOLDS1_UNFOLDS(i
)[k
];
151 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
152 if (r
!= 0) return r
;
153 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
154 if (r
!= 0) return r
;
158 i
= FOLDS1_NEXT_INDEX(i
);
/*
 * Walks the two-code fold entries with indexes in [from, to) and reports
 * them to the callback f.
 *
 * For each entry, every unfold code is reported against the two-code fold
 * sequence (one direction only, target length 2), and every pair of distinct
 * unfold codes is reported against each other in both directions.
 *
 * Returns the first non-zero value produced by f, which stops the walk.
 */
165 apply_case_fold2(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
/* The index is advanced by FOLDS2_NEXT_INDEX, not by a fixed step. */
169 for (i
= from
; i
< to
; ) {
170 OnigCodePoint
* fold
= FOLDS2_FOLD(i
);
171 n
= FOLDS2_UNFOLDS_NUM(i
);
172 for (j
= 0; j
< n
; j
++) {
173 OnigCodePoint unfold
= FOLDS2_UNFOLDS(i
)[j
];
/* unfold -> two-code fold sequence. */
175 r
= (*f
)(unfold
, fold
, 2, arg
);
176 if (r
!= 0) return r
;
/* Pair this unfold with each unfold that precedes it in the entry. */
178 for (k
= 0; k
< j
; k
++) {
179 OnigCodePoint unfold2
= FOLDS2_UNFOLDS(i
)[k
];
180 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
181 if (r
!= 0) return r
;
182 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
183 if (r
!= 0) return r
;
187 i
= FOLDS2_NEXT_INDEX(i
);
/*
 * Walks the three-code fold entries with indexes in [from, to) and reports
 * them to the callback f.
 *
 * For each entry, every unfold code is reported against the three-code fold
 * sequence (one direction only, target length 3), and every pair of distinct
 * unfold codes is reported against each other in both directions.
 *
 * Returns the first non-zero value produced by f, which stops the walk.
 */
194 apply_case_fold3(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
/* The index is advanced by FOLDS3_NEXT_INDEX, not by a fixed step. */
198 for (i
= from
; i
< to
; ) {
199 OnigCodePoint
* fold
= FOLDS3_FOLD(i
);
200 n
= FOLDS3_UNFOLDS_NUM(i
);
201 for (j
= 0; j
< n
; j
++) {
202 OnigCodePoint unfold
= FOLDS3_UNFOLDS(i
)[j
];
/* unfold -> three-code fold sequence. */
204 r
= (*f
)(unfold
, fold
, 3, arg
);
205 if (r
!= 0) return r
;
/* Pair this unfold with each unfold that precedes it in the entry. */
207 for (k
= 0; k
< j
; k
++) {
208 OnigCodePoint unfold2
= FOLDS3_UNFOLDS(i
)[k
];
209 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
210 if (r
!= 0) return r
;
211 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
212 if (r
!= 0) return r
;
216 i
= FOLDS3_NEXT_INDEX(i
);
/*
 * Reports case-fold pairs to the callback f by running apply_case_fold1,
 * apply_case_fold2 and apply_case_fold3 over their index ranges.
 *
 * flag - case-fold options; selects the Turkish/Azeri handling (when
 *        USE_UNICODE_CASE_FOLD_TURKISH_AZERI is defined) and is tested for
 *        INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR before the multi-code passes.
 * f    - callback invoked for each pair.
 * arg  - opaque pointer forwarded to f.
 *
 * Any non-zero result from a helper or from f is returned immediately.
 * NOTE(review): the body of the multi-char test and the final return are not
 * visible here; presumably the function returns early when the flag is clear.
 */
223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag
,
224 OnigApplyAllCaseFoldFunc f
, void* arg
)
/* Single-code folds below the "normal end" index. */
228 r
= apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX
, f
, arg
);
229 if (r
!= 0) return r
;
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Turkish/Azeri: 0x0049, 0x0131, 0x0069 and 0x0130 are reported one by one,
 * each with &code as a one-element target.
 * NOTE(review): the assignments to code between the calls are not visible. */
232 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
234 r
= (*f
)(0x0049, &code
, 1, arg
);
235 if (r
!= 0) return r
;
237 r
= (*f
)(0x0131, &code
, 1, arg
);
238 if (r
!= 0) return r
;
241 r
= (*f
)(0x0069, &code
, 1, arg
);
242 if (r
!= 0) return r
;
244 r
= (*f
)(0x0130, &code
, 1, arg
);
245 if (r
!= 0) return r
;
/* Remaining single-code folds, from the "normal end" index to the end. */
249 r
= apply_case_fold1(FOLDS1_NORMAL_END_INDEX
, FOLDS1_END_INDEX
, f
, arg
);
250 if (r
!= 0) return r
;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
255 if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) == 0)
/* Two-code folds below the "normal end" index. */
258 r
= apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX
, f
, arg
);
259 if (r
!= 0) return r
;
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* The remaining two-code folds are applied only without the Turkish flag. */
262 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) == 0) {
264 r
= apply_case_fold2(FOLDS2_NORMAL_END_INDEX
, FOLDS2_END_INDEX
, f
, arg
);
265 if (r
!= 0) return r
;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Three-code folds below the "normal end" index. */
270 r
= apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX
, f
, arg
);
271 if (r
!= 0) return r
;
/*
 * Fills items[] with the case-fold alternatives for the text starting at p.
 *
 * enc   - encoding used to decode characters and measure their byte length.
 * flag  - case-fold options (Turkish/Azeri handling, multi-char folds).
 * p     - start of the text to examine.
 * end   - end of the text, passed to ONIGENC_MBC_TO_CODE as the bound.
 * items - output array; each entry receives byte_len, code_len and code[].
 *
 * The first character is decoded and looked up with
 * onigenc_unicode_unfold_key.  A single-code fold yields the fold plus its
 * other unfolds; a two- or three-code fold (only when
 * INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set) yields the other unfolds plus
 * every combination of per-position alternatives.  Later sections decode
 * following characters and look up two- and three-code sequences.
 * NOTE(review): the initialisation and increments of n, the return
 * statements and several guards are not visible here - confirm the exact
 * control flow against the full file.
 */
277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc
,
278 OnigCaseFoldType flag
, const OnigUChar
* p
, const OnigUChar
* end
,
279 OnigCaseFoldCodeItem items
[])
281 int n
, m
, i
, j
, k
, len
;
282 OnigCodePoint code
, codes
[3];
283 const struct ByUnfoldKey
* buk
;
/* Decode the first character and record its encoded byte length. */
287 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
288 len
= enclen(enc
, p
);
290 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Turkish/Azeri: 0x0049 <-> 0x0131 and 0x0130 <-> 0x0069 each produce a
 * single one-code item. */
291 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
292 if (code
== 0x0049) {
293 items
[0].byte_len
= len
;
294 items
[0].code_len
= 1;
295 items
[0].code
[0] = 0x0131;
298 else if (code
== 0x0130) {
299 items
[0].byte_len
= len
;
300 items
[0].code_len
= 1;
301 items
[0].code
[0] = 0x0069;
304 else if (code
== 0x0131) {
305 items
[0].byte_len
= len
;
306 items
[0].code_len
= 1;
307 items
[0].code
[0] = 0x0049;
310 else if (code
== 0x0069) {
311 items
[0].byte_len
= len
;
312 items
[0].code_len
= 1;
313 items
[0].code
[0] = 0x0130;
/* Fetch the ByUnfoldKey entry for the decoded code. */
319 buk
= onigenc_unicode_unfold_key(code
);
/* Single-code fold: item 0 is the fold itself, followed by every unfold of
 * that fold other than the input code. */
321 if (buk
->fold_len
== 1) {
323 items
[0].byte_len
= len
;
324 items
[0].code_len
= 1;
325 items
[0].code
[0] = *FOLDS1_FOLD(buk
->index
);
328 un
= FOLDS1_UNFOLDS_NUM(buk
->index
);
329 for (i
= 0; i
< un
; i
++) {
330 OnigCodePoint unfold
= FOLDS1_UNFOLDS(buk
->index
)[i
];
331 if (unfold
!= code
) {
332 items
[n
].byte_len
= len
;
333 items
[n
].code_len
= 1;
334 items
[n
].code
[0] = unfold
;
338 code
= items
[0].code
[0]; /* for multi-code to unfold search. */
/* Multi-code fold of a single character, only with the multi-char flag.
 * cs[pos] collects the alternatives for each position of the fold. */
340 else if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
341 OnigCodePoint cs
[3][4];
344 if (buk
->fold_len
== 2) {
/* Other single-code unfolds of the same two-code fold. */
345 m
= FOLDS2_UNFOLDS_NUM(buk
->index
);
346 for (i
= 0; i
< m
; i
++) {
347 OnigCodePoint unfold
= FOLDS2_UNFOLDS(buk
->index
)[i
];
348 if (unfold
== code
) continue;
350 items
[n
].byte_len
= len
;
351 items
[n
].code_len
= 1;
352 items
[n
].code
[0] = unfold
;
/* For each fold position: slot 0 is the fold code, later slots are its
 * single-code unfolds. */
356 for (fn
= 0; fn
< 2; fn
++) {
358 cs
[fn
][0] = FOLDS2_FOLD(buk
->index
)[fn
];
359 index
= onigenc_unicode_fold1_key(&cs
[fn
][0]);
361 int m
= FOLDS1_UNFOLDS_NUM(index
);
362 for (i
= 0; i
< m
; i
++) {
363 cs
[fn
][i
+1] = FOLDS1_UNFOLDS(index
)[i
];
/* Emit every combination of position-0 and position-1 alternatives. */
371 for (i
= 0; i
< ncs
[0]; i
++) {
372 for (j
= 0; j
< ncs
[1]; j
++) {
373 items
[n
].byte_len
= len
;
374 items
[n
].code_len
= 2;
375 items
[n
].code
[0] = cs
[0][i
];
376 items
[n
].code
[1] = cs
[1][j
];
381 else { /* fold_len == 3 */
/* Other single-code unfolds of the same three-code fold. */
382 m
= FOLDS3_UNFOLDS_NUM(buk
->index
);
383 for (i
= 0; i
< m
; i
++) {
384 OnigCodePoint unfold
= FOLDS3_UNFOLDS(buk
->index
)[i
];
385 if (unfold
== code
) continue;
387 items
[n
].byte_len
= len
;
388 items
[n
].code_len
= 1;
389 items
[n
].code
[0] = unfold
;
/* Same per-position collection as the two-code case, for three positions. */
393 for (fn
= 0; fn
< 3; fn
++) {
395 cs
[fn
][0] = FOLDS3_FOLD(buk
->index
)[fn
];
396 index
= onigenc_unicode_fold1_key(&cs
[fn
][0]);
398 int m
= FOLDS1_UNFOLDS_NUM(index
);
399 for (i
= 0; i
< m
; i
++) {
400 cs
[fn
][i
+1] = FOLDS1_UNFOLDS(index
)[i
];
/* Emit every combination of the three positions' alternatives. */
408 for (i
= 0; i
< ncs
[0]; i
++) {
409 for (j
= 0; j
< ncs
[1]; j
++) {
410 for (k
= 0; k
< ncs
[2]; k
++) {
411 items
[n
].byte_len
= len
;
412 items
[n
].code_len
= 3;
413 items
[n
].code
[0] = cs
[0][i
];
414 items
[n
].code
[1] = cs
[1][j
];
415 items
[n
].code
[2] = cs
[2][k
];
422 /* multi char folded code is not head of another folded multi char */
/* The code has no unfold-key entry of its own here: treat it as a fold code
 * and list its single-code unfolds.
 * NOTE(review): the enclosing branch condition is not visible. */
427 int index
= onigenc_unicode_fold1_key(&code
);
429 int m
= FOLDS1_UNFOLDS_NUM(index
);
430 for (i
= 0; i
< m
; i
++) {
431 items
[n
].byte_len
= len
;
432 items
[n
].code_len
= 1;
433 items
[n
].code
[0] = FOLDS1_UNFOLDS(index
)[i
];
439 if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) == 0)
/* Second character: decode it, replace it by its single-code fold when it
 * has one, then look the code pair up as a two-code fold.
 * NOTE(review): the advance of p and the update of len are not visible. */
448 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
450 buk
= onigenc_unicode_unfold_key(code
);
451 if (buk
!= 0 && buk
->fold_len
== 1) {
452 codes
[1] = *FOLDS1_FOLD(buk
->index
);
457 clen
= enclen(enc
, p
);
460 index
= onigenc_unicode_fold2_key(codes
);
462 m
= FOLDS2_UNFOLDS_NUM(index
);
463 for (i
= 0; i
< m
; i
++) {
464 items
[n
].byte_len
= len
;
465 items
[n
].code_len
= 1;
466 items
[n
].code
[0] = FOLDS2_UNFOLDS(index
)[i
];
/* Third character: same procedure, looked up as a three-code fold. */
473 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
474 buk
= onigenc_unicode_unfold_key(code
);
475 if (buk
!= 0 && buk
->fold_len
== 1) {
476 codes
[2] = *FOLDS1_FOLD(buk
->index
);
481 clen
= enclen(enc
, p
);
484 index
= onigenc_unicode_fold3_key(codes
);
486 m
= FOLDS3_UNFOLDS_NUM(index
);
487 for (i
= 0; i
< m
; i
++) {
488 items
[n
].byte_len
= len
;
489 items
[n
].code_len
= 1;
490 items
[n
].code
[0] = FOLDS3_UNFOLDS(index
)[i
];
500 #ifdef USE_UNICODE_PROPERTIES
501 #include "unicode_property_data.c"
503 #include "unicode_property_data_posix.c"
507 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
509 enum EGCB_BREAK_TYPE
{
512 EGCB_BREAK_UNDEF_GB11
= 2,
513 EGCB_BREAK_UNDEF_RI_RI
= 3
523 EGCB_Regional_Indicator
= 6,
524 EGCB_SpacingMark
= 7,
529 EGCB_E_Base_GAZ
= 10,
530 EGCB_E_Modifier
= 11,
531 EGCB_Glue_After_Zwj
= 12,
546 #include "unicode_egcb_data.c"
/*
 * Returns the EGCB type recorded for `code` in EGCB_RANGES, or EGCB_Other
 * when no range covers it.
 *
 * The search repeatedly halves [low, high) using the midpoint's `end` field.
 * NOTE(review): the assignments that move low and high are not visible here;
 * the halving search assumes EGCB_RANGES is sorted - confirm.
 */
548 static enum EGCB_TYPE
549 egcb_get_type(OnigCodePoint code
)
551 OnigCodePoint low
, high
, x
;
554 for (low
= 0, high
= (OnigCodePoint
)EGCB_RANGE_NUM
; low
< high
; ) {
555 x
= (low
+ high
) >> 1;
556 if (code
> EGCB_RANGES
[x
].end
)
/* low may have run off the end of the table, so it is bounds-checked before
 * the candidate range's start is compared with code. */
562 type
= (low
< (OnigCodePoint
)EGCB_RANGE_NUM
&&
563 code
>= EGCB_RANGES
[low
].start
) ?
564 EGCB_RANGES
[low
].type
: EGCB_Other
;
/* True when the value lies between EGCB_CR and EGCB_Control inclusive.  This
 * is a numeric range test, so it depends on the enum's value ordering. */
569 #define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* True when the value is EGCB_L or numerically greater; likewise depends on
 * the enum's value ordering. */
570 #define IS_HANGUL(code) ((code) >= EGCB_L)
572 /* GB1 and GB2 are outside of this function. */
/*
 * Classifies the boundary between two adjacent code points from their EGCB
 * types alone.
 *
 * Returns EGCB_NOT_BREAK or EGCB_BREAK when the pair decides the answer, and
 * EGCB_BREAK_UNDEF_GB11 or EGCB_BREAK_UNDEF_RI_RI when the pair is a ZWJ
 * followed by an ExtendedPictographic code, or two Regional_Indicator codes.
 * NOTE(review): the GB999 label and the final return are not visible here.
 */
573 static enum EGCB_BREAK_TYPE
574 unicode_egcb_is_break_2code(OnigCodePoint from_code
, OnigCodePoint to_code
)
579 from
= egcb_get_type(from_code
);
580 to
= egcb_get_type(to_code
);
/* Both types are 0: jump straight to the GB999 label. */
583 if (from
== 0 && to
== 0) goto GB999
;
/* CR followed by LF stays together. */
586 if (from
== EGCB_CR
&& to
== EGCB_LF
) return EGCB_NOT_BREAK
;
/* Otherwise a value in the CR..Control range on either side breaks. */
588 if (IS_CONTROL_CR_LF(from
)) return EGCB_BREAK
;
590 if (IS_CONTROL_CR_LF(to
)) return EGCB_BREAK
;
/* Hangul syllable sequences. */
592 if (IS_HANGUL(from
) && IS_HANGUL(to
)) {
/* L followed by any Hangul type except T. */
594 if (from
== EGCB_L
&& to
!= EGCB_T
) return EGCB_NOT_BREAK
;
/* LV or V followed by V or T. */
596 if ((from
== EGCB_LV
|| from
== EGCB_V
)
597 && (to
== EGCB_V
|| to
== EGCB_T
)) return EGCB_NOT_BREAK
;
/* LVT or T followed by T. */
600 if ((to
== EGCB_T
) && (from
== EGCB_LVT
|| from
== EGCB_T
))
601 return EGCB_NOT_BREAK
;
/* No break before Extend, ZWJ or SpacingMark, nor after Prepend. */
607 if (to
== EGCB_Extend
|| to
== EGCB_ZWJ
) return EGCB_NOT_BREAK
;
610 if (to
== EGCB_SpacingMark
) return EGCB_NOT_BREAK
;
612 if (from
== EGCB_Prepend
) return EGCB_NOT_BREAK
;
/* ZWJ followed by an ExtendedPictographic code: this pair alone does not
 * decide the answer, so a dedicated value is returned. */
617 if (from
== EGCB_ZWJ
) {
618 if (onigenc_unicode_is_code_ctype(to_code
, PROP_INDEX_EXTENDEDPICTOGRAPHIC
))
619 return EGCB_BREAK_UNDEF_GB11
;
/* Two Regional_Indicator codes: likewise reported with a dedicated value. */
625 if (from
== EGCB_Regional_Indicator
&& to
== EGCB_Regional_Indicator
) {
626 return EGCB_BREAK_UNDEF_RI_RI
;
633 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
/*
 * Reports whether position p is an extended grapheme cluster boundary.
 *
 * enc   - encoding of the text.
 * p     - position under test.
 * prev  - head of the character before p; recomputed with
 *         onigenc_get_prev_char_head in the visible code.
 * start - start of the text.
 * end   - end of the text.
 *
 * Returns 1 at the start or end of the text and when no previous character
 * exists; returns 0 for a CR (0x000d) followed by LF (0x000a).  With
 * USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER the pair of codes around p is
 * classified by unicode_egcb_is_break_2code, and the two "undefined" results
 * are resolved by scanning backwards from prev.
 * NOTE(review): the switch statement, several branch bodies and the final
 * return are not visible here.
 */
636 onigenc_egcb_is_break_position(OnigEncoding enc
, UChar
* p
, UChar
* prev
,
637 const UChar
* start
, const UChar
* end
)
641 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
642 enum EGCB_BREAK_TYPE btype
;
/* The text edges are always boundaries. */
647 if (p
== start
) return 1;
648 if (p
== end
) return 1;
651 prev
= onigenc_get_prev_char_head(enc
, start
, p
);
652 if (IS_NULL(prev
)) return 1;
/* Decode the code points on either side of p. */
655 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
656 to
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
658 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
/* Non-Unicode encodings: the visible rule is that CR LF is not a boundary. */
659 if (! ONIGENC_IS_UNICODE_ENCODING(enc
)) {
660 if (from
== 0x000d && to
== 0x000a) return 0;
664 btype
= unicode_egcb_is_break_2code(from
, to
);
/* ZWJ + ExtendedPictographic: scan backwards, testing each earlier code for
 * ExtendedPictographic and for a type other than EGCB_Extend. */
673 case EGCB_BREAK_UNDEF_GB11
:
674 while ((prev
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
675 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
676 if (onigenc_unicode_is_code_ctype(from
, PROP_INDEX_EXTENDEDPICTOGRAPHIC
))
679 type
= egcb_get_type(from
);
680 if (type
!= EGCB_Extend
)
/* Regional_Indicator pair: scan backwards over preceding Regional_Indicator
 * codes; an even n means p is not a boundary.
 * NOTE(review): how n is counted is not visible here. */
685 case EGCB_BREAK_UNDEF_RI_RI
:
688 while ((prev
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
689 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
690 type
= egcb_get_type(from
);
691 if (type
!= EGCB_Regional_Indicator
)
696 if ((n
% 2) == 0) return 0;
/* CR followed by LF is not a boundary. */
704 if (from
== 0x000d && to
== 0x000a) return 0;
706 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
710 #define USER_DEFINED_PROPERTY_MAX_NUM 20
714 OnigCodePoint
* ranges
;
715 } UserDefinedPropertyValue
;
/* Number of user-defined properties registered so far; incremented by
 * onig_unicode_define_user_property. */
717 static int UserDefinedPropertyNum
;
/* Storage for the registered properties, capped at
 * USER_DEFINED_PROPERTY_MAX_NUM entries. */
718 static UserDefinedPropertyValue
719 UserDefinedPropertyRanges
[USER_DEFINED_PROPERTY_MAX_NUM
];
/* Name lookup table for the registered properties; created by
 * onig_unicode_define_user_property when it is still 0. */
720 static st_table
* UserDefinedPropertyTable
;
/*
 * Registers a user-defined property under `name`.
 *
 * name   - property name; every character must be in 0x01..0x7f.
 * ranges - code ranges associated with the property.
 *
 * Returns ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS when all
 * USER_DEFINED_PROPERTY_MAX_NUM slots are in use,
 * ONIGERR_TOO_LONG_PROPERTY_NAME when the name length reaches
 * PROPERTY_NAME_MAX_SIZE, ONIGERR_MEMORY (presumably on allocation failure),
 * and ONIGERR_INVALID_CHAR_PROPERTY_NAME for a character outside 0x01..0x7f.
 * NOTE(review): the success return, the handling of the insert result r and
 * the body that copies characters into s are not visible here.
 */
723 onig_unicode_define_user_property(const char* name
, OnigCodePoint
* ranges
)
725 UserDefinedPropertyValue
* e
;
733 if (UserDefinedPropertyNum
>= USER_DEFINED_PROPERTY_MAX_NUM
)
734 return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS
;
736 len
= (int )strlen_s(name
,MAX_STRING_SIZE
);
737 if (len
>= PROPERTY_NAME_MAX_SIZE
)
738 return ONIGERR_TOO_LONG_PROPERTY_NAME
;
/* Buffer for the stored key, sized for the whole name plus one byte. */
740 s
= (char* )xmalloc(len
+ 1);
742 return ONIGERR_MEMORY
;
745 for (i
= 0; i
< len
; i
++) {
747 if (c
<= 0 || c
>= 0x80) {
749 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
/* Space, '-' and '_' are treated differently from other characters;
 * presumably they are left out of the stored key - TODO confirm. */
752 if (c
!= ' ' && c
!= '-' && c
!= '_') {
/* Create the lookup table on first use. */
759 if (UserDefinedPropertyTable
== 0) {
760 UserDefinedPropertyTable
= onig_st_init_strend_table_with_size(10);
/* Take the next free slot; its ctype is numbered after the built-in ranges. */
763 e
= UserDefinedPropertyRanges
+ UserDefinedPropertyNum
;
764 e
->ctype
= CODE_RANGES_NUM
+ UserDefinedPropertyNum
;
/* Key the slot by the n bytes starting at s. */
766 r
= onig_st_insert_strend(UserDefinedPropertyTable
,
767 (const UChar
* )s
, (const UChar
* )s
+ n
,
768 (hash_data_type
)((void* )e
));
771 UserDefinedPropertyNum
++;
/*
 * Tests whether `code` belongs to the character type `ctype`.
 *
 * A ctype at or above CODE_RANGES_NUM selects a user-defined property by
 * index (ctype - CODE_RANGES_NUM); if that index is not below
 * UserDefinedPropertyNum the function returns ONIGERR_TYPE_BUG.  Other
 * ctypes are tested against CodeRanges[ctype].  One path answers from the
 * 256-entry ISO-8859-1 table through ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE.
 * NOTE(review): the full condition guarding the table path is not visible.
 */
776 onigenc_unicode_is_code_ctype(OnigCodePoint code
, unsigned int ctype
)
779 #ifdef USE_UNICODE_PROPERTIES
780 ctype
<= ONIGENC_MAX_STD_CTYPE
&&
783 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code
, ctype
);
/* User-defined properties are numbered from CODE_RANGES_NUM upwards. */
786 if (ctype
>= CODE_RANGES_NUM
) {
787 int index
= ctype
- CODE_RANGES_NUM
;
788 if (index
< UserDefinedPropertyNum
)
789 return onig_is_in_code_range((UChar
* )UserDefinedPropertyRanges
[index
].ranges
, code
);
791 return ONIGERR_TYPE_BUG
;
794 return onig_is_in_code_range((UChar
* )CodeRanges
[ctype
], code
);
/*
 * Stores in *ranges the code-range table for `ctype`.
 *
 * A ctype at or above CODE_RANGES_NUM selects a user-defined property by
 * index (ctype - CODE_RANGES_NUM); an index that is not below
 * UserDefinedPropertyNum yields ONIGERR_TYPE_BUG.  Other ctypes take their
 * table from CodeRanges[ctype].
 * NOTE(review): the success return values are not visible here.
 */
799 onigenc_unicode_ctype_code_range(OnigCtype ctype
, const OnigCodePoint
* ranges
[])
801 if (ctype
>= CODE_RANGES_NUM
) {
802 int index
= ctype
- CODE_RANGES_NUM
;
803 if (index
< UserDefinedPropertyNum
) {
804 *ranges
= UserDefinedPropertyRanges
[index
].ranges
;
808 return ONIGERR_TYPE_BUG
;
811 *ranges
= CodeRanges
[ctype
];
/*
 * Returns the result of onigenc_unicode_ctype_code_range(ctype, ranges).
 * NOTE(review): what is written to *sb_out is not visible here.
 */
816 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype
, OnigCodePoint
* sb_out
,
817 const OnigCodePoint
* ranges
[])
820 return onigenc_unicode_ctype_code_range(ctype
, ranges
);
/*
 * Maps a property name to its ctype number.
 *
 * enc  - encoding used to decode the name.
 * name - start of the property name.
 * end  - end of the property name.
 *
 * The name is decoded character by character into a local buffer, leaving out
 * space, '-' and '_'.  The buffer is looked up first in the user-defined
 * property table (when it exists) and then with unicode_lookup_property_name.
 *
 * Returns the matching ctype, or ONIGERR_INVALID_CHAR_PROPERTY_NAME when the
 * buffer fills up or no property matches.  Without USE_UNICODE_PROPERTIES, a
 * ctype above ONIGENC_MAX_STD_CTYPE is rejected with the same error.
 * NOTE(review): the decoding loop header, the character validity test and the
 * use of the user-table lookup result are not visible here.
 */
824 onigenc_unicode_property_name_to_ctype(OnigEncoding enc
, UChar
* name
, UChar
* end
)
829 const struct PoolPropertyNameCtype
* pc
;
830 char buf
[PROPERTY_NAME_MAX_SIZE
];
835 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
837 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
/* Space, '-' and '_' are not copied into the lookup key. */
839 if (code
!= ' ' && code
!= '-' && code
!= '_') {
840 buf
[len
++] = (char )code
;
/* Checked after the store: len must stay below the buffer size. */
841 if (len
>= PROPERTY_NAME_MAX_SIZE
)
842 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
/* User-defined properties are consulted before the built-in name table. */
850 if (UserDefinedPropertyTable
!= 0) {
851 UserDefinedPropertyValue
* e
;
852 e
= (UserDefinedPropertyValue
* )NULL
;
853 onig_st_lookup_strend(UserDefinedPropertyTable
,
854 (const UChar
* )buf
, (const UChar
* )buf
+ len
,
855 (hash_data_type
* )((void* )(&e
)));
861 pc
= unicode_lookup_property_name(buf
, len
);
863 /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
864 #ifndef USE_UNICODE_PROPERTIES
865 if (pc
->ctype
> ONIGENC_MAX_STD_CTYPE
)
866 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
869 return (int )pc
->ctype
;
872 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;