1 /**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 struct PoolPropertyNameCtype
{
/*
 * Non-zero when the EncUNICODE_ISO_8859_1_CtypeTable entry for `code` has
 * the bit CTYPE_TO_BIT(ctype) set, i.e. when `code` belongs to class `ctype`
 * according to that table.
 *
 * The macro performs no range check on `code`: the table is declared with
 * 256 entries, so callers must only pass values below 256.
 */
37 #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
38 ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
40 static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable
[256] = {
41 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
42 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
43 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
44 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
45 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
46 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
47 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
48 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
49 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
50 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
51 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
52 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
53 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
54 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
55 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
56 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
57 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
58 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
59 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
60 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
61 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
62 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
63 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
64 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
65 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
66 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
67 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
68 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
69 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
70 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
71 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
72 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
77 #include "unicode_fold_data.c"
80 onigenc_unicode_mbc_case_fold(OnigEncoding enc
,
81 OnigCaseFoldType flag ARG_UNUSED
, const UChar
** pp
, const UChar
* end
,
84 const struct ByUnfoldKey
* buk
;
90 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
95 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
97 return ONIGENC_CODE_TO_MBC(enc
, 0x0069, fold
);
100 if (code
== 0x0049) {
101 return ONIGENC_CODE_TO_MBC(enc
, 0x0131, fold
);
107 buk
= onigenc_unicode_unfold_key(code
);
109 if (buk
->fold_len
== 1) {
110 return ONIGENC_CODE_TO_MBC(enc
, *FOLDS1_FOLD(buk
->index
), fold
);
115 FOLDS_FOLD_ADDR_BUK(buk
, addr
);
117 for (i
= 0; i
< buk
->fold_len
; i
++) {
118 OnigCodePoint c
= addr
[i
];
119 len
= ONIGENC_CODE_TO_MBC(enc
, c
, fold
);
127 for (i
= 0; i
< len
; i
++) {
134 apply_case_fold1(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
138 for (i
= from
; i
< to
; ) {
139 OnigCodePoint fold
= *FOLDS1_FOLD(i
);
140 n
= FOLDS1_UNFOLDS_NUM(i
);
141 for (j
= 0; j
< n
; j
++) {
142 OnigCodePoint unfold
= FOLDS1_UNFOLDS(i
)[j
];
144 r
= (*f
)(fold
, &unfold
, 1, arg
);
145 if (r
!= 0) return r
;
146 r
= (*f
)(unfold
, &fold
, 1, arg
);
147 if (r
!= 0) return r
;
149 for (k
= 0; k
< j
; k
++) {
150 OnigCodePoint unfold2
= FOLDS1_UNFOLDS(i
)[k
];
151 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
152 if (r
!= 0) return r
;
153 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
154 if (r
!= 0) return r
;
158 i
= FOLDS1_NEXT_INDEX(i
);
165 apply_case_fold2(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
169 for (i
= from
; i
< to
; ) {
170 OnigCodePoint
* fold
= FOLDS2_FOLD(i
);
171 n
= FOLDS2_UNFOLDS_NUM(i
);
172 for (j
= 0; j
< n
; j
++) {
173 OnigCodePoint unfold
= FOLDS2_UNFOLDS(i
)[j
];
175 r
= (*f
)(unfold
, fold
, 2, arg
);
176 if (r
!= 0) return r
;
178 for (k
= 0; k
< j
; k
++) {
179 OnigCodePoint unfold2
= FOLDS2_UNFOLDS(i
)[k
];
180 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
181 if (r
!= 0) return r
;
182 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
183 if (r
!= 0) return r
;
187 i
= FOLDS2_NEXT_INDEX(i
);
194 apply_case_fold3(int from
, int to
, OnigApplyAllCaseFoldFunc f
, void* arg
)
198 for (i
= from
; i
< to
; ) {
199 OnigCodePoint
* fold
= FOLDS3_FOLD(i
);
200 n
= FOLDS3_UNFOLDS_NUM(i
);
201 for (j
= 0; j
< n
; j
++) {
202 OnigCodePoint unfold
= FOLDS3_UNFOLDS(i
)[j
];
204 r
= (*f
)(unfold
, fold
, 3, arg
);
205 if (r
!= 0) return r
;
207 for (k
= 0; k
< j
; k
++) {
208 OnigCodePoint unfold2
= FOLDS3_UNFOLDS(i
)[k
];
209 r
= (*f
)(unfold
, &unfold2
, 1, arg
);
210 if (r
!= 0) return r
;
211 r
= (*f
)(unfold2
, &unfold
, 1, arg
);
212 if (r
!= 0) return r
;
216 i
= FOLDS3_NEXT_INDEX(i
);
223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag
,
224 OnigApplyAllCaseFoldFunc f
, void* arg
)
228 r
= apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX
, f
, arg
);
229 if (r
!= 0) return r
;
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
232 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
234 r
= (*f
)(0x0049, &code
, 1, arg
);
235 if (r
!= 0) return r
;
237 r
= (*f
)(0x0131, &code
, 1, arg
);
238 if (r
!= 0) return r
;
241 r
= (*f
)(0x0069, &code
, 1, arg
);
242 if (r
!= 0) return r
;
244 r
= (*f
)(0x0130, &code
, 1, arg
);
245 if (r
!= 0) return r
;
249 r
= apply_case_fold1(FOLDS1_NORMAL_END_INDEX
, FOLDS1_END_INDEX
, f
, arg
);
250 if (r
!= 0) return r
;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
255 if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) == 0)
258 r
= apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX
, f
, arg
);
259 if (r
!= 0) return r
;
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
262 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) == 0) {
264 r
= apply_case_fold2(FOLDS2_NORMAL_END_INDEX
, FOLDS2_END_INDEX
, f
, arg
);
265 if (r
!= 0) return r
;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
270 r
= apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX
, f
, arg
);
271 if (r
!= 0) return r
;
277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc
,
278 OnigCaseFoldType flag
, const OnigUChar
* p
, const OnigUChar
* end
,
279 OnigCaseFoldCodeItem items
[])
281 int n
, m
, i
, j
, k
, len
;
282 OnigCodePoint code
, codes
[3];
283 const struct ByUnfoldKey
* buk
;
287 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
288 len
= enclen(enc
, p
);
290 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
291 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
292 if (code
== 0x0049) {
293 items
[0].byte_len
= len
;
294 items
[0].code_len
= 1;
295 items
[0].code
[0] = 0x0131;
298 else if (code
== 0x0130) {
299 items
[0].byte_len
= len
;
300 items
[0].code_len
= 1;
301 items
[0].code
[0] = 0x0069;
304 else if (code
== 0x0131) {
305 items
[0].byte_len
= len
;
306 items
[0].code_len
= 1;
307 items
[0].code
[0] = 0x0049;
310 else if (code
== 0x0069) {
311 items
[0].byte_len
= len
;
312 items
[0].code_len
= 1;
313 items
[0].code
[0] = 0x0130;
319 buk
= onigenc_unicode_unfold_key(code
);
321 if (buk
->fold_len
== 1) {
323 items
[0].byte_len
= len
;
324 items
[0].code_len
= 1;
325 items
[0].code
[0] = *FOLDS1_FOLD(buk
->index
);
328 un
= FOLDS1_UNFOLDS_NUM(buk
->index
);
329 for (i
= 0; i
< un
; i
++) {
330 OnigCodePoint unfold
= FOLDS1_UNFOLDS(buk
->index
)[i
];
331 if (unfold
!= code
) {
332 items
[n
].byte_len
= len
;
333 items
[n
].code_len
= 1;
334 items
[n
].code
[0] = unfold
;
338 code
= items
[0].code
[0]; /* for multi-code to unfold search. */
340 else if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
341 OnigCodePoint cs
[3][4];
344 if (buk
->fold_len
== 2) {
345 m
= FOLDS2_UNFOLDS_NUM(buk
->index
);
346 for (i
= 0; i
< m
; i
++) {
347 OnigCodePoint unfold
= FOLDS2_UNFOLDS(buk
->index
)[i
];
348 if (unfold
== code
) continue;
350 items
[n
].byte_len
= len
;
351 items
[n
].code_len
= 1;
352 items
[n
].code
[0] = unfold
;
356 for (fn
= 0; fn
< 2; fn
++) {
358 cs
[fn
][0] = FOLDS2_FOLD(buk
->index
)[fn
];
359 index
= onigenc_unicode_fold1_key(&cs
[fn
][0]);
361 int m
= FOLDS1_UNFOLDS_NUM(index
);
362 for (i
= 0; i
< m
; i
++) {
363 cs
[fn
][i
+1] = FOLDS1_UNFOLDS(index
)[i
];
371 for (i
= 0; i
< ncs
[0]; i
++) {
372 for (j
= 0; j
< ncs
[1]; j
++) {
373 items
[n
].byte_len
= len
;
374 items
[n
].code_len
= 2;
375 items
[n
].code
[0] = cs
[0][i
];
376 items
[n
].code
[1] = cs
[1][j
];
381 else { /* fold_len == 3 */
382 m
= FOLDS3_UNFOLDS_NUM(buk
->index
);
383 for (i
= 0; i
< m
; i
++) {
384 OnigCodePoint unfold
= FOLDS3_UNFOLDS(buk
->index
)[i
];
385 if (unfold
== code
) continue;
387 items
[n
].byte_len
= len
;
388 items
[n
].code_len
= 1;
389 items
[n
].code
[0] = unfold
;
393 for (fn
= 0; fn
< 3; fn
++) {
395 cs
[fn
][0] = FOLDS3_FOLD(buk
->index
)[fn
];
396 index
= onigenc_unicode_fold1_key(&cs
[fn
][0]);
398 int m
= FOLDS1_UNFOLDS_NUM(index
);
399 for (i
= 0; i
< m
; i
++) {
400 cs
[fn
][i
+1] = FOLDS1_UNFOLDS(index
)[i
];
408 for (i
= 0; i
< ncs
[0]; i
++) {
409 for (j
= 0; j
< ncs
[1]; j
++) {
410 for (k
= 0; k
< ncs
[2]; k
++) {
411 items
[n
].byte_len
= len
;
412 items
[n
].code_len
= 3;
413 items
[n
].code
[0] = cs
[0][i
];
414 items
[n
].code
[1] = cs
[1][j
];
415 items
[n
].code
[2] = cs
[2][k
];
422 /* multi char folded code is not head of another folded multi char */
427 int index
= onigenc_unicode_fold1_key(&code
);
429 int m
= FOLDS1_UNFOLDS_NUM(index
);
430 for (i
= 0; i
< m
; i
++) {
431 items
[n
].byte_len
= len
;
432 items
[n
].code_len
= 1;
433 items
[n
].code
[0] = FOLDS1_UNFOLDS(index
)[i
];
439 if ((flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) == 0)
448 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
450 buk
= onigenc_unicode_unfold_key(code
);
451 if (buk
!= 0 && buk
->fold_len
== 1) {
452 codes
[1] = *FOLDS1_FOLD(buk
->index
);
457 clen
= enclen(enc
, p
);
460 index
= onigenc_unicode_fold2_key(codes
);
462 m
= FOLDS2_UNFOLDS_NUM(index
);
463 for (i
= 0; i
< m
; i
++) {
464 items
[n
].byte_len
= len
;
465 items
[n
].code_len
= 1;
466 items
[n
].code
[0] = FOLDS2_UNFOLDS(index
)[i
];
473 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
474 buk
= onigenc_unicode_unfold_key(code
);
475 if (buk
!= 0 && buk
->fold_len
== 1) {
476 codes
[2] = *FOLDS1_FOLD(buk
->index
);
481 clen
= enclen(enc
, p
);
484 index
= onigenc_unicode_fold3_key(codes
);
486 m
= FOLDS3_UNFOLDS_NUM(index
);
487 for (i
= 0; i
< m
; i
++) {
488 items
[n
].byte_len
= len
;
489 items
[n
].code_len
= 1;
490 items
[n
].code
[0] = FOLDS3_UNFOLDS(index
)[i
];
500 #ifdef USE_UNICODE_PROPERTIES
501 #include "unicode_property_data.c"
503 #include "unicode_property_data_posix.c"
507 #ifdef USE_UNICODE_WORD_BREAK
525 WB_Regional_Indicator
,
537 #include "unicode_wb_data.c"
540 wb_get_type(OnigCodePoint code
)
542 OnigCodePoint low
, high
, x
;
545 for (low
= 0, high
= (OnigCodePoint
)WB_RANGE_NUM
; low
< high
; ) {
546 x
= (low
+ high
) >> 1;
547 if (code
> WB_RANGES
[x
].end
)
553 type
= (low
< (OnigCodePoint
)WB_RANGE_NUM
&&
554 code
>= WB_RANGES
[low
].start
) ?
555 WB_RANGES
[low
].type
: WB_Any
;
/*
 * Predicates over a word-break type value `t` (an enum WB_TYPE, as returned
 * by wb_get_type()).
 *
 * IS_WB_IGNORE_TAIL: t is Extend, Format or ZWJ -- the types skipped over
 *                    by the WB4 handling ("X (Extend|Format|ZWJ)* -> X").
 * IS_WB_AHLetter:    t is ALetter or Hebrew_Letter.
 * IS_WB_MidNumLetQ:  t is MidNumLet or Single_Quote.
 *
 * Each macro evaluates its argument more than once, so pass a plain
 * variable rather than an expression with side effects.
 */
560 #define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
561 #define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
562 #define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
565 wb_get_next_main_code(OnigEncoding enc
, UChar
* p
, const UChar
* end
,
566 OnigCodePoint
* rcode
, enum WB_TYPE
* rtype
)
575 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
576 type
= wb_get_type(code
);
577 if (! IS_WB_IGNORE_TAIL(type
)) {
588 onigenc_wb_is_break_position(OnigEncoding enc
, UChar
* p
, UChar
* prev
,
589 const UChar
* start
, const UChar
* end
)
594 OnigCodePoint cfrom2
;
603 if (p
== start
) return TRUE
;
605 if (p
== end
) return TRUE
;
608 prev
= onigenc_get_prev_char_head(enc
, start
, p
);
609 if (IS_NULL(prev
)) return TRUE
;
612 cfrom
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
613 cto
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
615 from
= wb_get_type(cfrom
);
616 to
= wb_get_type(cto
);
619 if (from
== 0 && to
== 0) goto WB999
;
622 if (from
== WB_CR
&& to
== WB_LF
) return FALSE
;
624 /* WB3a: (Newline|CR|LF) / */
625 if (from
== WB_Newline
|| from
== WB_CR
|| from
== WB_LF
) return TRUE
;
626 /* WB3b: / (Newline|CR|LF) */
627 if (to
== WB_Newline
|| to
== WB_CR
|| to
== WB_LF
) return TRUE
;
629 /* WB3c: ZWJ + {Extended_Pictographic} */
630 if (from
== WB_ZWJ
) {
631 if (onigenc_unicode_is_code_ctype(cto
, PROP_INDEX_EXTENDEDPICTOGRAPHIC
))
635 /* WB3d: WSegSpace + WSegSpace */
636 if (from
== WB_WSegSpace
&& to
== WB_WSegSpace
) return FALSE
;
638 /* WB4: X (Extend|Format|ZWJ)* -> X */
639 if (IS_WB_IGNORE_TAIL(to
)) return FALSE
;
640 if (IS_WB_IGNORE_TAIL(from
)) {
641 while ((pp
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
643 cfrom
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
644 from
= wb_get_type(cfrom
);
645 if (! IS_WB_IGNORE_TAIL(from
))
650 if (IS_WB_AHLetter(from
)) {
651 /* WB5: AHLetter + AHLetter */
652 if (IS_WB_AHLetter(to
)) return FALSE
;
654 /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
655 if (to
== WB_MidLetter
|| IS_WB_MidNumLetQ(to
)) {
656 r
= wb_get_next_main_code(enc
, p
, end
, &cto2
, &to2
);
658 if (IS_WB_AHLetter(to2
)) return FALSE
;
663 /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
664 if (from
== WB_MidLetter
|| IS_WB_MidNumLetQ(from
)) {
665 if (IS_WB_AHLetter(to
)) {
667 while ((pp
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
669 cfrom2
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
670 from2
= wb_get_type(cfrom2
);
671 if (! IS_WB_IGNORE_TAIL(from2
))
675 if (IS_WB_AHLetter(from2
)) return FALSE
;
679 if (from
== WB_Hebrew_Letter
) {
680 /* WB7a: Hebrew_Letter + Single_Quote */
681 if (to
== WB_Single_Quote
) return FALSE
;
683 /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
684 if (to
== WB_Double_Quote
) {
685 r
= wb_get_next_main_code(enc
, p
, end
, &cto2
, &to2
);
687 if (to2
== WB_Hebrew_Letter
) return FALSE
;
692 /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
693 if (from
== WB_Double_Quote
) {
694 if (to
== WB_Hebrew_Letter
) {
696 while ((pp
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
698 cfrom2
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
699 from2
= wb_get_type(cfrom2
);
700 if (! IS_WB_IGNORE_TAIL(from2
))
704 if (from2
== WB_Hebrew_Letter
) return FALSE
;
708 if (to
== WB_Numeric
) {
709 /* WB8: Numeric + Numeric */
710 if (from
== WB_Numeric
) return FALSE
;
712 /* WB9: AHLetter + Numeric */
713 if (IS_WB_AHLetter(from
)) return FALSE
;
715 /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
716 if (from
== WB_MidNum
|| IS_WB_MidNumLetQ(from
)) {
718 while ((pp
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
720 cfrom2
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
721 from2
= wb_get_type(cfrom2
);
722 if (! IS_WB_IGNORE_TAIL(from2
))
726 if (from2
== WB_Numeric
) return FALSE
;
730 if (from
== WB_Numeric
) {
731 /* WB10: Numeric + AHLetter */
732 if (IS_WB_AHLetter(to
)) return FALSE
;
734 /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
735 if (to
== WB_MidNum
|| IS_WB_MidNumLetQ(to
)) {
736 r
= wb_get_next_main_code(enc
, p
, end
, &cto2
, &to2
);
738 if (to2
== WB_Numeric
) return FALSE
;
743 /* WB13: Katakana + Katakana */
744 if (from
== WB_Katakana
&& to
== WB_Katakana
) return FALSE
;
746 /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
747 if (IS_WB_AHLetter(from
) || from
== WB_Numeric
|| from
== WB_Katakana
748 || from
== WB_ExtendNumLet
) {
749 if (to
== WB_ExtendNumLet
) return FALSE
;
752 /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
753 if (from
== WB_ExtendNumLet
) {
754 if (IS_WB_AHLetter(to
) || to
== WB_Numeric
|| to
== WB_Katakana
)
759 /* WB15: sot (RI RI)* RI + RI */
760 /* WB16: [^RI] (RI RI)* RI + RI */
761 if (from
== WB_Regional_Indicator
&& to
== WB_Regional_Indicator
) {
763 while ((prev
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
764 cfrom2
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
765 from2
= wb_get_type(cfrom2
);
766 if (from2
!= WB_Regional_Indicator
)
771 if ((n
% 2) == 0) return FALSE
;
775 /* WB999: Any / Any */
779 #endif /* USE_UNICODE_WORD_BREAK */
782 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
784 enum EGCB_BREAK_TYPE
{
787 EGCB_BREAK_UNDEF_GB11
= 2,
788 EGCB_BREAK_UNDEF_RI_RI
= 3
798 EGCB_Regional_Indicator
= 6,
799 EGCB_SpacingMark
= 7,
804 EGCB_E_Base_GAZ
= 10,
805 EGCB_E_Modifier
= 11,
806 EGCB_Glue_After_Zwj
= 12,
821 #include "unicode_egcb_data.c"
823 static enum EGCB_TYPE
824 egcb_get_type(OnigCodePoint code
)
826 OnigCodePoint low
, high
, x
;
829 for (low
= 0, high
= (OnigCodePoint
)EGCB_RANGE_NUM
; low
< high
; ) {
830 x
= (low
+ high
) >> 1;
831 if (code
> EGCB_RANGES
[x
].end
)
837 type
= (low
< (OnigCodePoint
)EGCB_RANGE_NUM
&&
838 code
>= EGCB_RANGES
[low
].start
) ?
839 EGCB_RANGES
[low
].type
: EGCB_Other
;
/*
 * Range tests on a grapheme-cluster-break type. Despite the parameter name
 * `code`, unicode_egcb_is_break_2code() passes the value returned by
 * egcb_get_type(), not a code point.
 *
 * IS_CONTROL_CR_LF: value lies in the closed interval [EGCB_CR, EGCB_Control].
 * IS_HANGUL:        value is EGCB_L or any larger enumerator.
 *
 * NOTE(review): both tests depend on the numeric ordering of enum EGCB_TYPE
 * (presumably CR/LF/Control are contiguous and the Hangul syllable types
 * come last) -- confirm against the enum before reordering or adding values.
 * IS_CONTROL_CR_LF evaluates its argument twice.
 */
844 #define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)
845 #define IS_HANGUL(code) ((code) >= EGCB_L)
847 /* GB1 and GB2 are outside of this function. */
848 static enum EGCB_BREAK_TYPE
849 unicode_egcb_is_break_2code(OnigCodePoint from_code
, OnigCodePoint to_code
)
854 from
= egcb_get_type(from_code
);
855 to
= egcb_get_type(to_code
);
858 if (from
== 0 && to
== 0) goto GB999
;
861 if (from
== EGCB_CR
&& to
== EGCB_LF
) return EGCB_NOT_BREAK
;
863 if (IS_CONTROL_CR_LF(from
)) return EGCB_BREAK
;
865 if (IS_CONTROL_CR_LF(to
)) return EGCB_BREAK
;
867 if (IS_HANGUL(from
) && IS_HANGUL(to
)) {
869 if (from
== EGCB_L
&& to
!= EGCB_T
) return EGCB_NOT_BREAK
;
871 if ((from
== EGCB_LV
|| from
== EGCB_V
)
872 && (to
== EGCB_V
|| to
== EGCB_T
)) return EGCB_NOT_BREAK
;
875 if ((to
== EGCB_T
) && (from
== EGCB_LVT
|| from
== EGCB_T
))
876 return EGCB_NOT_BREAK
;
882 if (to
== EGCB_Extend
|| to
== EGCB_ZWJ
) return EGCB_NOT_BREAK
;
885 if (to
== EGCB_SpacingMark
) return EGCB_NOT_BREAK
;
887 if (from
== EGCB_Prepend
) return EGCB_NOT_BREAK
;
892 if (from
== EGCB_ZWJ
) {
893 if (onigenc_unicode_is_code_ctype(to_code
, PROP_INDEX_EXTENDEDPICTOGRAPHIC
))
894 return EGCB_BREAK_UNDEF_GB11
;
900 if (from
== EGCB_Regional_Indicator
&& to
== EGCB_Regional_Indicator
) {
901 return EGCB_BREAK_UNDEF_RI_RI
;
908 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
911 onigenc_egcb_is_break_position(OnigEncoding enc
, UChar
* p
, UChar
* prev
,
912 const UChar
* start
, const UChar
* end
)
916 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
917 enum EGCB_BREAK_TYPE btype
;
922 if (p
== start
) return 1;
923 if (p
== end
) return 1;
926 prev
= onigenc_get_prev_char_head(enc
, start
, p
);
927 if (IS_NULL(prev
)) return 1;
930 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
931 to
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
933 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
934 if (! ONIGENC_IS_UNICODE_ENCODING(enc
)) {
935 return from
!= 0x000d || to
!= 0x000a;
938 btype
= unicode_egcb_is_break_2code(from
, to
);
947 case EGCB_BREAK_UNDEF_GB11
:
948 while ((prev
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
949 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
950 if (onigenc_unicode_is_code_ctype(from
, PROP_INDEX_EXTENDEDPICTOGRAPHIC
))
953 type
= egcb_get_type(from
);
954 if (type
!= EGCB_Extend
)
959 case EGCB_BREAK_UNDEF_RI_RI
:
962 while ((prev
= onigenc_get_prev_char_head(enc
, start
, prev
)) != NULL
) {
963 from
= ONIGENC_MBC_TO_CODE(enc
, prev
, end
);
964 type
= egcb_get_type(from
);
965 if (type
!= EGCB_Regional_Indicator
)
970 if ((n
% 2) == 0) return 0;
978 return from
!= 0x000d || to
!= 0x000a;
979 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
/*
 * Capacity of the static UserDefinedPropertyRanges array.
 * onig_unicode_define_user_property() returns
 * ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS once UserDefinedPropertyNum has
 * reached this value.
 */
983 #define USER_DEFINED_PROPERTY_MAX_NUM 20
987 OnigCodePoint
* ranges
;
988 } UserDefinedPropertyValue
;
990 static int UserDefinedPropertyNum
;
991 static UserDefinedPropertyValue
992 UserDefinedPropertyRanges
[USER_DEFINED_PROPERTY_MAX_NUM
];
993 static st_table
* UserDefinedPropertyTable
;
996 onig_unicode_define_user_property(const char* name
, OnigCodePoint
* ranges
)
998 UserDefinedPropertyValue
* e
;
1007 if (UserDefinedPropertyNum
>= USER_DEFINED_PROPERTY_MAX_NUM
)
1008 return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS
;
1010 len
= (int )strlen_s(name
,MAX_STRING_SIZE
);
1011 if (len
>= PROPERTY_NAME_MAX_SIZE
)
1012 return ONIGERR_TOO_LONG_PROPERTY_NAME
;
1014 s
= (char* )xmalloc(len
+ 1);
1016 return ONIGERR_MEMORY
;
1018 uname
= (UChar
* )name
;
1020 for (i
= 0; i
< len
; i
++) {
1022 if (c
< 0x20 || c
>= 0x80) {
1024 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
1027 if (c
!= ' ' && c
!= '-' && c
!= '_') {
1034 if (UserDefinedPropertyTable
== 0) {
1035 UserDefinedPropertyTable
= onig_st_init_strend_table_with_size(10);
1036 if (IS_NULL(UserDefinedPropertyTable
)) {
1038 return ONIGERR_MEMORY
;
1042 e
= UserDefinedPropertyRanges
+ UserDefinedPropertyNum
;
1043 e
->ctype
= CODE_RANGES_NUM
+ UserDefinedPropertyNum
;
1045 r
= onig_st_insert_strend(UserDefinedPropertyTable
,
1046 (const UChar
* )s
, (const UChar
* )s
+ n
,
1047 (hash_data_type
)((void* )e
));
1048 if (r
< 0) return r
;
1050 UserDefinedPropertyNum
++;
1055 onigenc_unicode_is_code_ctype(OnigCodePoint code
, unsigned int ctype
)
1058 #ifdef USE_UNICODE_PROPERTIES
1059 ctype
<= ONIGENC_MAX_STD_CTYPE
&&
1062 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code
, ctype
);
1065 if (ctype
>= CODE_RANGES_NUM
) {
1066 int index
= ctype
- CODE_RANGES_NUM
;
1067 if (index
< UserDefinedPropertyNum
)
1068 return onig_is_in_code_range((UChar
* )UserDefinedPropertyRanges
[index
].ranges
, code
);
1070 return ONIGERR_TYPE_BUG
;
1073 return onig_is_in_code_range((UChar
* )CodeRanges
[ctype
], code
);
1078 onigenc_unicode_ctype_code_range(OnigCtype ctype
, const OnigCodePoint
* ranges
[])
1080 if (ctype
>= CODE_RANGES_NUM
) {
1081 int index
= ctype
- CODE_RANGES_NUM
;
1082 if (index
< UserDefinedPropertyNum
) {
1083 *ranges
= UserDefinedPropertyRanges
[index
].ranges
;
1087 return ONIGERR_TYPE_BUG
;
1090 *ranges
= CodeRanges
[ctype
];
1095 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype
, OnigCodePoint
* sb_out
,
1096 const OnigCodePoint
* ranges
[])
1099 return onigenc_unicode_ctype_code_range(ctype
, ranges
);
1103 onigenc_unicode_property_name_to_ctype(OnigEncoding enc
, UChar
* name
, UChar
* end
)
1108 const struct PoolPropertyNameCtype
* pc
;
1109 char buf
[PROPERTY_NAME_MAX_SIZE
];
1114 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
1116 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
1118 if (code
!= ' ' && code
!= '-' && code
!= '_') {
1119 buf
[len
++] = (char )code
;
1120 if (len
>= PROPERTY_NAME_MAX_SIZE
)
1121 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
1124 p
+= enclen(enc
, p
);
1129 if (UserDefinedPropertyTable
!= 0) {
1130 UserDefinedPropertyValue
* e
;
1131 e
= (UserDefinedPropertyValue
* )NULL
;
1132 onig_st_lookup_strend(UserDefinedPropertyTable
,
1133 (const UChar
* )buf
, (const UChar
* )buf
+ len
,
1134 (hash_data_type
* )((void* )(&e
)));
1140 pc
= unicode_lookup_property_name(buf
, len
);
1142 /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1143 #ifndef USE_UNICODE_PROPERTIES
1144 if (pc
->ctype
> ONIGENC_MAX_STD_CTYPE
)
1145 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;
1148 return (int )pc
->ctype
;
1151 return ONIGERR_INVALID_CHAR_PROPERTY_NAME
;