unicode.c - Oniguruma (regular expression library)\r
**********************************************************************/\r
/*-\r
- * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r
+ * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r
* All rights reserved.\r
*\r
* Redistribution and use in source and binary forms, with or without\r
#endif\r
\r
\r
+#ifdef USE_UNICODE_WORD_BREAK\r
+\r
+/*\r
+ * Word_Break property values used by the word boundary check.\r
+ * WB_Any is what wb_get_type() returns for a code point that is not\r
+ * covered by any entry of WB_RANGES.  It must stay 0: the short cut in\r
+ * onigenc_wb_is_break_position() compares types against literal 0.\r
+ */\r
+enum WB_TYPE {\r
+ WB_Any = 0,\r
+ WB_ALetter,\r
+ WB_CR,\r
+ WB_Double_Quote,\r
+ WB_Extend,\r
+ WB_ExtendNumLet,\r
+ WB_Format,\r
+ WB_Hebrew_Letter,\r
+ WB_Katakana,\r
+ WB_LF,\r
+ WB_MidLetter,\r
+ WB_MidNum,\r
+ WB_MidNumLet,\r
+ WB_Newline,\r
+ WB_Numeric,\r
+ WB_Regional_Indicator,\r
+ WB_Single_Quote,\r
+ WB_WSegSpace,\r
+ WB_ZWJ,\r
+};\r
+\r
+/*\r
+ * One table entry: the code points start..end (both inclusive, as\r
+ * tested by wb_get_type()) all have the Word_Break type 'type'.\r
+ */\r
+typedef struct {\r
+ OnigCodePoint start;\r
+ OnigCodePoint end;\r
+ enum WB_TYPE type;\r
+} WB_RANGE_TYPE;\r
+\r
+#include "unicode_wb_data.c"\r
+\r
+/*\r
+ * Return the Word_Break type of a code point.\r
+ *\r
+ * Binary-searches WB_RANGES (WB_RANGE_NUM entries) for the range that\r
+ * contains 'code'.  Returns that range's type, or WB_Any when no range\r
+ * contains the code point.\r
+ * NOTE(review): the search assumes WB_RANGES is sorted in ascending,\r
+ * non-overlapping order - confirm against unicode_wb_data.c.\r
+ */\r
+static enum WB_TYPE\r
+wb_get_type(OnigCodePoint code)\r
+{\r
+  OnigCodePoint lo;\r
+  OnigCodePoint hi;\r
+  OnigCodePoint mid;\r
+\r
+  lo = 0;\r
+  hi = (OnigCodePoint )WB_RANGE_NUM;\r
+  while (lo < hi) {\r
+    mid = (lo + hi) >> 1;\r
+    if (WB_RANGES[mid].end < code)\r
+      lo = mid + 1;\r
+    else\r
+      hi = mid;\r
+  }\r
+\r
+  /* lo is the lowest index whose range end is >= code, if there is one */\r
+  if (lo >= (OnigCodePoint )WB_RANGE_NUM) return WB_Any;\r
+  if (code < WB_RANGES[lo].start) return WB_Any;\r
+\r
+  return WB_RANGES[lo].type;\r
+}\r
+\r
+/* Type predicates shared by the word-break rules. */\r
+/* Extend / Format / ZWJ: the types skipped over when looking for a main character (WB4). */\r
+#define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)\r
+/* AHLetter: ALetter or Hebrew_Letter. */\r
+#define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)\r
+/* MidNumLetQ: MidNumLet or Single_Quote. */\r
+#define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)\r
+\r
+/*\r
+ * Find the first character after the one at p whose Word_Break type is\r
+ * not Extend, Format or ZWJ.\r
+ *\r
+ * On success stores its code point in *rcode and its type in *rtype and\r
+ * returns 1.  Returns 0, leaving *rcode and *rtype untouched, when 'end'\r
+ * is reached first.\r
+ */\r
+static int\r
+wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,\r
+                      OnigCodePoint* rcode, enum WB_TYPE* rtype)\r
+{\r
+  OnigCodePoint c;\r
+  enum WB_TYPE t;\r
+\r
+  /* always step past the character at p before examining anything */\r
+  for (p += enclen(enc, p); p < end; p += enclen(enc, p)) {\r
+    c = ONIGENC_MBC_TO_CODE(enc, p, end);\r
+    t = wb_get_type(c);\r
+    if (IS_WB_IGNORE_TAIL(t)) continue;\r
+\r
+    *rcode = c;\r
+    *rtype = t;\r
+    return 1;\r
+  }\r
+\r
+  return 0;\r
+}\r
+\r
+/*\r
+ * Decide whether position p is a word boundary, applying the word-break\r
+ * rules named in the comments below (WB1 ... WB999) in order.\r
+ *\r
+ * p     : position being tested (head of the character after the\r
+ *         candidate boundary)\r
+ * prev  : head of the character before p; when NULL it is computed with\r
+ *         onigenc_get_prev_char_head()\r
+ * start : start of the text; p == start is always a break (WB1)\r
+ * end   : end of the text; p == end is always a break (WB2)\r
+ *\r
+ * Returns TRUE when a break is allowed at p, FALSE otherwise.\r
+ * 'from' / 'to' are the types on the left / right of the boundary;\r
+ * 'from2' / 'to2' are the types one main character further out.\r
+ */\r
+extern int\r
+onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,\r
+ const UChar* start, const UChar* end)\r
+{\r
+ int r;\r
+ UChar* pp;\r
+ OnigCodePoint cfrom;\r
+ OnigCodePoint cfrom2;\r
+ OnigCodePoint cto;\r
+ OnigCodePoint cto2;\r
+ enum WB_TYPE from;\r
+ enum WB_TYPE from2;\r
+ enum WB_TYPE to;\r
+ enum WB_TYPE to2;\r
+\r
+ /* WB1: sot / Any */\r
+ if (p == start) return TRUE;\r
+ /* WB2: Any / eot */\r
+ if (p == end) return TRUE;\r
+\r
+ if (IS_NULL(prev)) {\r
+ prev = onigenc_get_prev_char_head(enc, start, p);\r
+ if (IS_NULL(prev)) return TRUE;\r
+ }\r
+\r
+ cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ cto = ONIGENC_MBC_TO_CODE(enc, p, end);\r
+\r
+ from = wb_get_type(cfrom);\r
+ to = wb_get_type(cto);\r
+\r
+ /* short cut: both sides are WB_Any (value 0), go straight to the default rule */\r
+ if (from == 0 && to == 0) goto WB999;\r
+\r
+ /* WB3: CR + LF */\r
+ if (from == WB_CR && to == WB_LF) return FALSE;\r
+\r
+ /* WB3a: (Newline|CR|LF) / */\r
+ if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;\r
+ /* WB3b: / (Newline|CR|LF) */\r
+ if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;\r
+\r
+ /* WB3c: ZWJ + {Extended_Pictographic} */\r
+ if (from == WB_ZWJ) {\r
+ if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r
+ return FALSE;\r
+ }\r
+\r
+ /* WB3d: WSegSpace + WSegSpace */\r
+ if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;\r
+\r
+ /* WB4: X (Extend|Format|ZWJ)* -> X */\r
+ if (IS_WB_IGNORE_TAIL(to)) return FALSE;\r
+ if (IS_WB_IGNORE_TAIL(from)) {\r
+ /* step prev back to the nearest character that is not Extend/Format/ZWJ\r
+ and use its type as 'from'; if none exists, 'from' keeps the type of\r
+ the last character examined */\r
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
+ prev = pp;\r
+ cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ from = wb_get_type(cfrom);\r
+ if (! IS_WB_IGNORE_TAIL(from))\r
+ break;\r
+ }\r
+ }\r
+\r
+ if (IS_WB_AHLetter(from)) {\r
+ /* WB5: AHLetter + AHLetter */\r
+ if (IS_WB_AHLetter(to)) return FALSE;\r
+\r
+ /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */\r
+ if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {\r
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
+ if (r == 1) {\r
+ if (IS_WB_AHLetter(to2)) return FALSE;\r
+ }\r
+ }\r
+ }\r
+\r
+ /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */\r
+ if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {\r
+ if (IS_WB_AHLetter(to)) {\r
+ /* look behind 'from' for the previous non-Extend/Format/ZWJ character;\r
+ note that this moves prev */\r
+ from2 = WB_Any;\r
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
+ prev = pp;\r
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ from2 = wb_get_type(cfrom2);\r
+ if (! IS_WB_IGNORE_TAIL(from2))\r
+ break;\r
+ }\r
+\r
+ if (IS_WB_AHLetter(from2)) return FALSE;\r
+ }\r
+ }\r
+\r
+ if (from == WB_Hebrew_Letter) {\r
+ /* WB7a: Hebrew_Letter + Single_Quote */\r
+ if (to == WB_Single_Quote) return FALSE;\r
+\r
+ /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */\r
+ if (to == WB_Double_Quote) {\r
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
+ if (r == 1) {\r
+ if (to2 == WB_Hebrew_Letter) return FALSE;\r
+ }\r
+ }\r
+ }\r
+\r
+ /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */\r
+ if (from == WB_Double_Quote) {\r
+ if (to == WB_Hebrew_Letter) {\r
+ from2 = WB_Any;\r
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
+ prev = pp;\r
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ from2 = wb_get_type(cfrom2);\r
+ if (! IS_WB_IGNORE_TAIL(from2))\r
+ break;\r
+ }\r
+\r
+ if (from2 == WB_Hebrew_Letter) return FALSE;\r
+ }\r
+ }\r
+\r
+ if (to == WB_Numeric) {\r
+ /* WB8: Numeric + Numeric */\r
+ if (from == WB_Numeric) return FALSE;\r
+\r
+ /* WB9: AHLetter + Numeric */\r
+ if (IS_WB_AHLetter(from)) return FALSE;\r
+\r
+ /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */\r
+ if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {\r
+ from2 = WB_Any;\r
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
+ prev = pp;\r
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ from2 = wb_get_type(cfrom2);\r
+ if (! IS_WB_IGNORE_TAIL(from2))\r
+ break;\r
+ }\r
+\r
+ if (from2 == WB_Numeric) return FALSE;\r
+ }\r
+ }\r
+\r
+ if (from == WB_Numeric) {\r
+ /* WB10: Numeric + AHLetter */\r
+ if (IS_WB_AHLetter(to)) return FALSE;\r
+\r
+ /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */\r
+ if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {\r
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
+ if (r == 1) {\r
+ if (to2 == WB_Numeric) return FALSE;\r
+ }\r
+ }\r
+ }\r
+\r
+ /* WB13: Katakana + Katakana */\r
+ if (from == WB_Katakana && to == WB_Katakana) return FALSE;\r
+\r
+ /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */\r
+ if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana\r
+ || from == WB_ExtendNumLet) {\r
+ if (to == WB_ExtendNumLet) return FALSE;\r
+ }\r
+\r
+ /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */\r
+ if (from == WB_ExtendNumLet) {\r
+ if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)\r
+ return FALSE;\r
+ }\r
+\r
+\r
+ /* WB15: sot (RI RI)* RI + RI */\r
+ /* WB16: [^RI] (RI RI)* RI + RI */\r
+ if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {\r
+ /* n = number of consecutive Regional_Indicator characters immediately\r
+ before prev; when it is even, prev and p form a pair: no break */\r
+ int n = 0;\r
+ while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
+ from2 = wb_get_type(cfrom2);\r
+ if (from2 != WB_Regional_Indicator)\r
+ break;\r
+\r
+ n++;\r
+ }\r
+ if ((n % 2) == 0) return FALSE;\r
+ }\r
+\r
+ WB999:\r
+ /* WB999: Any / Any */\r
+ return TRUE;\r
+}\r
+\r
+#endif /* USE_UNICODE_WORD_BREAK */\r
+\r
+\r
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r
\r
enum EGCB_BREAK_TYPE {\r
\r
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r
if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {\r
- if (from == 0x000d && to == 0x000a) return 0;\r
- else return 1;\r
+ return from != 0x000d || to != 0x000a;\r
}\r
\r
btype = unicode_egcb_is_break_2code(from, to);\r
return 1;\r
\r
#else\r
- if (from == 0x000d && to == 0x000a) return 0;\r
- else return 1;\r
+ return from != 0x000d || to != 0x000a;\r
#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */\r
}\r
\r
int len;\r
int c;\r
char* s;\r
+ UChar* uname;\r
\r
if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)\r
return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;\r
if (s == 0)\r
return ONIGERR_MEMORY;\r
\r
+ uname = (UChar* )name;\r
n = 0;\r
for (i = 0; i < len; i++) {\r
- c = name[i];\r
- if (c <= 0 || c >= 0x80) {\r
+ c = uname[i];\r
+ if (c < 0x20 || c >= 0x80) {\r
xfree(s);\r
return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
}\r