+++ /dev/null
-/**********************************************************************\r
- unicode.c - Oniguruma (regular expression library)\r
-**********************************************************************/\r
-/*-\r
- * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r
- * All rights reserved.\r
- *\r
- * Redistribution and use in source and binary forms, with or without\r
- * modification, are permitted provided that the following conditions\r
- * are met:\r
- * 1. Redistributions of source code must retain the above copyright\r
- * notice, this list of conditions and the following disclaimer.\r
- * 2. Redistributions in binary form must reproduce the above copyright\r
- * notice, this list of conditions and the following disclaimer in the\r
- * documentation and/or other materials provided with the distribution.\r
- *\r
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND\r
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE\r
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS\r
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)\r
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\r
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\r
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r
- * SUCH DAMAGE.\r
- */\r
-\r
-#include "regint.h"\r
-\r
-/* One entry of the property-name lookup table: a name paired with the ctype\r
- * value that the lookup reports for it.\r
- * NOTE(review): `name` is presumably an offset into a string pool defined in\r
- * the included property data file -- confirm there. */\r
-struct PoolPropertyNameCtype {\r
- short int name;\r
- short int ctype;\r
-};\r
-\r
-/* Non-zero when the bit selected by CTYPE_TO_BIT(ctype) is set in the table\r
- * entry for `code`. The table has 256 entries, so `code` must be below 256. */\r
-#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \\r
- ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)\r
-\r
-/* Per-code ctype bit sets for code points 0x00-0xFF, indexed by code point.\r
- * Each entry is tested with CTYPE_TO_BIT(ctype) by the macro\r
- * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE. */\r
-static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {\r
- 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r
- 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,\r
- 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r
- 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,\r
- 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r
- 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r
- 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,\r
- 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,\r
- 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,\r
- 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,\r
- 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,\r
- 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,\r
- 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,\r
- 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,\r
- 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,\r
- 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,\r
- 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,\r
- 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r
- 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r
- 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,\r
- 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,\r
- 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,\r
- 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,\r
- 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,\r
- 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,\r
- 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,\r
- 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,\r
- 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,\r
- 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,\r
- 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,\r
- 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,\r
- 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2\r
-};\r
-\r
-#include "st.h"\r
-\r
-#include "unicode_fold_data.c"\r
-\r
-/*\r
- * Case-fold the single character at *pp and write the encoded result into\r
- * `fold`.\r
- *\r
- * *pp is advanced past the character that was read. When the character has a\r
- * fold entry, each fold code point is encoded with ONIGENC_CODE_TO_MBC;\r
- * otherwise the source bytes are copied unchanged.\r
- *\r
- * Returns the ONIGENC_CODE_TO_MBC result (summed over all fold code points\r
- * for multi-code folds), or the number of bytes copied when there is no fold.\r
- */\r
-extern int\r
-onigenc_unicode_mbc_case_fold(OnigEncoding enc,\r
-  OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,\r
-  UChar* fold)\r
-{\r
-  const UChar* src = *pp;\r
-  const struct ByUnfoldKey* buk;\r
-  OnigCodePoint code;\r
-  int nbytes, k, total;\r
-\r
-  code   = ONIGENC_MBC_TO_CODE(enc, src, end);\r
-  nbytes = enclen(enc, src);\r
-  *pp    = src + nbytes;\r
-\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r
-    /* U+0130 folds to plain 'i' in Turkish/Azeri mode. */\r
-    if (code == 0x0130) {\r
-      return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);\r
-    }\r
-#if 0\r
-    if (code == 0x0049) {\r
-      return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);\r
-    }\r
-#endif\r
-  }\r
-#endif\r
-\r
-  buk = onigenc_unicode_unfold_key(code);\r
-  if (buk == 0) {\r
-    /* No fold entry: the character folds to itself, copy its bytes. */\r
-    for (k = 0; k < nbytes; k++) {\r
-      fold[k] = src[k];\r
-    }\r
-    return nbytes;\r
-  }\r
-\r
-  if (buk->fold_len == 1) {\r
-    return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);\r
-  }\r
-\r
-  {\r
-    /* Multi-code fold: encode every code point back to back. */\r
-    OnigCodePoint* addr;\r
-\r
-    FOLDS_FOLD_ADDR_BUK(buk, addr);\r
-    total = 0;\r
-    for (k = 0; k < buk->fold_len; k++) {\r
-      int written = ONIGENC_CODE_TO_MBC(enc, addr[k], fold);\r
-      fold  += written;\r
-      total += written;\r
-    }\r
-    return total;\r
-  }\r
-}\r
-\r
-/*\r
- * Walk the one-code fold table entries in [from, to) and report every\r
- * relation to the callback `f`: fold <-> unfold in both directions, and every\r
- * pair of unfolds that share the same fold, also in both directions.\r
- *\r
- * Stops at, and returns, the first non-zero callback result; returns 0 when\r
- * every callback returned 0.\r
- */\r
-static int\r
-apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r
-{\r
-  int i = from;\r
-\r
-  while (i < to) {\r
-    OnigCodePoint fold = *FOLDS1_FOLD(i);\r
-    int count = FOLDS1_UNFOLDS_NUM(i);\r
-    int cur, earlier, rc;\r
-\r
-    for (cur = 0; cur < count; cur++) {\r
-      OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[cur];\r
-\r
-      rc = (*f)(fold, &unfold, 1, arg);\r
-      if (rc != 0) return rc;\r
-      rc = (*f)(unfold, &fold, 1, arg);\r
-      if (rc != 0) return rc;\r
-\r
-      /* Pair this unfold with every unfold listed before it. */\r
-      for (earlier = 0; earlier < cur; earlier++) {\r
-        OnigCodePoint other = FOLDS1_UNFOLDS(i)[earlier];\r
-\r
-        rc = (*f)(unfold, &other, 1, arg);\r
-        if (rc != 0) return rc;\r
-        rc = (*f)(other, &unfold, 1, arg);\r
-        if (rc != 0) return rc;\r
-      }\r
-    }\r
-\r
-    i = FOLDS1_NEXT_INDEX(i);\r
-  }\r
-\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Walk the two-code fold table entries in [from, to). For each unfold the\r
- * callback `f` receives unfold -> (2-code fold), followed by the mutual\r
- * relations between this unfold and every earlier unfold of the same entry.\r
- *\r
- * Stops at, and returns, the first non-zero callback result; returns 0 when\r
- * every callback returned 0.\r
- */\r
-static int\r
-apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r
-{\r
-  int i = from;\r
-\r
-  while (i < to) {\r
-    OnigCodePoint* fold = FOLDS2_FOLD(i);\r
-    int count = FOLDS2_UNFOLDS_NUM(i);\r
-    int cur, earlier, rc;\r
-\r
-    for (cur = 0; cur < count; cur++) {\r
-      OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[cur];\r
-\r
-      rc = (*f)(unfold, fold, 2, arg);\r
-      if (rc != 0) return rc;\r
-\r
-      /* Pair this unfold with every unfold listed before it. */\r
-      for (earlier = 0; earlier < cur; earlier++) {\r
-        OnigCodePoint other = FOLDS2_UNFOLDS(i)[earlier];\r
-\r
-        rc = (*f)(unfold, &other, 1, arg);\r
-        if (rc != 0) return rc;\r
-        rc = (*f)(other, &unfold, 1, arg);\r
-        if (rc != 0) return rc;\r
-      }\r
-    }\r
-\r
-    i = FOLDS2_NEXT_INDEX(i);\r
-  }\r
-\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Walk the three-code fold table entries in [from, to). For each unfold the\r
- * callback `f` receives unfold -> (3-code fold), followed by the mutual\r
- * relations between this unfold and every earlier unfold of the same entry.\r
- *\r
- * Stops at, and returns, the first non-zero callback result; returns 0 when\r
- * every callback returned 0.\r
- */\r
-static int\r
-apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)\r
-{\r
-  int i = from;\r
-\r
-  while (i < to) {\r
-    OnigCodePoint* fold = FOLDS3_FOLD(i);\r
-    int count = FOLDS3_UNFOLDS_NUM(i);\r
-    int cur, earlier, rc;\r
-\r
-    for (cur = 0; cur < count; cur++) {\r
-      OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[cur];\r
-\r
-      rc = (*f)(unfold, fold, 3, arg);\r
-      if (rc != 0) return rc;\r
-\r
-      /* Pair this unfold with every unfold listed before it. */\r
-      for (earlier = 0; earlier < cur; earlier++) {\r
-        OnigCodePoint other = FOLDS3_UNFOLDS(i)[earlier];\r
-\r
-        rc = (*f)(unfold, &other, 1, arg);\r
-        if (rc != 0) return rc;\r
-        rc = (*f)(other, &unfold, 1, arg);\r
-        if (rc != 0) return rc;\r
-      }\r
-    }\r
-\r
-    i = FOLDS3_NEXT_INDEX(i);\r
-  }\r
-\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Report every case-fold relation in the Unicode fold tables to the\r
- * callback `f`.\r
- *\r
- * One-code folds are always reported. Two- and three-code folds are reported\r
- * only when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set in `flag`. When the\r
- * library is built with USE_UNICODE_CASE_FOLD_TURKISH_AZERI and `flag` has\r
- * ONIGENC_CASE_FOLD_TURKISH_AZERI set, the hand-written dotted/dotless I\r
- * pairs replace the locale-dependent tail of the one- and two-code tables.\r
- *\r
- * Returns 0 on success, or the first non-zero value returned by `f`.\r
- */\r
-extern int\r
-onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,\r
-  OnigApplyAllCaseFoldFunc f, void* arg)\r
-{\r
-  int r;\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  /* Target code point for the explicit Turkish/Azeri pairs below; it was\r
-     previously used without a declaration, which broke this build option. */\r
-  OnigCodePoint code;\r
-#endif\r
-\r
-  r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);\r
-  if (r != 0) return r;\r
-\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r
-    /* I (U+0049) <-> dotless i (U+0131) */\r
-    code = 0x0131;\r
-    r = (*f)(0x0049, &code, 1, arg);\r
-    if (r != 0) return r;\r
-    code = 0x0049;\r
-    r = (*f)(0x0131, &code, 1, arg);\r
-    if (r != 0) return r;\r
-\r
-    /* i (U+0069) <-> dotted I (U+0130) */\r
-    code = 0x0130;\r
-    r = (*f)(0x0069, &code, 1, arg);\r
-    if (r != 0) return r;\r
-    code = 0x0069;\r
-    r = (*f)(0x0130, &code, 1, arg);\r
-    if (r != 0) return r;\r
-  }\r
-  else {\r
-#endif\r
-    r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);\r
-    if (r != 0) return r;\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  }\r
-#endif\r
-\r
-  /* Everything below concerns multi-code folds only. */\r
-  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)\r
-    return 0;\r
-\r
-  r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);\r
-  if (r != 0) return r;\r
-\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {\r
-#endif\r
-    r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);\r
-    if (r != 0) return r;\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
-  }\r
-#endif\r
-\r
-  r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);\r
-  if (r != 0) return r;\r
-\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Collect the case-fold alternatives for the text starting at `p` into\r
- * `items` and return how many entries were written.\r
- *\r
- * For every entry, byte_len is the number of source bytes the alternative\r
- * stands for and code_len is the number of code points stored in code[].\r
- * Multi-code alternatives, and alternatives that replace two or three source\r
- * characters by one code point, are produced only when `flag` contains\r
- * INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR.\r
- *\r
- * NOTE(review): no bound on `items` is checked here; the caller presumably\r
- * provides an array large enough for the worst case -- confirm at call sites.\r
- */\r
-extern int\r
-onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,\r
- OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,\r
- OnigCaseFoldCodeItem items[])\r
-{\r
- int n, m, i, j, k, len;\r
- OnigCodePoint code, codes[3];\r
- const struct ByUnfoldKey* buk;\r
-\r
- n = 0;\r
-\r
- code = ONIGENC_MBC_TO_CODE(enc, p, end);\r
- len = enclen(enc, p);\r
-\r
-#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r
- /* Turkish/Azeri mode: the four dotted/dotless I code points map only to\r
-    their locale-specific partner and bypass the generic tables. */\r
- if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r
- if (code == 0x0049) {\r
- items[0].byte_len = len;\r
- items[0].code_len = 1;\r
- items[0].code[0] = 0x0131;\r
- return 1;\r
- }\r
- else if (code == 0x0130) {\r
- items[0].byte_len = len;\r
- items[0].code_len = 1;\r
- items[0].code[0] = 0x0069;\r
- return 1;\r
- }\r
- else if (code == 0x0131) {\r
- items[0].byte_len = len;\r
- items[0].code_len = 1;\r
- items[0].code[0] = 0x0049;\r
- return 1;\r
- }\r
- else if (code == 0x0069) {\r
- items[0].byte_len = len;\r
- items[0].code_len = 1;\r
- items[0].code[0] = 0x0130;\r
- return 1;\r
- }\r
- }\r
-#endif\r
-\r
- buk = onigenc_unicode_unfold_key(code);\r
- if (buk != 0) {\r
- /* `code` is an unfolded form: it has a fold of 1, 2 or 3 code points. */\r
- if (buk->fold_len == 1) {\r
- int un;\r
- /* First the fold itself, then every sibling unfold except `code`. */\r
- items[0].byte_len = len;\r
- items[0].code_len = 1;\r
- items[0].code[0] = *FOLDS1_FOLD(buk->index);\r
- n++;\r
-\r
- un = FOLDS1_UNFOLDS_NUM(buk->index);\r
- for (i = 0; i < un; i++) {\r
- OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];\r
- if (unfold != code) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = unfold;\r
- n++;\r
- }\r
- }\r
- code = items[0].code[0]; /* for multi-code to unfold search. */\r
- }\r
- else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {\r
- /* cs[fn] holds the fn-th fold code point followed by its own one-code\r
-    unfolds; ncs[fn] is the number of valid entries in cs[fn]. */\r
- OnigCodePoint cs[3][4];\r
- int fn, ncs[3];\r
-\r
- if (buk->fold_len == 2) {\r
- /* Sibling single-code unfolds that share the same 2-code fold. */\r
- m = FOLDS2_UNFOLDS_NUM(buk->index);\r
- for (i = 0; i < m; i++) {\r
- OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];\r
- if (unfold == code) continue;\r
-\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = unfold;\r
- n++;\r
- }\r
-\r
- for (fn = 0; fn < 2; fn++) {\r
- int index;\r
- cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];\r
- index = onigenc_unicode_fold1_key(&cs[fn][0]);\r
- if (index >= 0) {\r
- /* NOTE: this `m` shadows the function-level `m`. */\r
- int m = FOLDS1_UNFOLDS_NUM(index);\r
- for (i = 0; i < m; i++) {\r
- cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];\r
- }\r
- ncs[fn] = m + 1;\r
- }\r
- else\r
- ncs[fn] = 1;\r
- }\r
-\r
- /* Emit every combination of case variants of the two fold positions. */\r
- for (i = 0; i < ncs[0]; i++) {\r
- for (j = 0; j < ncs[1]; j++) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 2;\r
- items[n].code[0] = cs[0][i];\r
- items[n].code[1] = cs[1][j];\r
- n++;\r
- }\r
- }\r
- }\r
- else { /* fold_len == 3 */\r
- /* Sibling single-code unfolds that share the same 3-code fold. */\r
- m = FOLDS3_UNFOLDS_NUM(buk->index);\r
- for (i = 0; i < m; i++) {\r
- OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];\r
- if (unfold == code) continue;\r
-\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = unfold;\r
- n++;\r
- }\r
-\r
- for (fn = 0; fn < 3; fn++) {\r
- int index;\r
- cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];\r
- index = onigenc_unicode_fold1_key(&cs[fn][0]);\r
- if (index >= 0) {\r
- /* NOTE: this `m` shadows the function-level `m`. */\r
- int m = FOLDS1_UNFOLDS_NUM(index);\r
- for (i = 0; i < m; i++) {\r
- cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];\r
- }\r
- ncs[fn] = m + 1;\r
- }\r
- else\r
- ncs[fn] = 1;\r
- }\r
-\r
- /* Emit every combination of case variants of the three fold positions. */\r
- for (i = 0; i < ncs[0]; i++) {\r
- for (j = 0; j < ncs[1]; j++) {\r
- for (k = 0; k < ncs[2]; k++) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 3;\r
- items[n].code[0] = cs[0][i];\r
- items[n].code[1] = cs[1][j];\r
- items[n].code[2] = cs[2][k];\r
- n++;\r
- }\r
- }\r
- }\r
- }\r
-\r
- /* multi char folded code is not head of another folded multi char */\r
- return n;\r
- }\r
- }\r
- else {\r
- /* `code` is not an unfolded form; it may itself be a one-code fold, in\r
-    which case all of its unfolds are alternatives. */\r
- int index = onigenc_unicode_fold1_key(&code);\r
- if (index >= 0) {\r
- int m = FOLDS1_UNFOLDS_NUM(index);\r
- for (i = 0; i < m; i++) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = FOLDS1_UNFOLDS(index)[i];\r
- n++;\r
- }\r
- }\r
- }\r
-\r
- if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)\r
- return n;\r
-\r
- /* Look ahead: the (folded) current character followed by one or two more\r
-    folded characters may be the multi-code fold of a single code point. */\r
- p += len;\r
- if (p < end) {\r
- int clen;\r
- int index;\r
-\r
- codes[0] = code;\r
- code = ONIGENC_MBC_TO_CODE(enc, p, end);\r
-\r
- buk = onigenc_unicode_unfold_key(code);\r
- if (buk != 0 && buk->fold_len == 1) {\r
- codes[1] = *FOLDS1_FOLD(buk->index);\r
- }\r
- else\r
- codes[1] = code;\r
-\r
- clen = enclen(enc, p);\r
- len += clen;\r
-\r
- /* byte_len now covers both source characters. */\r
- index = onigenc_unicode_fold2_key(codes);\r
- if (index >= 0) {\r
- m = FOLDS2_UNFOLDS_NUM(index);\r
- for (i = 0; i < m; i++) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = FOLDS2_UNFOLDS(index)[i];\r
- n++;\r
- }\r
- }\r
-\r
- p += clen;\r
- if (p < end) {\r
- code = ONIGENC_MBC_TO_CODE(enc, p, end);\r
- buk = onigenc_unicode_unfold_key(code);\r
- if (buk != 0 && buk->fold_len == 1) {\r
- codes[2] = *FOLDS1_FOLD(buk->index);\r
- }\r
- else\r
- codes[2] = code;\r
-\r
- clen = enclen(enc, p);\r
- len += clen;\r
-\r
- /* byte_len now covers all three source characters. */\r
- index = onigenc_unicode_fold3_key(codes);\r
- if (index >= 0) {\r
- m = FOLDS3_UNFOLDS_NUM(index);\r
- for (i = 0; i < m; i++) {\r
- items[n].byte_len = len;\r
- items[n].code_len = 1;\r
- items[n].code[0] = FOLDS3_UNFOLDS(index)[i];\r
- n++;\r
- }\r
- }\r
- }\r
- }\r
-\r
- return n;\r
-}\r
-\r
-#ifdef USE_UNICODE_PROPERTIES\r
-#include "unicode_property_data.c"\r
-#else\r
-#include "unicode_property_data_posix.c"\r
-#endif\r
-\r
-\r
-#ifdef USE_UNICODE_WORD_BREAK\r
-\r
-/* Word-break classes returned by wb_get_type(). WB_Any (0) is the value used\r
- * for code points that fall in no listed range. */\r
-enum WB_TYPE {\r
- WB_Any = 0,\r
- WB_ALetter,\r
- WB_CR,\r
- WB_Double_Quote,\r
- WB_Extend,\r
- WB_ExtendNumLet,\r
- WB_Format,\r
- WB_Hebrew_Letter,\r
- WB_Katakana,\r
- WB_LF,\r
- WB_MidLetter,\r
- WB_MidNum,\r
- WB_MidNumLet,\r
- WB_Newline,\r
- WB_Numeric,\r
- WB_Regional_Indicator,\r
- WB_Single_Quote,\r
- WB_WSegSpace,\r
- WB_ZWJ,\r
-};\r
-\r
-/* One row of the word-break range table: the inclusive code point range\r
- * [start, end] and the class assigned to it. */\r
-typedef struct {\r
- OnigCodePoint start;\r
- OnigCodePoint end;\r
- enum WB_TYPE type;\r
-} WB_RANGE_TYPE;\r
-\r
-#include "unicode_wb_data.c"\r
-\r
-/*\r
- * Look up the word-break class of `code` by binary search over WB_RANGES.\r
- * The search finds the first range whose `end` is not below `code`; the code\r
- * point belongs to it only if it is also at or above that range's `start`.\r
- * Returns WB_Any when no range contains `code`.\r
- */\r
-static enum WB_TYPE\r
-wb_get_type(OnigCodePoint code)\r
-{\r
-  OnigCodePoint lo = 0;\r
-  OnigCodePoint hi = (OnigCodePoint )WB_RANGE_NUM;\r
-\r
-  while (lo < hi) {\r
-    OnigCodePoint mid = (lo + hi) >> 1;\r
-\r
-    if (WB_RANGES[mid].end < code)\r
-      lo = mid + 1;\r
-    else\r
-      hi = mid;\r
-  }\r
-\r
-  if (lo >= (OnigCodePoint )WB_RANGE_NUM)\r
-    return WB_Any;\r
-  if (code < WB_RANGES[lo].start)\r
-    return WB_Any;\r
-\r
-  return WB_RANGES[lo].type;\r
-}\r
-\r
-/* Class predicates used by the word-break rules below.\r
- * IGNORE_TAIL: Extend, Format or ZWJ (the classes skipped by rule WB4).\r
- * AHLetter:    ALetter or Hebrew_Letter.\r
- * MidNumLetQ:  MidNumLet or Single_Quote. */\r
-#define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)\r
-#define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)\r
-#define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)\r
-\r
-/*\r
- * Starting from the character AFTER the one at `p`, find the next character\r
- * before `end` whose word-break class is not Extend/Format/ZWJ.\r
- *\r
- * On success stores its code point in *rcode and its class in *rtype and\r
- * returns 1. Returns 0, leaving the outputs untouched, when `end` is reached\r
- * first.\r
- */\r
-static int\r
-wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,\r
-  OnigCodePoint* rcode, enum WB_TYPE* rtype)\r
-{\r
-  for (p += enclen(enc, p); p < end; p += enclen(enc, p)) {\r
-    OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, p, end);\r
-    enum WB_TYPE type = wb_get_type(code);\r
-\r
-    if (IS_WB_IGNORE_TAIL(type))\r
-      continue;\r
-\r
-    *rcode = code;\r
-    *rtype = type;\r
-    return 1;\r
-  }\r
-\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Decide whether there is a word boundary between the character at `prev`\r
- * and the character at `p`. The rule labels in the comments (WB1..WB999) are\r
- * those used by the function itself.\r
- *\r
- * p     - position being tested.\r
- * prev  - head of the character before `p`; when NULL it is computed with\r
- *         onigenc_get_prev_char_head().\r
- * start - start of the text; backward scans stop here.\r
- * end   - end of the text; forward scans stop here.\r
- *\r
- * Returns TRUE when a break is allowed at `p`, FALSE otherwise.\r
- *\r
- * `prev` is moved backwards by several rules, so the order of the checks\r
- * below matters.\r
- */\r
-extern int\r
-onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,\r
- const UChar* start, const UChar* end)\r
-{\r
- int r;\r
- UChar* pp;\r
- OnigCodePoint cfrom;\r
- OnigCodePoint cfrom2;\r
- OnigCodePoint cto;\r
- OnigCodePoint cto2;\r
- enum WB_TYPE from;\r
- enum WB_TYPE from2;\r
- enum WB_TYPE to;\r
- enum WB_TYPE to2;\r
-\r
- /* WB1: sot / Any */\r
- if (p == start) return TRUE;\r
- /* WB2: Any / eot */\r
- if (p == end) return TRUE;\r
-\r
- if (IS_NULL(prev)) {\r
- prev = onigenc_get_prev_char_head(enc, start, p);\r
- if (IS_NULL(prev)) return TRUE;\r
- }\r
-\r
- cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- cto = ONIGENC_MBC_TO_CODE(enc, p, end);\r
-\r
- from = wb_get_type(cfrom);\r
- to = wb_get_type(cto);\r
-\r
- /* short cut */\r
- /* Both sides are WB_Any (0): no special rule can apply. */\r
- if (from == 0 && to == 0) goto WB999;\r
-\r
- /* WB3: CR + LF */\r
- if (from == WB_CR && to == WB_LF) return FALSE;\r
-\r
- /* WB3a: (Newline|CR|LF) / */\r
- if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;\r
- /* WB3b: / (Newline|CR|LF) */\r
- if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;\r
-\r
- /* WB3c: ZWJ + {Extended_Pictographic} */\r
- if (from == WB_ZWJ) {\r
- if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r
- return FALSE;\r
- }\r
-\r
- /* WB3d: WSegSpace + WSegSpace */\r
- if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;\r
-\r
- /* WB4: X (Extend|Format|ZWJ)* -> X */\r
- if (IS_WB_IGNORE_TAIL(to)) return FALSE;\r
- if (IS_WB_IGNORE_TAIL(from)) {\r
- /* Walk back to the character the ignorable run is attached to; `from`\r
-    and `prev` now describe that character (or the first character of\r
-    the text if the run reaches `start`). */\r
- while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
- prev = pp;\r
- cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- from = wb_get_type(cfrom);\r
- if (! IS_WB_IGNORE_TAIL(from))\r
- break;\r
- }\r
- }\r
-\r
- if (IS_WB_AHLetter(from)) {\r
- /* WB5: AHLetter + AHLetter */\r
- if (IS_WB_AHLetter(to)) return FALSE;\r
-\r
- /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */\r
- if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {\r
- r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
- if (r == 1) {\r
- if (IS_WB_AHLetter(to2)) return FALSE;\r
- }\r
- }\r
- }\r
-\r
- /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */\r
- if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {\r
- if (IS_WB_AHLetter(to)) {\r
- /* from2 is the class of the nearest non-ignorable character before\r
-    `prev`; it stays WB_Any when there is none. */\r
- from2 = WB_Any;\r
- while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
- prev = pp;\r
- cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- from2 = wb_get_type(cfrom2);\r
- if (! IS_WB_IGNORE_TAIL(from2))\r
- break;\r
- }\r
-\r
- if (IS_WB_AHLetter(from2)) return FALSE;\r
- }\r
- }\r
-\r
- if (from == WB_Hebrew_Letter) {\r
- /* WB7a: Hebrew_Letter + Single_Quote */\r
- if (to == WB_Single_Quote) return FALSE;\r
-\r
- /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */\r
- if (to == WB_Double_Quote) {\r
- r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
- if (r == 1) {\r
- if (to2 == WB_Hebrew_Letter) return FALSE;\r
- }\r
- }\r
- }\r
-\r
- /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */\r
- if (from == WB_Double_Quote) {\r
- if (to == WB_Hebrew_Letter) {\r
- from2 = WB_Any;\r
- while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
- prev = pp;\r
- cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- from2 = wb_get_type(cfrom2);\r
- if (! IS_WB_IGNORE_TAIL(from2))\r
- break;\r
- }\r
-\r
- if (from2 == WB_Hebrew_Letter) return FALSE;\r
- }\r
- }\r
-\r
- if (to == WB_Numeric) {\r
- /* WB8: Numeric + Numeric */\r
- if (from == WB_Numeric) return FALSE;\r
-\r
- /* WB9: AHLetter + Numeric */\r
- if (IS_WB_AHLetter(from)) return FALSE;\r
-\r
- /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */\r
- if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {\r
- from2 = WB_Any;\r
- while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
- prev = pp;\r
- cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- from2 = wb_get_type(cfrom2);\r
- if (! IS_WB_IGNORE_TAIL(from2))\r
- break;\r
- }\r
-\r
- if (from2 == WB_Numeric) return FALSE;\r
- }\r
- }\r
-\r
- if (from == WB_Numeric) {\r
- /* WB10: Numeric + AHLetter */\r
- if (IS_WB_AHLetter(to)) return FALSE;\r
-\r
- /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */\r
- if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {\r
- r = wb_get_next_main_code(enc, p, end, &cto2, &to2);\r
- if (r == 1) {\r
- if (to2 == WB_Numeric) return FALSE;\r
- }\r
- }\r
- }\r
-\r
- /* WB13: Katakana + Katakana */\r
- if (from == WB_Katakana && to == WB_Katakana) return FALSE;\r
-\r
- /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */\r
- if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana\r
- || from == WB_ExtendNumLet) {\r
- if (to == WB_ExtendNumLet) return FALSE;\r
- }\r
-\r
- /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */\r
- if (from == WB_ExtendNumLet) {\r
- if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)\r
- return FALSE;\r
- }\r
-\r
-\r
- /* WB15: sot (RI RI)* RI + RI */\r
- /* WB16: [^RI] (RI RI)* RI + RI */\r
- if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {\r
- /* n counts the Regional_Indicator characters that precede `prev`; an\r
-    even count means `prev` and `p` form a pair, so there is no break. */\r
- int n = 0;\r
- while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {\r
- cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
- from2 = wb_get_type(cfrom2);\r
- if (from2 != WB_Regional_Indicator)\r
- break;\r
-\r
- n++;\r
- }\r
- if ((n % 2) == 0) return FALSE;\r
- }\r
-\r
- WB999:\r
- /* WB999: Any / Any */\r
- return TRUE;\r
-}\r
-\r
-#endif /* USE_UNICODE_WORD_BREAK */\r
-\r
-\r
-#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r
-\r
-/* Result of the two-code-point grapheme cluster check. The two UNDEF values\r
- * mean the decision needs the preceding context and is finished by the\r
- * caller, onigenc_egcb_is_break_position(). */\r
-enum EGCB_BREAK_TYPE {\r
- EGCB_NOT_BREAK = 0,\r
- EGCB_BREAK = 1,\r
- EGCB_BREAK_UNDEF_GB11 = 2,\r
- EGCB_BREAK_UNDEF_RI_RI = 3\r
-};\r
-\r
-/* Grapheme-cluster-break classes returned by egcb_get_type().\r
- * The numeric values matter: IS_CONTROL_CR_LF() relies on CR..Control being\r
- * the contiguous range 1..3, and IS_HANGUL() relies on every Hangul class\r
- * being >= EGCB_L with nothing else above it. */\r
-enum EGCB_TYPE {\r
- EGCB_Other = 0,\r
- EGCB_CR = 1,\r
- EGCB_LF = 2,\r
- EGCB_Control = 3,\r
- EGCB_Extend = 4,\r
- EGCB_Prepend = 5,\r
- EGCB_Regional_Indicator = 6,\r
- EGCB_SpacingMark = 7,\r
- EGCB_ZWJ = 8,\r
-#if 0\r
- /* obsoleted */\r
- EGCB_E_Base = 9,\r
- EGCB_E_Base_GAZ = 10,\r
- EGCB_E_Modifier = 11,\r
- EGCB_Glue_After_Zwj = 12,\r
-#endif\r
- EGCB_L = 13,\r
- EGCB_LV = 14,\r
- EGCB_LVT = 15,\r
- EGCB_T = 16,\r
- EGCB_V = 17\r
-};\r
-\r
-/* One row of the grapheme-cluster-break range table: the inclusive code\r
- * point range [start, end] and the class assigned to it. */\r
-typedef struct {\r
- OnigCodePoint start;\r
- OnigCodePoint end;\r
- enum EGCB_TYPE type;\r
-} EGCB_RANGE_TYPE;\r
-\r
-#include "unicode_egcb_data.c"\r
-\r
-/*\r
- * Look up the grapheme-cluster-break class of `code` by binary search over\r
- * EGCB_RANGES. The search finds the first range whose `end` is not below\r
- * `code`; the code point belongs to it only if it is also at or above that\r
- * range's `start`. Returns EGCB_Other when no range contains `code`.\r
- */\r
-static enum EGCB_TYPE\r
-egcb_get_type(OnigCodePoint code)\r
-{\r
-  OnigCodePoint lo = 0;\r
-  OnigCodePoint hi = (OnigCodePoint )EGCB_RANGE_NUM;\r
-\r
-  while (lo < hi) {\r
-    OnigCodePoint mid = (lo + hi) >> 1;\r
-\r
-    if (EGCB_RANGES[mid].end < code)\r
-      lo = mid + 1;\r
-    else\r
-      hi = mid;\r
-  }\r
-\r
-  if (lo >= (OnigCodePoint )EGCB_RANGE_NUM)\r
-    return EGCB_Other;\r
-  if (code < EGCB_RANGES[lo].start)\r
-    return EGCB_Other;\r
-\r
-  return EGCB_RANGES[lo].type;\r
-}\r
-\r
-/* Range tests on enum EGCB_TYPE values (the argument is a class, not a code\r
- * point): CR, LF or Control; and any Hangul class (L, LV, LVT, T, V). */\r
-#define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)\r
-#define IS_HANGUL(code) ((code) >= EGCB_L)\r
-\r
-/* GB1 and GB2 are outside of this function. */\r
-/*\r
- * Apply the grapheme cluster rules that can be decided from two adjacent\r
- * code points alone.\r
- *\r
- * Returns EGCB_NOT_BREAK or EGCB_BREAK when the answer is final, or one of\r
- * the EGCB_BREAK_UNDEF_* values when the caller has to inspect the text\r
- * before `from_code` (rule GB11, and the Regional_Indicator rules GB12/GB13).\r
- */\r
-static enum EGCB_BREAK_TYPE\r
-unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)\r
-{\r
-  enum EGCB_TYPE from = egcb_get_type(from_code);\r
-  enum EGCB_TYPE to   = egcb_get_type(to_code);\r
-\r
-  /* Fast path: two unclassified code points always break (GB999). */\r
-  if (from == EGCB_Other && to == EGCB_Other)\r
-    return EGCB_BREAK;\r
-\r
-  /* GB3 */\r
-  if (from == EGCB_CR && to == EGCB_LF)\r
-    return EGCB_NOT_BREAK;\r
-\r
-  /* GB4, GB5 */\r
-  if (IS_CONTROL_CR_LF(from) || IS_CONTROL_CR_LF(to))\r
-    return EGCB_BREAK;\r
-\r
-  if (IS_HANGUL(from) && IS_HANGUL(to)) {\r
-    switch (from) {\r
-    case EGCB_L:    /* GB6 */\r
-      if (to != EGCB_T) return EGCB_NOT_BREAK;\r
-      break;\r
-\r
-    case EGCB_LV:   /* GB7 */\r
-    case EGCB_V:\r
-      if (to == EGCB_V || to == EGCB_T) return EGCB_NOT_BREAK;\r
-      break;\r
-\r
-    case EGCB_LVT:  /* GB8 */\r
-    case EGCB_T:\r
-      if (to == EGCB_T) return EGCB_NOT_BREAK;\r
-      break;\r
-\r
-    default:\r
-      break;\r
-    }\r
-\r
-    return EGCB_BREAK;\r
-  }\r
-\r
-  /* GB9, GB9a */\r
-  if (to == EGCB_Extend || to == EGCB_ZWJ || to == EGCB_SpacingMark)\r
-    return EGCB_NOT_BREAK;\r
-\r
-  /* GB9b */\r
-  if (from == EGCB_Prepend)\r
-    return EGCB_NOT_BREAK;\r
-\r
-  /* GB10 removed */\r
-\r
-  /* GB11 */\r
-  if (from == EGCB_ZWJ) {\r
-    if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r
-      return EGCB_BREAK_UNDEF_GB11;\r
-\r
-    return EGCB_BREAK;\r
-  }\r
-\r
-  /* GB12, GB13 */\r
-  if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator)\r
-    return EGCB_BREAK_UNDEF_RI_RI;\r
-\r
-  /* GB999 */\r
-  return EGCB_BREAK;\r
-}\r
-\r
-#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */\r
-\r
-/*\r
- * Decide whether there is an extended grapheme cluster boundary between the\r
- * character at `prev` and the character at `p`.\r
- *\r
- * `prev` may be NULL, in which case it is computed from `start` and `p`.\r
- * Without USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER, or for an encoding that is\r
- * not a Unicode encoding, the only non-break position is between CR and LF.\r
- *\r
- * Returns 1 when a break is allowed at `p`, 0 otherwise.\r
- */\r
-extern int\r
-onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,\r
-  const UChar* start, const UChar* end)\r
-{\r
-  OnigCodePoint from;\r
-  OnigCodePoint to;\r
-#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r
-  enum EGCB_BREAK_TYPE btype;\r
-  enum EGCB_TYPE type;\r
-  int n;\r
-#endif\r
-\r
-  /* GB1 and GB2 */\r
-  if (p == start || p == end) return 1;\r
-\r
-  if (IS_NULL(prev)) {\r
-    prev = onigenc_get_prev_char_head(enc, start, p);\r
-    if (IS_NULL(prev)) return 1;\r
-  }\r
-\r
-  from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
-  to   = ONIGENC_MBC_TO_CODE(enc, p, end);\r
-\r
-#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER\r
-  if (ONIGENC_IS_UNICODE_ENCODING(enc)) {\r
-    btype = unicode_egcb_is_break_2code(from, to);\r
-\r
-    if (btype == EGCB_NOT_BREAK) return 0;\r
-\r
-    if (btype == EGCB_BREAK_UNDEF_GB11) {\r
-      /* Scan back over Extend characters looking for the pictographic\r
-         base that makes "base Extend* ZWJ x pictographic" one cluster. */\r
-      for (;;) {\r
-        prev = onigenc_get_prev_char_head(enc, start, prev);\r
-        if (prev == NULL) break;\r
-\r
-        from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
-        if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))\r
-          return 0;\r
-\r
-        type = egcb_get_type(from);\r
-        if (type != EGCB_Extend) break;\r
-      }\r
-    }\r
-    else if (btype == EGCB_BREAK_UNDEF_RI_RI) {\r
-      /* Count the Regional_Indicator characters before `prev`; an even\r
-         count means `prev` and `p` form a pair and must not be split. */\r
-      n = 0;\r
-      for (;;) {\r
-        prev = onigenc_get_prev_char_head(enc, start, prev);\r
-        if (prev == NULL) break;\r
-\r
-        from = ONIGENC_MBC_TO_CODE(enc, prev, end);\r
-        type = egcb_get_type(from);\r
-        if (type != EGCB_Regional_Indicator) break;\r
-\r
-        n++;\r
-      }\r
-      if ((n % 2) == 0) return 0;\r
-    }\r
-\r
-    return 1;\r
-  }\r
-#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */\r
-\r
-  /* Fallback rule: break everywhere except inside a CR LF pair. */\r
-  return !(from == 0x000d && to == 0x000a);\r
-}\r
-\r
-\r
-/* Upper bound on the number of properties that can be registered with\r
- * onig_unicode_define_user_property(). */\r
-#define USER_DEFINED_PROPERTY_MAX_NUM 20\r
-\r
-/* A registered user property: the ctype number assigned to it and the\r
- * caller-supplied code range array (stored by pointer, not copied). */\r
-typedef struct {\r
- int ctype;\r
- OnigCodePoint* ranges;\r
-} UserDefinedPropertyValue;\r
-\r
-/* Number of slots of UserDefinedPropertyRanges currently in use. */\r
-static int UserDefinedPropertyNum;\r
-static UserDefinedPropertyValue\r
-UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];\r
-/* Normalized property name -> UserDefinedPropertyValue*; created on first\r
- * registration. */\r
-static st_table* UserDefinedPropertyTable;\r
-\r
-/*\r
- * Register a user-defined character property.\r
- *\r
- * name   - property name; only characters 0x20..0x7f are accepted, and\r
- *          ' ', '-' and '_' are dropped before the name is stored.\r
- * ranges - code range data for the property. The pointer is stored as is\r
- *          (not copied), so it must outlive every use of the property.\r
- *\r
- * Returns 0 on success, or a negative ONIGERR_* code:\r
- *   ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS - all slots are already in use\r
- *   ONIGERR_TOO_LONG_PROPERTY_NAME        - name length >= PROPERTY_NAME_MAX_SIZE\r
- *   ONIGERR_INVALID_CHAR_PROPERTY_NAME    - name has a character outside 0x20..0x7f\r
- *   ONIGERR_MEMORY                        - allocation failure\r
- *   or the negative value returned by onig_st_insert_strend().\r
- */\r
-extern int\r
-onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)\r
-{\r
-  UserDefinedPropertyValue* e;\r
-  int r;\r
-  int i;\r
-  int n;\r
-  int len;\r
-  int c;\r
-  char* s;\r
-  UChar* uname;\r
-\r
-  if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)\r
-    return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;\r
-\r
-  len = (int )strlen_s(name,MAX_STRING_SIZE);\r
-  if (len >= PROPERTY_NAME_MAX_SIZE)\r
-    return ONIGERR_TOO_LONG_PROPERTY_NAME;\r
-\r
-  s = (char* )xmalloc(len + 1);\r
-  if (s == 0)\r
-    return ONIGERR_MEMORY;\r
-\r
-  /* Build the normalized key: validate each byte and drop separators. */\r
-  uname = (UChar* )name;\r
-  n = 0;\r
-  for (i = 0; i < len; i++) {\r
-    c = uname[i];\r
-    if (c < 0x20 || c >= 0x80) {\r
-      xfree(s);\r
-      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
-    }\r
-\r
-    if (c != ' ' && c != '-' && c != '_') {\r
-      s[n] = c;\r
-      n++;\r
-    }\r
-  }\r
-  s[n] = '\0';\r
-\r
-  if (UserDefinedPropertyTable == 0) {\r
-    UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);\r
-    if (IS_NULL(UserDefinedPropertyTable)) {\r
-      xfree(s);\r
-      return ONIGERR_MEMORY;\r
-    }\r
-  }\r
-\r
-  /* Fill the next free slot; it only becomes visible once the counter is\r
-     incremented after a successful insert. */\r
-  e = UserDefinedPropertyRanges + UserDefinedPropertyNum;\r
-  e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;\r
-  e->ranges = ranges;\r
-  r = onig_st_insert_strend(UserDefinedPropertyTable,\r
-                            (const UChar* )s, (const UChar* )s + n,\r
-                            (hash_data_type )((void* )e));\r
-  if (r < 0) {\r
-    /* The insert failed, so the key buffer was never handed to the table;\r
-       release it instead of leaking it.\r
-       NOTE(review): assumes a negative return means the table kept no\r
-       reference to `s` -- confirm against st.c. */\r
-    xfree(s);\r
-    return r;\r
-  }\r
-\r
-  UserDefinedPropertyNum++;\r
-  return 0;\r
-}\r
-\r
-/*\r
- * Test whether `code` belongs to the character type `ctype`.\r
- *\r
- * Code points below 256 are answered from the ISO-8859-1 bit table (with\r
- * USE_UNICODE_PROPERTIES only when ctype <= ONIGENC_MAX_STD_CTYPE). Other\r
- * queries consult the built-in CodeRanges table, or the user-defined\r
- * property ranges when ctype >= CODE_RANGES_NUM.\r
- *\r
- * Returns the membership result, or ONIGERR_TYPE_BUG when `ctype` names a\r
- * user-defined property that has not been registered.\r
- */\r
-extern int\r
-onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)\r
-{\r
-  int use_latin1_table = (code < 256);\r
-\r
-#ifdef USE_UNICODE_PROPERTIES\r
-  /* The bit table only covers the standard ctypes. */\r
-  if (ctype > ONIGENC_MAX_STD_CTYPE)\r
-    use_latin1_table = 0;\r
-#endif\r
-\r
-  if (use_latin1_table)\r
-    return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);\r
-\r
-  if (ctype < CODE_RANGES_NUM)\r
-    return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);\r
-\r
-  {\r
-    /* User-defined properties are numbered from CODE_RANGES_NUM upwards. */\r
-    int index = ctype - CODE_RANGES_NUM;\r
-\r
-    if (index >= UserDefinedPropertyNum)\r
-      return ONIGERR_TYPE_BUG;\r
-\r
-    return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);\r
-  }\r
-}\r
-\r
-\r
-/*\r
- * Store in *ranges the code range table for `ctype`: a built-in table when\r
- * ctype < CODE_RANGES_NUM, otherwise the registered user-defined property.\r
- *\r
- * Returns 0 on success, or ONIGERR_TYPE_BUG (leaving *ranges untouched) when\r
- * `ctype` names a user-defined property that has not been registered.\r
- */\r
-extern int\r
-onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])\r
-{\r
-  if (ctype < CODE_RANGES_NUM) {\r
-    *ranges = CodeRanges[ctype];\r
-    return 0;\r
-  }\r
-  else {\r
-    int index = ctype - CODE_RANGES_NUM;\r
-\r
-    if (index >= UserDefinedPropertyNum)\r
-      return ONIGERR_TYPE_BUG;\r
-\r
-    *ranges = UserDefinedPropertyRanges[index].ranges;\r
-    return 0;\r
-  }\r
-}\r
-\r
-/*\r
- * Code range lookup wrapper: sets *sb_out to 0 and then delegates to\r
- * onigenc_unicode_ctype_code_range(), returning its result.\r
- */\r
-extern int\r
-onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,\r
-  const OnigCodePoint* ranges[])\r
-{\r
-  int r;\r
-\r
-  *sb_out = 0x00;\r
-  r = onigenc_unicode_ctype_code_range(ctype, ranges);\r
-  return r;\r
-}\r
-\r
-/*\r
- * Translate the property name in [name, end) into a ctype number.\r
- *\r
- * The name is normalized by dropping ' ', '-' and '_'. User-defined\r
- * properties are consulted first, then the built-in property name table.\r
- *\r
- * Returns the ctype, or ONIGERR_INVALID_CHAR_PROPERTY_NAME when the name\r
- * contains a code point >= 0x80, does not fit in PROPERTY_NAME_MAX_SIZE, is\r
- * unknown, or (without USE_UNICODE_PROPERTIES) is not a standard ctype.\r
- */\r
-extern int\r
-onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)\r
-{\r
-  char buf[PROPERTY_NAME_MAX_SIZE];\r
-  const struct PoolPropertyNameCtype* entry;\r
-  UChar* cur;\r
-  int n = 0;\r
-\r
-  for (cur = name; cur < end; cur += enclen(enc, cur)) {\r
-    OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, cur, end);\r
-\r
-    if (c >= 0x80)\r
-      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
-\r
-    if (c == ' ' || c == '-' || c == '_')\r
-      continue;\r
-\r
-    buf[n] = (char )c;\r
-    n++;\r
-    /* Keep one byte free for the terminator written below. */\r
-    if (n >= PROPERTY_NAME_MAX_SIZE)\r
-      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
-  }\r
-\r
-  buf[n] = 0;\r
-\r
-  if (UserDefinedPropertyTable != 0) {\r
-    UserDefinedPropertyValue* user = (UserDefinedPropertyValue* )NULL;\r
-\r
-    onig_st_lookup_strend(UserDefinedPropertyTable,\r
-                          (const UChar* )buf, (const UChar* )buf + n,\r
-                          (hash_data_type* )((void* )(&user)));\r
-    if (user != 0)\r
-      return user->ctype;\r
-  }\r
-\r
-  entry = unicode_lookup_property_name(buf, n);\r
-  if (entry == 0)\r
-    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
-\r
-#ifndef USE_UNICODE_PROPERTIES\r
-  if (entry->ctype > ONIGENC_MAX_STD_CTYPE)\r
-    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;\r
-#endif\r
-\r
-  return (int )entry->ctype;\r
-}\r