MdeModulePkg/Universal/RegularExpressionDxe/Oniguruma/unicode.c

   1 /**********************************************************************
   2   unicode.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29
  30 #include "regint.h"
  31
/*
 * Entry type of the property-name lookup: unicode_lookup_property_name()
 * hands back a pointer to one of these when a name is recognised.
 */
struct PoolPropertyNameCtype {
  short int name;   /* name reference; presumably an offset into the generated name pool -- TODO confirm */
  short int ctype;  /* ctype number reported to callers for the matched name */
};
  36
/*
 * Non-zero when the bit selected by CTYPE_TO_BIT(ctype) is set in the
 * ISO-8859-1 ctype table entry for `code`.  `code` is used directly as an
 * index into a 256-entry table, so the caller must guarantee code < 256.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
  39
/*
 * Ctype bit masks for code points 0x00-0xFF, one entry per code point
 * (eight entries per row).  Each entry is tested against
 * CTYPE_TO_BIT(ctype) by ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE().
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
  74
  75 #include "st.h"
  76
  77 #include "unicode_fold_data.c"
  78
  79 extern int
  80 onigenc_unicode_mbc_case_fold(OnigEncoding enc,
  81     OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
  82     UChar* fold)
  83 {
  84   const struct ByUnfoldKey* buk;
  85
  86   OnigCodePoint code;
  87   int i, len, rlen;
  88   const UChar *p = *pp;
  89
  90   code = ONIGENC_MBC_TO_CODE(enc, p, end);
  91   len = enclen(enc, p);
  92   *pp += len;
  93
  94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  95   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
  96     if (code == 0x0130) {
  97       return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
  98     }
  99 #if 0
 100     if (code == 0x0049) {
 101       return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
 102     }
 103 #endif
 104   }
 105 #endif
 106
 107   buk = onigenc_unicode_unfold_key(code);
 108   if (buk != 0) {
 109     if (buk->fold_len == 1) {
 110       return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
 111     }
 112     else {
 113       OnigCodePoint* addr;
 114
 115       FOLDS_FOLD_ADDR_BUK(buk, addr);
 116       rlen = 0;
 117       for (i = 0; i < buk->fold_len; i++) {
 118         OnigCodePoint c = addr[i];
 119         len = ONIGENC_CODE_TO_MBC(enc, c, fold);
 120         fold += len;
 121         rlen += len;
 122       }
 123       return rlen;
 124     }
 125   }
 126
 127   for (i = 0; i < len; i++) {
 128     *fold++ = *p++;
 129   }
 130   return len;
 131 }
 132
 133 static int
 134 apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
 135 {
 136   int i, j, k, n, r;
 137
 138   for (i = from; i < to; ) {
 139     OnigCodePoint fold = *FOLDS1_FOLD(i);
 140     n = FOLDS1_UNFOLDS_NUM(i);
 141     for (j = 0; j < n; j++) {
 142       OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
 143
 144       r = (*f)(fold, &unfold, 1, arg);
 145       if (r != 0) return r;
 146       r = (*f)(unfold, &fold, 1, arg);
 147       if (r != 0) return r;
 148
 149       for (k = 0; k < j; k++) {
 150         OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
 151         r = (*f)(unfold, &unfold2, 1, arg);
 152         if (r != 0) return r;
 153         r = (*f)(unfold2, &unfold, 1, arg);
 154         if (r != 0) return r;
 155       }
 156     }
 157
 158     i = FOLDS1_NEXT_INDEX(i);
 159   }
 160
 161   return 0;
 162 }
 163
 164 static int
 165 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
 166 {
 167   int i, j, k, n, r;
 168
 169   for (i = from; i < to; ) {
 170     OnigCodePoint* fold = FOLDS2_FOLD(i);
 171     n = FOLDS2_UNFOLDS_NUM(i);
 172     for (j = 0; j < n; j++) {
 173       OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
 174
 175       r = (*f)(unfold, fold, 2, arg);
 176       if (r != 0) return r;
 177
 178       for (k = 0; k < j; k++) {
 179         OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
 180         r = (*f)(unfold, &unfold2, 1, arg);
 181         if (r != 0) return r;
 182         r = (*f)(unfold2, &unfold, 1, arg);
 183         if (r != 0) return r;
 184       }
 185     }
 186
 187     i = FOLDS2_NEXT_INDEX(i);
 188   }
 189
 190   return 0;
 191 }
 192
 193 static int
 194 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
 195 {
 196   int i, j, k, n, r;
 197
 198   for (i = from; i < to; ) {
 199     OnigCodePoint* fold = FOLDS3_FOLD(i);
 200     n = FOLDS3_UNFOLDS_NUM(i);
 201     for (j = 0; j < n; j++) {
 202       OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
 203
 204       r = (*f)(unfold, fold, 3, arg);
 205       if (r != 0) return r;
 206
 207       for (k = 0; k < j; k++) {
 208         OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
 209         r = (*f)(unfold, &unfold2, 1, arg);
 210         if (r != 0) return r;
 211         r = (*f)(unfold2, &unfold, 1, arg);
 212         if (r != 0) return r;
 213       }
 214     }
 215
 216     i = FOLDS3_NEXT_INDEX(i);
 217   }
 218
 219   return 0;
 220 }
 221
 222 extern int
 223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
 224                                     OnigApplyAllCaseFoldFunc f, void* arg)
 225 {
 226   int r;
 227
 228   r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
 229   if (r != 0) return r;
 230
 231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 232   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
 233     code = 0x0131;
 234     r = (*f)(0x0049, &code, 1, arg);
 235     if (r != 0) return r;
 236     code = 0x0049;
 237     r = (*f)(0x0131, &code, 1, arg);
 238     if (r != 0) return r;
 239
 240     code = 0x0130;
 241     r = (*f)(0x0069, &code, 1, arg);
 242     if (r != 0) return r;
 243     code = 0x0069;
 244     r = (*f)(0x0130, &code, 1, arg);
 245     if (r != 0) return r;
 246   }
 247   else {
 248 #endif
 249     r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
 250     if (r != 0) return r;
 251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 252   }
 253 #endif
 254
 255   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
 256     return 0;
 257
 258   r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
 259   if (r != 0) return r;
 260
 261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 262   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
 263 #endif
 264     r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
 265     if (r != 0) return r;
 266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 267   }
 268 #endif
 269
 270   r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
 271   if (r != 0) return r;
 272
 273   return 0;
 274 }
 275
/*
 * Collect the case-fold alternatives for the text starting at p.
 *
 * Each alternative is written to items[] as (byte_len, code_len, code[]):
 * byte_len is the number of source bytes the alternative stands for and
 * code[0..code_len-1] is the replacement code point sequence.
 *
 * Without INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR in `flag`, only
 * single-code-point alternatives of the first character are produced.
 * With it, multi-code-point folds of the first character are expanded, and
 * the following one or two characters are also examined to see whether the
 * sequence is the fold of some single code point.
 *
 * Returns the number of items written.  No bound on items[] is checked
 * here; the caller must supply a large enough array.
 */
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
    OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
    OnigCaseFoldCodeItem items[])
{
  int n, m, i, j, k, len;
  OnigCodePoint code, codes[3];
  const struct ByUnfoldKey* buk;

  n = 0;

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  len = enclen(enc, p);

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkish/Azeri mode: the four I-variants each map to exactly one
     partner and bypass the generic tables. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (code == 0x0049) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0]  = 0x0131;
      return 1;
    }
    else if (code == 0x0130) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0]  = 0x0069;
      return 1;
    }
    else if (code == 0x0131) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0]  = 0x0049;
      return 1;
    }
    else if (code == 0x0069) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0]  = 0x0130;
      return 1;
    }
  }
#endif

  buk = onigenc_unicode_unfold_key(code);
  if (buk != 0) {
    /* `code` is an unfolded form. */
    if (buk->fold_len == 1) {
      int un;
      /* First item: the fold itself. */
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0]  = *FOLDS1_FOLD(buk->index);
      n++;

      /* Then every other unfold that shares this fold. */
      un = FOLDS1_UNFOLDS_NUM(buk->index);
      for (i = 0; i < un; i++) {
        OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
        if (unfold != code) {
          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0]  = unfold;
          n++;
        }
      }
      code = items[0].code[0]; /* for multi-code to unfold search. */
    }
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      /* cs[fn][] holds, for position fn of the fold sequence, the fold code
         followed by its single-code unfolds; ncs[fn] is the count.
         NOTE(review): the second dimension (4) assumes at most three
         unfolds per single-code fold entry -- not checked here; confirm
         against the generated fold data. */
      OnigCodePoint cs[3][4];
      int fn, ncs[3];

      if (buk->fold_len == 2) {
        /* Sibling single code points that fold to the same 2-code sequence. */
        m = FOLDS2_UNFOLDS_NUM(buk->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
          if (unfold == code) continue;

          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0]  = unfold;
          n++;
        }

        for (fn = 0; fn < 2; fn++) {
          int index;
          cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
          index = onigenc_unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            int m = FOLDS1_UNFOLDS_NUM(index); /* shadows the outer m */
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] = m + 1;
          }
          else
            ncs[fn] = 1;
        }

        /* Every combination of case variants of the two folded codes. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            items[n].byte_len = len;
            items[n].code_len = 2;
            items[n].code[0]  = cs[0][i];
            items[n].code[1]  = cs[1][j];
            n++;
          }
        }
      }
      else { /* fold_len == 3 */
        /* Sibling single code points that fold to the same 3-code sequence. */
        m = FOLDS3_UNFOLDS_NUM(buk->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
          if (unfold == code) continue;

          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0]  = unfold;
          n++;
        }

        for (fn = 0; fn < 3; fn++) {
          int index;
          cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
          index = onigenc_unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            int m = FOLDS1_UNFOLDS_NUM(index); /* shadows the outer m */
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] = m + 1;
          }
          else
            ncs[fn] = 1;
        }

        /* Every combination of case variants of the three folded codes. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            for (k = 0; k < ncs[2]; k++) {
              items[n].byte_len = len;
              items[n].code_len = 3;
              items[n].code[0]  = cs[0][i];
              items[n].code[1]  = cs[1][j];
              items[n].code[2]  = cs[2][k];
              n++;
            }
          }
        }
      }

      /* multi char folded code is not head of another folded multi char */
      return n;
    }
    /* A multi-code fold without the MULTI_CHAR flag adds nothing here and
       leaves through the flag check below with n unchanged. */
  }
  else {
    /* `code` has no unfold entry; it may itself be a fold target, in which
       case all of its unfolds are alternatives. */
    int index = onigenc_unicode_fold1_key(&code);
    if (index >= 0) {
      int m = FOLDS1_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        items[n].byte_len = len;
        items[n].code_len = 1;
        items[n].code[0]  = FOLDS1_UNFOLDS(index)[i];
        n++;
      }
    }
  }

  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
    return n;

  /* Multi-char lookahead: fold the next one or two characters and check
     whether the resulting sequence is the fold of a single code point.
     From here on `len` accumulates the bytes of every character consumed. */
  p += len;
  if (p < end) {
    int clen;
    int index;

    codes[0] = code;
    code = ONIGENC_MBC_TO_CODE(enc, p, end);

    /* Only single-code folds are applied to the lookahead character. */
    buk = onigenc_unicode_unfold_key(code);
    if (buk != 0 && buk->fold_len == 1) {
      codes[1] = *FOLDS1_FOLD(buk->index);
    }
    else
      codes[1] = code;

    clen = enclen(enc, p);
    len += clen;

    index = onigenc_unicode_fold2_key(codes);
    if (index >= 0) {
      m = FOLDS2_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        items[n].byte_len = len;
        items[n].code_len = 1;
        items[n].code[0]  = FOLDS2_UNFOLDS(index)[i];
        n++;
      }
    }

    p += clen;
    if (p < end) {
      code = ONIGENC_MBC_TO_CODE(enc, p, end);
      buk = onigenc_unicode_unfold_key(code);
      if (buk != 0 && buk->fold_len == 1) {
        codes[2] = *FOLDS1_FOLD(buk->index);
      }
      else
        codes[2] = code;

      clen = enclen(enc, p);
      len += clen;

      index = onigenc_unicode_fold3_key(codes);
      if (index >= 0) {
        m = FOLDS3_UNFOLDS_NUM(index);
        for (i = 0; i < m; i++) {
          items[n].byte_len = len;
          items[n].code_len = 1;
          items[n].code[0]  = FOLDS3_UNFOLDS(index)[i];
          n++;
        }
      }
    }
  }

  return n;
}
 499
 500 #ifdef USE_UNICODE_PROPERTIES
 501 #include "unicode_property_data.c"
 502 #else
 503 #include "unicode_property_data_posix.c"
 504 #endif
 505
 506
 507 #ifdef USE_UNICODE_WORD_BREAK
 508
 509 enum WB_TYPE {
 510   WB_Any = 0,
 511   WB_ALetter,
 512   WB_CR,
 513   WB_Double_Quote,
 514   WB_Extend,
 515   WB_ExtendNumLet,
 516   WB_Format,
 517   WB_Hebrew_Letter,
 518   WB_Katakana,
 519   WB_LF,
 520   WB_MidLetter,
 521   WB_MidNum,
 522   WB_MidNumLet,
 523   WB_Newline,
 524   WB_Numeric,
 525   WB_Regional_Indicator,
 526   WB_Single_Quote,
 527   WB_WSegSpace,
 528   WB_ZWJ,
 529 };
 530
/*
 * One row of the word-break range table searched by wb_get_type():
 * code points in [start, end] (both inclusive) have class `type`.
 */
typedef struct {
  OnigCodePoint start;  /* first code point of the range */
  OnigCodePoint end;    /* last code point of the range (inclusive) */
  enum WB_TYPE  type;   /* class for every code point in the range */
} WB_RANGE_TYPE;
 536
 537 #include "unicode_wb_data.c"
 538
 539 static enum WB_TYPE
 540 wb_get_type(OnigCodePoint code)
 541 {
 542   OnigCodePoint low, high, x;
 543   enum WB_TYPE type;
 544
 545   for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
 546     x = (low + high) >> 1;
 547     if (code > WB_RANGES[x].end)
 548       low = x + 1;
 549     else
 550       high = x;
 551   }
 552
 553   type = (low < (OnigCodePoint )WB_RANGE_NUM &&
 554           code >= WB_RANGES[low].start) ?
 555     WB_RANGES[low].type : WB_Any;
 556
 557   return type;
 558 }
 559
/* Class predicates used by the word-break rules below. */
/* Extend / Format / ZWJ: the classes skipped over by the WB4 scans. */
#define IS_WB_IGNORE_TAIL(t)  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
/* AHLetter = ALetter | Hebrew_Letter */
#define IS_WB_AHLetter(t)     ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
/* MidNumLetQ = MidNumLet | Single_Quote */
#define IS_WB_MidNumLetQ(t)   ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
 563
 564 static int
 565 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
 566                       OnigCodePoint* rcode, enum WB_TYPE* rtype)
 567 {
 568   OnigCodePoint code;
 569   enum WB_TYPE type;
 570
 571   while (TRUE) {
 572     p += enclen(enc, p);
 573     if (p >= end) break;
 574
 575     code = ONIGENC_MBC_TO_CODE(enc, p, end);
 576     type = wb_get_type(code);
 577     if (! IS_WB_IGNORE_TAIL(type)) {
 578       *rcode = code;
 579       *rtype = type;
 580       return 1;
 581     }
 582   }
 583
 584   return 0;
 585 }
 586
/*
 * Decide whether there is a word boundary immediately before p.
 *
 *   enc    encoding used to decode characters
 *   p      position under test
 *   prev   head of the character before p, or NULL to have it located here
 *   start  beginning of the text (limit for backward scans)
 *   end    end of the text (limit for forward scans and decoding)
 *
 * Returns TRUE for a boundary and FALSE for no boundary.  The rule labels
 * in the comments (WB1 ... WB999) follow the Unicode word boundary rules
 * (UAX #29).  `from` / `to` are the classes of the characters before and
 * at p.
 *
 * Note that several backward scans below move `prev` itself, so later
 * rules see the moved position.
 */
extern int
onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
                             const UChar* start, const UChar* end)
{
  int r;
  UChar* pp;
  OnigCodePoint cfrom;
  OnigCodePoint cfrom2;
  OnigCodePoint cto;
  OnigCodePoint cto2;
  enum WB_TYPE from;
  enum WB_TYPE from2;
  enum WB_TYPE to;
  enum WB_TYPE to2;

  /* WB1: sot / Any */
  if (p == start) return TRUE;
  /* WB2: Any / eot */
  if (p == end)   return TRUE;

  if (IS_NULL(prev)) {
    prev = onigenc_get_prev_char_head(enc, start, p);
    if (IS_NULL(prev)) return TRUE;
  }

  cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
  cto   = ONIGENC_MBC_TO_CODE(enc, p, end);

  from = wb_get_type(cfrom);
  to   = wb_get_type(cto);

  /* short cut */
  /* (0 is WB_Any: neither side has a class, so no rule below can apply) */
  if (from == 0 && to == 0) goto WB999;

  /* WB3: CR + LF */
  if (from == WB_CR && to == WB_LF) return FALSE;

  /* WB3a: (Newline|CR|LF) /  */
  if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
  /* WB3b: / (Newline|CR|LF) */
  if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;

  /* WB3c: ZWJ + {Extended_Pictographic} */
  if (from == WB_ZWJ) {
    if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
      return FALSE;
  }

  /* WB3d: WSegSpace + WSegSpace */
  if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;

  /* WB4:  X (Extend|Format|ZWJ)* -> X */
  if (IS_WB_IGNORE_TAIL(to)) return FALSE;
  if (IS_WB_IGNORE_TAIL(from)) {
    /* Step back over ignorable characters so that `from` becomes the class
       of the base character.  If `start` is reached without finding one,
       `from` keeps the class of the last ignorable character examined. */
    while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
      prev = pp;
      cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
      from = wb_get_type(cfrom);
      if (! IS_WB_IGNORE_TAIL(from))
        break;
    }
  }

  if (IS_WB_AHLetter(from)) {
    /* WB5: AHLetter + AHLetter */
    if (IS_WB_AHLetter(to)) return FALSE;

    /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
    if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (IS_WB_AHLetter(to2)) return FALSE;
      }
    }
  }

  /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
  if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
    if (IS_WB_AHLetter(to)) {
      /* Find the class of the nearest non-ignorable character before
         `from`; from2 stays WB_Any when there is none. */
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (IS_WB_AHLetter(from2)) return FALSE;
    }
  }

  if (from == WB_Hebrew_Letter) {
    /* WB7a: Hebrew_Letter + Single_Quote */
    if (to == WB_Single_Quote) return FALSE;

    /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
    if (to == WB_Double_Quote) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (to2 == WB_Hebrew_Letter) return FALSE;
      }
    }
  }

  /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
  if (from == WB_Double_Quote) {
    if (to == WB_Hebrew_Letter) {
      /* Same backward scan as WB7. */
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (from2 == WB_Hebrew_Letter) return FALSE;
    }
  }

  if (to == WB_Numeric) {
    /* WB8: Numeric + Numeric */
    if (from == WB_Numeric) return FALSE;

    /* WB9: AHLetter + Numeric */
    if (IS_WB_AHLetter(from)) return FALSE;

    /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
    if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
      /* Same backward scan as WB7. */
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (from2 == WB_Numeric) return FALSE;
    }
  }

  if (from == WB_Numeric) {
    /* WB10: Numeric + AHLetter */
    if (IS_WB_AHLetter(to)) return FALSE;

    /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
    if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (to2 == WB_Numeric) return FALSE;
      }
    }
  }

  /* WB13: Katakana + Katakana */
  if (from == WB_Katakana && to == WB_Katakana) return FALSE;

  /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
  if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
      || from == WB_ExtendNumLet) {
    if (to == WB_ExtendNumLet) return FALSE;
  }

  /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
  if (from == WB_ExtendNumLet) {
    if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
      return FALSE;
  }


  /* WB15:   sot (RI RI)* RI + RI */
  /* WB16: [^RI] (RI RI)* RI + RI */
  if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
    /* n = number of consecutive Regional_Indicator characters immediately
       before prev.  An even count means the pair around p is not split. */
    int n = 0;
    while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
      cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
      from2  = wb_get_type(cfrom2);
      if (from2 != WB_Regional_Indicator)
        break;

      n++;
    }
    if ((n % 2) == 0) return FALSE;
  }

 WB999:
  /* WB999: Any / Any */
  return TRUE;
}
 778
 779 #endif /* USE_UNICODE_WORD_BREAK */
 780
 781
 782 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
 783
/*
 * Result of the two-code-point grapheme cluster check
 * (unicode_egcb_is_break_2code).  The two UNDEF values mean the pair alone
 * cannot decide; onigenc_egcb_is_break_position() resolves them by
 * scanning backwards through the text.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK = 0,          /* no boundary between the two code points */
  EGCB_BREAK     = 1,          /* boundary between the two code points */
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ + Extended_Pictographic: depends on what precedes the ZWJ */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* two Regional_Indicators: depends on how many RIs precede */
};
 790
/*
 * Grapheme cluster break classes returned by egcb_get_type().
 *
 * The numeric values matter: IS_CONTROL_CR_LF() tests the range
 * EGCB_CR..EGCB_Control and IS_HANGUL() tests for values >= EGCB_L, so the
 * Hangul classes must stay the highest values.  Values 9-12 are left
 * unused; their former names are disabled below.
 */
enum EGCB_TYPE {
  EGCB_Other   = 0,
  EGCB_CR      = 1,
  EGCB_LF      = 2,
  EGCB_Control = 3,
  EGCB_Extend  = 4,
  EGCB_Prepend = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark = 7,
  EGCB_ZWJ         = 8,
#if 0
  /* obsoleted */
  EGCB_E_Base         = 9,
  EGCB_E_Base_GAZ     = 10,
  EGCB_E_Modifier     = 11,
  EGCB_Glue_After_Zwj = 12,
#endif
  EGCB_L   = 13,
  EGCB_LV  = 14,
  EGCB_LVT = 15,
  EGCB_T   = 16,
  EGCB_V   = 17
};
 814
/*
 * One row of the grapheme-cluster range table searched by egcb_get_type():
 * code points in [start, end] (both inclusive) have class `type`.
 */
typedef struct {
  OnigCodePoint  start;  /* first code point of the range */
  OnigCodePoint  end;    /* last code point of the range (inclusive) */
  enum EGCB_TYPE type;   /* class for every code point in the range */
} EGCB_RANGE_TYPE;
 820
 821 #include "unicode_egcb_data.c"
 822
 823 static enum EGCB_TYPE
 824 egcb_get_type(OnigCodePoint code)
 825 {
 826   OnigCodePoint low, high, x;
 827   enum EGCB_TYPE type;
 828
 829   for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
 830     x = (low + high) >> 1;
 831     if (code > EGCB_RANGES[x].end)
 832       low = x + 1;
 833     else
 834       high = x;
 835   }
 836
 837   type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
 838           code >= EGCB_RANGES[low].start) ?
 839     EGCB_RANGES[low].type : EGCB_Other;
 840
 841   return type;
 842 }
 843
/* Both predicates take an enum EGCB_TYPE value (not a code point) and rely
   on the numeric ordering of that enum. */
/* True for EGCB_CR, EGCB_LF and EGCB_Control (values 1..3). */
#define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* True for the Hangul classes L, LV, LVT, T, V (values >= EGCB_L). */
#define IS_HANGUL(code)          ((code) >= EGCB_L)
 846
 847 /* GB1 and GB2 are outside of this function. */
 848 static enum EGCB_BREAK_TYPE
 849 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
 850 {
 851   enum EGCB_TYPE from;
 852   enum EGCB_TYPE to;
 853
 854   from = egcb_get_type(from_code);
 855   to   = egcb_get_type(to_code);
 856
 857   /* short cut */
 858   if (from == 0 && to == 0) goto GB999;
 859
 860   /* GB3 */
 861   if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
 862   /* GB4 */
 863   if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
 864   /* GB5 */
 865   if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
 866
 867   if (IS_HANGUL(from) && IS_HANGUL(to)) {
 868     /* GB6 */
 869     if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
 870     /* GB7 */
 871     if ((from == EGCB_LV || from == EGCB_V)
 872         && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
 873
 874     /* GB8 */
 875     if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
 876       return EGCB_NOT_BREAK;
 877
 878     goto GB999;
 879   }
 880
 881   /* GB9 */
 882   if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
 883
 884   /* GB9a */
 885   if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
 886   /* GB9b */
 887   if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
 888
 889   /* GB10 removed */
 890
 891   /* GB11 */
 892   if (from == EGCB_ZWJ) {
 893     if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
 894       return EGCB_BREAK_UNDEF_GB11;
 895
 896     goto GB999;
 897   }
 898
 899   /* GB12, GB13 */
 900   if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
 901     return EGCB_BREAK_UNDEF_RI_RI;
 902   }
 903
 904  GB999:
 905   return EGCB_BREAK;
 906 }
 907
 908 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
 909
 910 extern int
 911 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
 912                                const UChar* start, const UChar* end)
 913 {
 914   OnigCodePoint from;
 915   OnigCodePoint to;
 916 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
 917   enum EGCB_BREAK_TYPE btype;
 918   enum EGCB_TYPE type;
 919 #endif
 920
 921   /* GB1 and GB2 */
 922   if (p == start) return 1;
 923   if (p == end)   return 1;
 924
 925   if (IS_NULL(prev)) {
 926     prev = onigenc_get_prev_char_head(enc, start, p);
 927     if (IS_NULL(prev)) return 1;
 928   }
 929
 930   from = ONIGENC_MBC_TO_CODE(enc, prev, end);
 931   to   = ONIGENC_MBC_TO_CODE(enc, p, end);
 932
 933 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
 934   if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
 935     return from != 0x000d || to != 0x000a;
 936   }
 937
 938   btype = unicode_egcb_is_break_2code(from, to);
 939   switch (btype) {
 940   case EGCB_NOT_BREAK:
 941     return 0;
 942     break;
 943   case EGCB_BREAK:
 944     return 1;
 945     break;
 946
 947   case EGCB_BREAK_UNDEF_GB11:
 948     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
 949       from = ONIGENC_MBC_TO_CODE(enc, prev, end);
 950       if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
 951         return 0;
 952
 953       type = egcb_get_type(from);
 954       if (type != EGCB_Extend)
 955         break;
 956     }
 957     break;
 958
 959   case EGCB_BREAK_UNDEF_RI_RI:
 960     {
 961       int n = 0;
 962       while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
 963         from = ONIGENC_MBC_TO_CODE(enc, prev, end);
 964         type = egcb_get_type(from);
 965         if (type != EGCB_Regional_Indicator)
 966           break;
 967
 968         n++;
 969       }
 970       if ((n % 2) == 0) return 0;
 971     }
 972     break;
 973   }
 974
 975   return 1;
 976
 977 #else
 978   return from != 0x000d || to != 0x000a;
 979 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
 980 }
 981
 982
/* Upper bound on the number of properties that can be registered through
   onig_unicode_define_user_property(). */
#define USER_DEFINED_PROPERTY_MAX_NUM  20

/* One registered user-defined property. */
typedef struct {
  int ctype;               /* ctype number: CODE_RANGES_NUM + slot index */
  OnigCodePoint* ranges;   /* caller-supplied range data; the pointer is stored, not copied */
} UserDefinedPropertyValue;

/* Number of slots of UserDefinedPropertyRanges currently in use. */
static int UserDefinedPropertyNum;
/* Fixed-size storage for registered properties, filled in registration order. */
static UserDefinedPropertyValue
UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
/* Normalized name -> UserDefinedPropertyValue*; created on first registration. */
static st_table* UserDefinedPropertyTable;
 994
 995 extern int
 996 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
 997 {
 998   UserDefinedPropertyValue* e;
 999   int r;
1000   int i;
1001   int n;
1002   int len;
1003   int c;
1004   char* s;
1005   UChar* uname;
1006
1007   if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1008     return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1009
1010   len = (int )strlen_s(name,MAX_STRING_SIZE);
1011   if (len >= PROPERTY_NAME_MAX_SIZE)
1012     return ONIGERR_TOO_LONG_PROPERTY_NAME;
1013
1014   s = (char* )xmalloc(len + 1);
1015   if (s == 0)
1016     return ONIGERR_MEMORY;
1017
1018   uname = (UChar* )name;
1019   n = 0;
1020   for (i = 0; i < len; i++) {
1021     c = uname[i];
1022     if (c < 0x20 || c >= 0x80) {
1023       xfree(s);
1024       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1025     }
1026
1027     if (c != ' ' && c != '-' && c != '_') {
1028       s[n] = c;
1029       n++;
1030     }
1031   }
1032   s[n] = '\0';
1033
1034   if (UserDefinedPropertyTable == 0) {
1035     UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1036     if (IS_NULL(UserDefinedPropertyTable)) {
1037       xfree(s);
1038       return ONIGERR_MEMORY;
1039     }
1040   }
1041
1042   e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1043   e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1044   e->ranges = ranges;
1045   r = onig_st_insert_strend(UserDefinedPropertyTable,
1046                             (const UChar* )s, (const UChar* )s + n,
1047                             (hash_data_type )((void* )e));
1048   if (r < 0) return r;
1049
1050   UserDefinedPropertyNum++;
1051   return 0;
1052 }
1053
1054 extern int
1055 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1056 {
1057   if (
1058 #ifdef USE_UNICODE_PROPERTIES
1059       ctype <= ONIGENC_MAX_STD_CTYPE &&
1060 #endif
1061       code < 256) {
1062     return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1063   }
1064
1065   if (ctype >= CODE_RANGES_NUM) {
1066     int index = ctype - CODE_RANGES_NUM;
1067     if (index < UserDefinedPropertyNum)
1068       return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1069     else
1070       return ONIGERR_TYPE_BUG;
1071   }
1072
1073   return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1074 }
1075
1076
1077 extern int
1078 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1079 {
1080   if (ctype >= CODE_RANGES_NUM) {
1081     int index = ctype - CODE_RANGES_NUM;
1082     if (index < UserDefinedPropertyNum) {
1083       *ranges = UserDefinedPropertyRanges[index].ranges;
1084       return 0;
1085     }
1086     else
1087       return ONIGERR_TYPE_BUG;
1088   }
1089
1090   *ranges = CodeRanges[ctype];
1091   return 0;
1092 }
1093
1094 extern int
1095 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1096                                       const OnigCodePoint* ranges[])
1097 {
1098   *sb_out = 0x00;
1099   return onigenc_unicode_ctype_code_range(ctype, ranges);
1100 }
1101
1102 extern int
1103 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1104 {
1105   int len;
1106   UChar *p;
1107   OnigCodePoint code;
1108   const struct PoolPropertyNameCtype* pc;
1109   char buf[PROPERTY_NAME_MAX_SIZE];
1110
1111   p = name;
1112   len = 0;
1113   while (p < end) {
1114     code = ONIGENC_MBC_TO_CODE(enc, p, end);
1115     if (code >= 0x80)
1116       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1117
1118     if (code != ' ' && code != '-' && code != '_') {
1119       buf[len++] = (char )code;
1120       if (len >= PROPERTY_NAME_MAX_SIZE)
1121         return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1122     }
1123
1124     p += enclen(enc, p);
1125   }
1126
1127   buf[len] = 0;
1128
1129   if (UserDefinedPropertyTable != 0) {
1130     UserDefinedPropertyValue* e;
1131     e = (UserDefinedPropertyValue* )NULL;
1132     onig_st_lookup_strend(UserDefinedPropertyTable,
1133                           (const UChar* )buf, (const UChar* )buf + len,
1134                           (hash_data_type* )((void* )(&e)));
1135     if (e != 0) {
1136       return e->ctype;
1137     }
1138   }
1139
1140   pc = unicode_lookup_property_name(buf, len);
1141   if (pc != 0) {
1142     /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1143 #ifndef USE_UNICODE_PROPERTIES
1144     if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1145       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1146 #endif
1147
1148     return (int )pc->ctype;
1149   }
1150
1151   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1152 }