]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Modules/unicodedata.c
AppPkg/Applications/Python/Python-2.7.10: Initial Checkin part 2/5.
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Modules / unicodedata.c
1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode 5.2 data base.
4
5 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
6
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
18
19 /* character properties */
20
/* One character-property record. Records are deduplicated: many code points
   share a record, and _getrecord_ex() maps a code point to its record via a
   two-level compressed index (see unicodedata_db.h). */
typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
32
/* Delta between the current database and an older Unicode version (used by
   the ucd_3_2_0 object). A field value of 0xFF means "unchanged";
   category_changed == 0 means the code point was unassigned in the old
   version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
44
45 static const _PyUnicode_DatabaseRecord*
46 _getrecord_ex(Py_UCS4 code)
47 {
48 int index;
49 if (code >= 0x110000)
50 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57 }
58
59 /* ------------- Previous-version API ------------------------------------- */
/* ------------- Previous-version API ------------------------------------- */
/* A UCD instance exposing an older snapshot of the database (e.g.
   unicodedata.ucd_3_2_0). Lookups go through the function pointers so the
   module-level functions can transparently apply the old deltas. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                           /* version string, e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4); /* delta lookup for a code point */
    Py_UCS4 (*normalization)(Py_UCS4);          /* old normalization exceptions */
} PreviousDBVersion;
66
/* Fetch the old-version change record for code point v through the lookup
   function bound to a PreviousDBVersion object. */
#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))

/* Read-only attributes exposed on UCD objects. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
73
74 /* forward declaration */
75 static PyTypeObject UCD_Type;
76
77 static PyObject*
78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79 Py_UCS4 (*normalization)(Py_UCS4))
80 {
81 PreviousDBVersion *self;
82 self = PyObject_New(PreviousDBVersion, &UCD_Type);
83 if (self == NULL)
84 return NULL;
85 self->name = name;
86 self->getrecord = getrecord;
87 self->normalization = normalization;
88 return (PyObject*)self;
89 }
90
91
/* Return the code point of a length-1 unicode object. On narrow builds a
   well-formed surrogate pair (length 2) also counts as one character and is
   decoded to its supplementary-plane value. Any other input sets TypeError
   and returns (Py_UCS4)-1. */
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    /* narrow build: high surrogate followed by low surrogate */
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
108
109 /* --- Module API --------------------------------------------------------- */
110
PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* decimal(unichr[, default]) -> int.
   When called on an old-version UCD object (self != NULL), the 3.2.0 delta
   overrides the current database. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;            /* set when the old DB already decided */
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned in the old version: behave like "no value" */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            /* old version had a different decimal value */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
161
162 PyDoc_STRVAR(unicodedata_digit__doc__,
163 "digit(unichr[, default])\n\
164 \n\
165 Returns the digit value assigned to the Unicode character unichr as\n\
166 integer. If no such value is defined, default is returned, or, if\n\
167 not given, ValueError is raised.");
168
169 static PyObject *
170 unicodedata_digit(PyObject *self, PyObject *args)
171 {
172 PyUnicodeObject *v;
173 PyObject *defobj = NULL;
174 long rc;
175 Py_UCS4 c;
176
177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
178 return NULL;
179 c = getuchar(v);
180 if (c == (Py_UCS4)-1)
181 return NULL;
182 rc = Py_UNICODE_TODIGIT(c);
183 if (rc < 0) {
184 if (defobj == NULL) {
185 PyErr_SetString(PyExc_ValueError, "not a digit");
186 return NULL;
187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
192 }
193 return PyInt_FromLong(rc);
194 }
195
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* numeric(unichr[, default]) -> float.
   On an old-version UCD object the 3.2.0 delta overrides the current DB. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;            /* set when the old DB already decided */
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned in the old version: behave like "no value" */
            have_old = 1;
            rc = -1.0;
        }
        /* NOTE(review): this consults decimal_changed, not the
           numeric_changed field of change_record — same as upstream
           CPython of this vintage; verify against newer releases before
           "fixing". */
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);   /* -1.0 when undefined */
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
245
246 PyDoc_STRVAR(unicodedata_category__doc__,
247 "category(unichr)\n\
248 \n\
249 Returns the general category assigned to the Unicode character\n\
250 unichr as string.");
251
252 static PyObject *
253 unicodedata_category(PyObject *self, PyObject *args)
254 {
255 PyUnicodeObject *v;
256 int index;
257 Py_UCS4 c;
258
259 if (!PyArg_ParseTuple(args, "O!:category",
260 &PyUnicode_Type, &v))
261 return NULL;
262 c = getuchar(v);
263 if (c == (Py_UCS4)-1)
264 return NULL;
265 index = (int) _getrecord_ex(c)->category;
266 if (self) {
267 const change_record *old = get_old_record(self, c);
268 if (old->category_changed != 0xFF)
269 index = old->category_changed;
270 }
271 return PyString_FromString(_PyUnicode_CategoryNames[index]);
272 }
273
274 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275 "bidirectional(unichr)\n\
276 \n\
277 Returns the bidirectional class assigned to the Unicode character\n\
278 unichr as string. If no such value is defined, an empty string is\n\
279 returned.");
280
281 static PyObject *
282 unicodedata_bidirectional(PyObject *self, PyObject *args)
283 {
284 PyUnicodeObject *v;
285 int index;
286 Py_UCS4 c;
287
288 if (!PyArg_ParseTuple(args, "O!:bidirectional",
289 &PyUnicode_Type, &v))
290 return NULL;
291 c = getuchar(v);
292 if (c == (Py_UCS4)-1)
293 return NULL;
294 index = (int) _getrecord_ex(c)->bidirectional;
295 if (self) {
296 const change_record *old = get_old_record(self, c);
297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
302 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
303 }
304
305 PyDoc_STRVAR(unicodedata_combining__doc__,
306 "combining(unichr)\n\
307 \n\
308 Returns the canonical combining class assigned to the Unicode\n\
309 character unichr as integer. Returns 0 if no combining class is\n\
310 defined.");
311
312 static PyObject *
313 unicodedata_combining(PyObject *self, PyObject *args)
314 {
315 PyUnicodeObject *v;
316 int index;
317 Py_UCS4 c;
318
319 if (!PyArg_ParseTuple(args, "O!:combining",
320 &PyUnicode_Type, &v))
321 return NULL;
322 c = getuchar(v);
323 if (c == (Py_UCS4)-1)
324 return NULL;
325 index = (int) _getrecord_ex(c)->combining;
326 if (self) {
327 const change_record *old = get_old_record(self, c);
328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return PyInt_FromLong(index);
332 }
333
334 PyDoc_STRVAR(unicodedata_mirrored__doc__,
335 "mirrored(unichr)\n\
336 \n\
337 Returns the mirrored property assigned to the Unicode character\n\
338 unichr as integer. Returns 1 if the character has been identified as\n\
339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
341 static PyObject *
342 unicodedata_mirrored(PyObject *self, PyObject *args)
343 {
344 PyUnicodeObject *v;
345 int index;
346 Py_UCS4 c;
347
348 if (!PyArg_ParseTuple(args, "O!:mirrored",
349 &PyUnicode_Type, &v))
350 return NULL;
351 c = getuchar(v);
352 if (c == (Py_UCS4)-1)
353 return NULL;
354 index = (int) _getrecord_ex(c)->mirrored;
355 if (self) {
356 const change_record *old = get_old_record(self, c);
357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
361 }
362 return PyInt_FromLong(index);
363 }
364
365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366 "east_asian_width(unichr)\n\
367 \n\
368 Returns the east asian width assigned to the Unicode character\n\
369 unichr as string.");
370
371 static PyObject *
372 unicodedata_east_asian_width(PyObject *self, PyObject *args)
373 {
374 PyUnicodeObject *v;
375 int index;
376 Py_UCS4 c;
377
378 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
379 &PyUnicode_Type, &v))
380 return NULL;
381 c = getuchar(v);
382 if (c == (Py_UCS4)-1)
383 return NULL;
384 index = (int) _getrecord_ex(c)->east_asian_width;
385 if (self) {
386 const change_record *old = get_old_record(self, c);
387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
389 }
390 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
391 }
392
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

/* decomposition(unichr) -> str, e.g. "<compat> 0020 0308" or "".
   Formats the raw decomp_data entry: optional "<prefix>" tag followed by
   space-separated 4-digit uppercase hex code points. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    /* two-level index lookup; out-of-range maps to entry 0 (no mapping) */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    /* append " XXXX" for each decomposed code point; decomp[256] is large
       enough for the longest decomposition (see nfd_nfkd's stack comment) */
    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
462
463 static void
464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
465 {
466 if (code >= 0x110000) {
467 *index = 0;
468 } else if (self && get_old_record(self, code)->category_changed==0) {
469 /* unassigned in old version */
470 *index = 0;
471 }
472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
477
478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484 }
485
/* Hangul syllable algorithmic composition/decomposition constants
   (The Unicode Standard, "Conjoining Jamo Behavior"). Note TBase is one
   BELOW the first trailing consonant: trailing index 0 means "no trailing
   consonant". */
#define SBase 0xAC00   /* first precomposed syllable */
#define LBase 0x1100   /* first leading consonant (choseong) */
#define VBase 0x1161   /* first vowel (jungseong) */
#define TBase 0x11A7   /* one before first trailing consonant (jongseong) */
#define LCount 19
#define VCount 21
#define TCount 28      /* includes the "no trailing consonant" slot */
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
495
/* Decompose `input` to NFD (k==0) or NFKD (k==1) and put combining marks
   into canonical order. Returns a new unicode object, or NULL on error.
   Recursive decomposition is driven by a small explicit stack. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* `space` tracks the unused tail of the output buffer. */
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                /* PyString_GET_SIZE just reads ob_size, so it is also
                   valid on a unicode object here. */
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                /* re-derive the write pointer after a possible move */
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {           /* T == TBase: no trailing jamo */
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes (old-version exceptions) */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically: bubble each combining mark left past any mark with
       a higher combining class (starters, class 0, act as barriers). */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
611
612 static int
613 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
614 {
615 int index;
616 for (index = 0; nfc[index].start; index++) {
617 int start = nfc[index].start;
618 if (code < start)
619 return -1;
620 if (code <= start + nfc[index].count) {
621 int delta = code - start;
622 return nfc[index].index + delta;
623 }
624 }
625 return -1;
626 }
627
628 static PyObject*
629 nfc_nfkc(PyObject *self, PyObject *input, int k)
630 {
631 PyObject *result;
632 Py_UNICODE *i, *i1, *o, *end;
633 int f,l,index,index1,comb;
634 Py_UNICODE code;
635 Py_UNICODE *skipped[20];
636 int cskipped = 0;
637
638 result = nfd_nfkd(self, input, k);
639 if (!result)
640 return NULL;
641
642 /* We are going to modify result in-place.
643 If nfd_nfkd is changed to sometimes return the input,
644 this code needs to be reviewed. */
645 assert(result != input);
646
647 i = PyUnicode_AS_UNICODE(result);
648 end = i + PyUnicode_GET_SIZE(result);
649 o = PyUnicode_AS_UNICODE(result);
650
651 again:
652 while (i < end) {
653 for (index = 0; index < cskipped; index++) {
654 if (skipped[index] == i) {
655 /* *i character is skipped.
656 Remove from list. */
657 skipped[index] = skipped[cskipped-1];
658 cskipped--;
659 i++;
660 goto again; /* continue while */
661 }
662 }
663 /* Hangul Composition. We don't need to check for <LV,T>
664 pairs, since we always have decomposed data. */
665 if (LBase <= *i && *i < (LBase+LCount) &&
666 i + 1 < end &&
667 VBase <= i[1] && i[1] <= (VBase+VCount)) {
668 int LIndex, VIndex;
669 LIndex = i[0] - LBase;
670 VIndex = i[1] - VBase;
671 code = SBase + (LIndex*VCount+VIndex)*TCount;
672 i+=2;
673 if (i < end &&
674 TBase <= *i && *i <= (TBase+TCount)) {
675 code += *i-TBase;
676 i++;
677 }
678 *o++ = code;
679 continue;
680 }
681
682 f = find_nfc_index(self, nfc_first, *i);
683 if (f == -1) {
684 *o++ = *i++;
685 continue;
686 }
687 /* Find next unblocked character. */
688 i1 = i+1;
689 comb = 0;
690 while (i1 < end) {
691 int comb1 = _getrecord_ex(*i1)->combining;
692 if (comb) {
693 if (comb1 == 0)
694 break;
695 if (comb >= comb1) {
696 /* Character is blocked. */
697 i1++;
698 continue;
699 }
700 }
701 l = find_nfc_index(self, nfc_last, *i1);
702 /* *i1 cannot be combined with *i. If *i1
703 is a starter, we don't need to look further.
704 Otherwise, record the combining class. */
705 if (l == -1) {
706 not_combinable:
707 if (comb1 == 0)
708 break;
709 comb = comb1;
710 i1++;
711 continue;
712 }
713 index = f*TOTAL_LAST + l;
714 index1 = comp_index[index >> COMP_SHIFT];
715 code = comp_data[(index1<<COMP_SHIFT)+
716 (index&((1<<COMP_SHIFT)-1))];
717 if (code == 0)
718 goto not_combinable;
719
720 /* Replace the original character. */
721 *i = code;
722 /* Mark the second character unused. */
723 assert(cskipped < 20);
724 skipped[cskipped++] = i1;
725 i1++;
726 f = find_nfc_index(self, nfc_first, *i);
727 if (f == -1)
728 break;
729 }
730 *o++ = *i++;
731 }
732 if (o != end)
733 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
734 return result;
735 }
736
737 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
738 static int
739 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
740 {
741 Py_UNICODE *i, *end;
742 unsigned char prev_combining = 0, quickcheck_mask;
743
744 /* An older version of the database is requested, quickchecks must be
745 disabled. */
746 if (self != NULL)
747 return 0;
748
749 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
750 as described in http://unicode.org/reports/tr15/#Annex8. */
751 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
752
753 i = PyUnicode_AS_UNICODE(input);
754 end = i + PyUnicode_GET_SIZE(input);
755 while (i < end) {
756 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
757 unsigned char combining = record->combining;
758 unsigned char quickcheck = record->normalization_quick_check;
759
760 if (quickcheck & quickcheck_mask)
761 return 0; /* this string might need normalization */
762 if (combining && prev_combining > combining)
763 return 0; /* non-canonical sort order, not normalized */
764 prev_combining = combining;
765 }
766 return 1; /* certainly normalized */
767 }
768
769 PyDoc_STRVAR(unicodedata_normalize__doc__,
770 "normalize(form, unistr)\n\
771 \n\
772 Return the normal form 'form' for the Unicode string unistr. Valid\n\
773 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
774
775 static PyObject*
776 unicodedata_normalize(PyObject *self, PyObject *args)
777 {
778 char *form;
779 PyObject *input;
780
781 if(!PyArg_ParseTuple(args, "sO!:normalize",
782 &form, &PyUnicode_Type, &input))
783 return NULL;
784
785 if (PyUnicode_GetSize(input) == 0) {
786 /* Special case empty input strings, since resizing
787 them later would cause internal errors. */
788 Py_INCREF(input);
789 return input;
790 }
791
792 if (strcmp(form, "NFC") == 0) {
793 if (is_normalized(self, input, 1, 0)) {
794 Py_INCREF(input);
795 return input;
796 }
797 return nfc_nfkc(self, input, 0);
798 }
799 if (strcmp(form, "NFKC") == 0) {
800 if (is_normalized(self, input, 1, 1)) {
801 Py_INCREF(input);
802 return input;
803 }
804 return nfc_nfkc(self, input, 1);
805 }
806 if (strcmp(form, "NFD") == 0) {
807 if (is_normalized(self, input, 0, 0)) {
808 Py_INCREF(input);
809 return input;
810 }
811 return nfd_nfkd(self, input, 0);
812 }
813 if (strcmp(form, "NFKD") == 0) {
814 if (is_normalized(self, input, 0, 1)) {
815 Py_INCREF(input);
816 return input;
817 }
818 return nfd_nfkd(self, input, 1);
819 }
820 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
821 return NULL;
822 }
823
824 /* -------------------------------------------------------------------- */
825 /* unicode character name tables */
826
827 /* data file generated by Tools/unicode/makeunicodedata.py */
828 #include "unicodename_db.h"
829
830 /* -------------------------------------------------------------------- */
831 /* database code (cut and pasted from the unidb package) */
832
/* Case-insensitive multiplicative hash over the first `len` bytes of `s`,
   kept within 24 bits by folding overflow back in. Must match the hash
   used by Tools/unicode/makeunicodedata.py when building code_hash. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int pos;
    unsigned long hash = 0;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
847
/* Romanized jamo names used to build/parse "HANGUL SYLLABLE <name>".
   Column 0: leading consonants (19 rows used), column 1: vowels (21),
   column 2: trailing consonants (28, first entry empty = none).
   Unused cells are 0 / "". */
static char *hangul_syllables[][3] = {
    { "G", "A", "" },
    { "GG", "AE", "G" },
    { "N", "YA", "GG" },
    { "D", "YAE", "GS" },
    { "DD", "EO", "N", },
    { "R", "E", "NJ" },
    { "M", "YEO", "NH" },
    { "B", "YE", "D" },
    { "BB", "O", "L" },
    { "S", "WA", "LG" },
    { "SS", "WAE", "LM" },
    { "", "OE", "LB" },
    { "J", "YO", "LS" },
    { "JJ", "U", "LT" },
    { "C", "WEO", "LP" },
    { "K", "WE", "LH" },
    { "T", "WI", "M" },
    { "P", "YU", "B" },
    { "H", "EU", "BS" },
    { 0, "YI", "S" },
    { 0, "I", "SS" },
    { 0, 0, "NG" },
    { 0, 0, "J" },
    { 0, 0, "C" },
    { 0, 0, "K" },
    { 0, 0, "T" },
    { 0, 0, "P" },
    { 0, 0, "H" }
};
878
879 static int
880 is_unified_ideograph(Py_UCS4 code)
881 {
882 return (
883 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
884 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
885 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
886 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */
887 }
888
/* Write the name of code point `code` into buffer (capacity buflen,
   NUL-terminated). Returns 1 on success, 0 if the character has no name or
   the buffer is too small. Hangul syllables and CJK unified ideographs are
   named algorithmically; everything else comes from the compressed
   phrasebook in unicodename_db.h. */
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned in the old version: no name */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: compose the name from the three jamo parts. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index (one or two bytes, depending on range) */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;   /* strip the terminator bit */
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
978
979 static int
980 _cmpname(PyObject *self, int code, const char* name, int namelen)
981 {
982 /* check if code corresponds to the given name */
983 int i;
984 char buffer[NAME_MAXLEN];
985 if (!_getucname(self, code, buffer, sizeof(buffer)))
986 return 0;
987 for (i = 0; i < namelen; i++) {
988 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
989 return 0;
990 }
991 return buffer[namelen] == '\0';
992 }
993
994 static void
995 find_syllable(const char *str, int *len, int *pos, int count, int column)
996 {
997 int i, len1;
998 *len = -1;
999 for (i = 0; i < count; i++) {
1000 char *s = hangul_syllables[i][column];
1001 len1 = strlen(s);
1002 if (len1 <= *len)
1003 continue;
1004 if (strncmp(str, s, len1) == 0) {
1005 *len = len1;
1006 *pos = i;
1007 }
1008 }
1009 if (*len == -1) {
1010 *len = 0;
1011 }
1012 }
1013
/* Resolve a character name to its code point; returns 1 on success with
   *code set, 0 otherwise. Hangul syllable and CJK unified ideograph names
   are decoded algorithmically; all other names go through a perfect-hash
   probe over code_hash (mirroring the lookup in
   Tools/unicode/makeunicodedata.py). */
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        /* greedily match leading consonant, vowel, trailing consonant */
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;                   /* empty slot: name unknown */
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    /* open addressing with a multiplicative probe sequence */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
1093
/* C API table exported through the ucnhash_CAPI capsule; consumed by the
   unicode object implementation to resolve \N{...} escapes. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1100
1101 /* -------------------------------------------------------------------- */
1102 /* Python bindings */
1103
1104 PyDoc_STRVAR(unicodedata_name__doc__,
1105 "name(unichr[, default])\n\
1106 Returns the name assigned to the Unicode character unichr as a\n\
1107 string. If no name is defined, default is returned, or, if not\n\
1108 given, ValueError is raised.");
1109
1110 static PyObject *
1111 unicodedata_name(PyObject* self, PyObject* args)
1112 {
1113 char name[NAME_MAXLEN];
1114 Py_UCS4 c;
1115
1116 PyUnicodeObject* v;
1117 PyObject* defobj = NULL;
1118 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1119 return NULL;
1120
1121 c = getuchar(v);
1122 if (c == (Py_UCS4)-1)
1123 return NULL;
1124
1125 if (!_getucname(self, c, name, sizeof(name))) {
1126 if (defobj == NULL) {
1127 PyErr_SetString(PyExc_ValueError, "no such name");
1128 return NULL;
1129 }
1130 else {
1131 Py_INCREF(defobj);
1132 return defobj;
1133 }
1134 }
1135
1136 return Py_BuildValue("s", name);
1137 }
1138
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

/* lookup(name) -> unicode of length 1 (length 2 on narrow builds when the
   result is a surrogate pair). */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    /* narrow build: encode supplementary-plane results as a surrogate pair */
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
1173
1174 /* XXX Add doc strings. */
1175
/* Method table shared by the module itself and by UCD instances (old
   database versions); each function takes the UCD object — or NULL for the
   module-level variant — as `self`. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1198
/* Type of previous-version database objects (unicodedata.ucd_3_2_0). */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1244
/* Module-level docstring. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.2.0 (see\n\
http://www.unicode.org/reports/tr44/tr44-4.html).");
1254
/* Module initialization: create the module, expose the UCD type, the
   3.2.0 snapshot object, and the ucnhash C API capsule. */
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    /* finish UCD_Type at runtime (portability: see comment on UCD_Type) */
    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API (consumed by unicodeobject.c for \N{...} escapes) */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
1281
1282 /*
1283 Local variables:
1284 c-basic-offset: 4
1285 indent-tabs-mode: nil
1286 End:
1287 */