+++ /dev/null
-/* ------------------------------------------------------------------------\r
-\r
- unicodedata -- Provides access to the Unicode 5.2 data base.\r
-\r
- Data was extracted from the Unicode 5.2 UnicodeData.txt file.\r
-\r
- Written by Marc-Andre Lemburg (mal@lemburg.com).\r
- Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)\r
- Modified by Martin v. Löwis (martin@v.loewis.de)\r
-\r
- Copyright (c) Corporation for National Research Initiatives.\r
-\r
- ------------------------------------------------------------------------ */\r
-\r
-#include "Python.h"\r
-#include "ucnhash.h"\r
-#include "structmember.h"\r
-\r
-/* character properties */\r
-\r
-typedef struct {\r
- const unsigned char category; /* index into\r
- _PyUnicode_CategoryNames */\r
- const unsigned char combining; /* combining class value 0 - 255 */\r
- const unsigned char bidirectional; /* index into\r
- _PyUnicode_BidirectionalNames */\r
- const unsigned char mirrored; /* true if mirrored in bidir mode */\r
- const unsigned char east_asian_width; /* index into\r
- _PyUnicode_EastAsianWidth */\r
- const unsigned char normalization_quick_check; /* see is_normalized() */\r
-} _PyUnicode_DatabaseRecord;\r
-\r
-typedef struct change_record {\r
- /* sequence of fields should be the same as in merge_old_version */\r
- const unsigned char bidir_changed;\r
- const unsigned char category_changed;\r
- const unsigned char decimal_changed;\r
- const unsigned char mirrored_changed;\r
- const double numeric_changed;\r
-} change_record;\r
-\r
-/* data file generated by Tools/unicode/makeunicodedata.py */\r
-#include "unicodedata_db.h"\r
-\r
-static const _PyUnicode_DatabaseRecord*\r
-_getrecord_ex(Py_UCS4 code)\r
-{\r
- int index;\r
- if (code >= 0x110000)\r
- index = 0;\r
- else {\r
- index = index1[(code>>SHIFT)];\r
- index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];\r
- }\r
-\r
- return &_PyUnicode_Database_Records[index];\r
-}\r
-\r
-/* ------------- Previous-version API ------------------------------------- */\r
-typedef struct previous_version {\r
- PyObject_HEAD\r
- const char *name;\r
- const change_record* (*getrecord)(Py_UCS4);\r
- Py_UCS4 (*normalization)(Py_UCS4);\r
-} PreviousDBVersion;\r
-\r
-#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))\r
-\r
-static PyMemberDef DB_members[] = {\r
- {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},\r
- {NULL}\r
-};\r
-\r
-/* forward declaration */\r
-static PyTypeObject UCD_Type;\r
-\r
-static PyObject*\r
-new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),\r
- Py_UCS4 (*normalization)(Py_UCS4))\r
-{\r
- PreviousDBVersion *self;\r
- self = PyObject_New(PreviousDBVersion, &UCD_Type);\r
- if (self == NULL)\r
- return NULL;\r
- self->name = name;\r
- self->getrecord = getrecord;\r
- self->normalization = normalization;\r
- return (PyObject*)self;\r
-}\r
-\r
-\r
-static Py_UCS4 getuchar(PyUnicodeObject *obj)\r
-{\r
- Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);\r
-\r
- if (PyUnicode_GET_SIZE(obj) == 1)\r
- return *v;\r
-#ifndef Py_UNICODE_WIDE\r
- else if ((PyUnicode_GET_SIZE(obj) == 2) &&\r
- (0xD800 <= v[0] && v[0] <= 0xDBFF) &&\r
- (0xDC00 <= v[1] && v[1] <= 0xDFFF))\r
- return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;\r
-#endif\r
- PyErr_SetString(PyExc_TypeError,\r
- "need a single Unicode character as parameter");\r
- return (Py_UCS4)-1;\r
-}\r
-\r
-/* --- Module API --------------------------------------------------------- */\r
-\r
-PyDoc_STRVAR(unicodedata_decimal__doc__,\r
-"decimal(unichr[, default])\n\\r
-\n\\r
-Returns the decimal value assigned to the Unicode character unichr\n\\r
-as integer. If no such value is defined, default is returned, or, if\n\\r
-not given, ValueError is raised.");\r
-\r
-static PyObject *\r
-unicodedata_decimal(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- PyObject *defobj = NULL;\r
- int have_old = 0;\r
- long rc;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
-\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0) {\r
- /* unassigned */\r
- have_old = 1;\r
- rc = -1;\r
- }\r
- else if (old->decimal_changed != 0xFF) {\r
- have_old = 1;\r
- rc = old->decimal_changed;\r
- }\r
- }\r
-\r
- if (!have_old)\r
- rc = Py_UNICODE_TODECIMAL(c);\r
- if (rc < 0) {\r
- if (defobj == NULL) {\r
- PyErr_SetString(PyExc_ValueError,\r
- "not a decimal");\r
- return NULL;\r
- }\r
- else {\r
- Py_INCREF(defobj);\r
- return defobj;\r
- }\r
- }\r
- return PyInt_FromLong(rc);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_digit__doc__,\r
-"digit(unichr[, default])\n\\r
-\n\\r
-Returns the digit value assigned to the Unicode character unichr as\n\\r
-integer. If no such value is defined, default is returned, or, if\n\\r
-not given, ValueError is raised.");\r
-\r
-static PyObject *\r
-unicodedata_digit(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- PyObject *defobj = NULL;\r
- long rc;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- rc = Py_UNICODE_TODIGIT(c);\r
- if (rc < 0) {\r
- if (defobj == NULL) {\r
- PyErr_SetString(PyExc_ValueError, "not a digit");\r
- return NULL;\r
- }\r
- else {\r
- Py_INCREF(defobj);\r
- return defobj;\r
- }\r
- }\r
- return PyInt_FromLong(rc);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_numeric__doc__,\r
-"numeric(unichr[, default])\n\\r
-\n\\r
-Returns the numeric value assigned to the Unicode character unichr\n\\r
-as float. If no such value is defined, default is returned, or, if\n\\r
-not given, ValueError is raised.");\r
-\r
-static PyObject *\r
-unicodedata_numeric(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- PyObject *defobj = NULL;\r
- int have_old = 0;\r
- double rc;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
-\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0) {\r
- /* unassigned */\r
- have_old = 1;\r
- rc = -1.0;\r
- }\r
- else if (old->decimal_changed != 0xFF) {\r
- have_old = 1;\r
- rc = old->decimal_changed;\r
- }\r
- }\r
-\r
- if (!have_old)\r
- rc = Py_UNICODE_TONUMERIC(c);\r
- if (rc == -1.0) {\r
- if (defobj == NULL) {\r
- PyErr_SetString(PyExc_ValueError, "not a numeric character");\r
- return NULL;\r
- }\r
- else {\r
- Py_INCREF(defobj);\r
- return defobj;\r
- }\r
- }\r
- return PyFloat_FromDouble(rc);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_category__doc__,\r
-"category(unichr)\n\\r
-\n\\r
-Returns the general category assigned to the Unicode character\n\\r
-unichr as string.");\r
-\r
-static PyObject *\r
-unicodedata_category(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- int index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:category",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- index = (int) _getrecord_ex(c)->category;\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed != 0xFF)\r
- index = old->category_changed;\r
- }\r
- return PyString_FromString(_PyUnicode_CategoryNames[index]);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_bidirectional__doc__,\r
-"bidirectional(unichr)\n\\r
-\n\\r
-Returns the bidirectional class assigned to the Unicode character\n\\r
-unichr as string. If no such value is defined, an empty string is\n\\r
-returned.");\r
-\r
-static PyObject *\r
-unicodedata_bidirectional(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- int index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:bidirectional",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- index = (int) _getrecord_ex(c)->bidirectional;\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0)\r
- index = 0; /* unassigned */\r
- else if (old->bidir_changed != 0xFF)\r
- index = old->bidir_changed;\r
- }\r
- return PyString_FromString(_PyUnicode_BidirectionalNames[index]);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_combining__doc__,\r
-"combining(unichr)\n\\r
-\n\\r
-Returns the canonical combining class assigned to the Unicode\n\\r
-character unichr as integer. Returns 0 if no combining class is\n\\r
-defined.");\r
-\r
-static PyObject *\r
-unicodedata_combining(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- int index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:combining",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- index = (int) _getrecord_ex(c)->combining;\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0)\r
- index = 0; /* unassigned */\r
- }\r
- return PyInt_FromLong(index);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_mirrored__doc__,\r
-"mirrored(unichr)\n\\r
-\n\\r
-Returns the mirrored property assigned to the Unicode character\n\\r
-unichr as integer. Returns 1 if the character has been identified as\n\\r
-a \"mirrored\" character in bidirectional text, 0 otherwise.");\r
-\r
-static PyObject *\r
-unicodedata_mirrored(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- int index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:mirrored",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- index = (int) _getrecord_ex(c)->mirrored;\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0)\r
- index = 0; /* unassigned */\r
- else if (old->mirrored_changed != 0xFF)\r
- index = old->mirrored_changed;\r
- }\r
- return PyInt_FromLong(index);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_east_asian_width__doc__,\r
-"east_asian_width(unichr)\n\\r
-\n\\r
-Returns the east asian width assigned to the Unicode character\n\\r
-unichr as string.");\r
-\r
-static PyObject *\r
-unicodedata_east_asian_width(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- int index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:east_asian_width",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
- index = (int) _getrecord_ex(c)->east_asian_width;\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0)\r
- index = 0; /* unassigned */\r
- }\r
- return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_decomposition__doc__,\r
-"decomposition(unichr)\n\\r
-\n\\r
-Returns the character decomposition mapping assigned to the Unicode\n\\r
-character unichr as string. An empty string is returned in case no\n\\r
-such mapping is defined.");\r
-\r
-static PyObject *\r
-unicodedata_decomposition(PyObject *self, PyObject *args)\r
-{\r
- PyUnicodeObject *v;\r
- char decomp[256];\r
- int code, index, count, i;\r
- unsigned int prefix_index;\r
- Py_UCS4 c;\r
-\r
- if (!PyArg_ParseTuple(args, "O!:decomposition",\r
- &PyUnicode_Type, &v))\r
- return NULL;\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
-\r
- code = (int)c;\r
-\r
- if (self) {\r
- const change_record *old = get_old_record(self, c);\r
- if (old->category_changed == 0)\r
- return PyString_FromString(""); /* unassigned */\r
- }\r
-\r
- if (code < 0 || code >= 0x110000)\r
- index = 0;\r
- else {\r
- index = decomp_index1[(code>>DECOMP_SHIFT)];\r
- index = decomp_index2[(index<<DECOMP_SHIFT)+\r
- (code&((1<<DECOMP_SHIFT)-1))];\r
- }\r
-\r
- /* high byte is number of hex bytes (usually one or two), low byte\r
- is prefix code (from*/\r
- count = decomp_data[index] >> 8;\r
-\r
- /* XXX: could allocate the PyString up front instead\r
- (strlen(prefix) + 5 * count + 1 bytes) */\r
-\r
- /* Based on how index is calculated above and decomp_data is generated\r
- from Tools/unicode/makeunicodedata.py, it should not be possible\r
- to overflow decomp_prefix. */\r
- prefix_index = decomp_data[index] & 255;\r
- assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));\r
-\r
- /* copy prefix */\r
- i = strlen(decomp_prefix[prefix_index]);\r
- memcpy(decomp, decomp_prefix[prefix_index], i);\r
-\r
- while (count-- > 0) {\r
- if (i)\r
- decomp[i++] = ' ';\r
- assert((size_t)i < sizeof(decomp));\r
- PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",\r
- decomp_data[++index]);\r
- i += strlen(decomp + i);\r
- }\r
-\r
- decomp[i] = '\0';\r
-\r
- return PyString_FromString(decomp);\r
-}\r
-\r
-static void\r
-get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)\r
-{\r
- if (code >= 0x110000) {\r
- *index = 0;\r
- } else if (self && get_old_record(self, code)->category_changed==0) {\r
- /* unassigned in old version */\r
- *index = 0;\r
- }\r
- else {\r
- *index = decomp_index1[(code>>DECOMP_SHIFT)];\r
- *index = decomp_index2[(*index<<DECOMP_SHIFT)+\r
- (code&((1<<DECOMP_SHIFT)-1))];\r
- }\r
-\r
- /* high byte is number of hex bytes (usually one or two), low byte\r
- is prefix code (from*/\r
- *count = decomp_data[*index] >> 8;\r
- *prefix = decomp_data[*index] & 255;\r
-\r
- (*index)++;\r
-}\r
-\r
-#define SBase 0xAC00\r
-#define LBase 0x1100\r
-#define VBase 0x1161\r
-#define TBase 0x11A7\r
-#define LCount 19\r
-#define VCount 21\r
-#define TCount 28\r
-#define NCount (VCount*TCount)\r
-#define SCount (LCount*NCount)\r
-\r
-static PyObject*\r
-nfd_nfkd(PyObject *self, PyObject *input, int k)\r
-{\r
- PyObject *result;\r
- Py_UNICODE *i, *end, *o;\r
- /* Longest decomposition in Unicode 3.2: U+FDFA */\r
- Py_UNICODE stack[20];\r
- Py_ssize_t space, isize;\r
- int index, prefix, count, stackptr;\r
- unsigned char prev, cur;\r
-\r
- stackptr = 0;\r
- isize = PyUnicode_GET_SIZE(input);\r
- space = isize;\r
- /* Overallocate at most 10 characters. */\r
- if (space > 10) {\r
- if (space <= PY_SSIZE_T_MAX - 10)\r
- space += 10;\r
- }\r
- else {\r
- space *= 2;\r
- }\r
- result = PyUnicode_FromUnicode(NULL, space);\r
- if (!result)\r
- return NULL;\r
- i = PyUnicode_AS_UNICODE(input);\r
- end = i + isize;\r
- o = PyUnicode_AS_UNICODE(result);\r
-\r
- while (i < end) {\r
- stack[stackptr++] = *i++;\r
- while(stackptr) {\r
- Py_UNICODE code = stack[--stackptr];\r
- /* Hangul Decomposition adds three characters in\r
- a single step, so we need at least that much room. */\r
- if (space < 3) {\r
- Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;\r
- space += 10;\r
- if (PyUnicode_Resize(&result, newsize) == -1)\r
- return NULL;\r
- o = PyUnicode_AS_UNICODE(result) + newsize - space;\r
- }\r
- /* Hangul Decomposition. */\r
- if (SBase <= code && code < (SBase+SCount)) {\r
- int SIndex = code - SBase;\r
- int L = LBase + SIndex / NCount;\r
- int V = VBase + (SIndex % NCount) / TCount;\r
- int T = TBase + SIndex % TCount;\r
- *o++ = L;\r
- *o++ = V;\r
- space -= 2;\r
- if (T != TBase) {\r
- *o++ = T;\r
- space --;\r
- }\r
- continue;\r
- }\r
- /* normalization changes */\r
- if (self) {\r
- Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);\r
- if (value != 0) {\r
- stack[stackptr++] = value;\r
- continue;\r
- }\r
- }\r
-\r
- /* Other decompositions. */\r
- get_decomp_record(self, code, &index, &prefix, &count);\r
-\r
- /* Copy character if it is not decomposable, or has a\r
- compatibility decomposition, but we do NFD. */\r
- if (!count || (prefix && !k)) {\r
- *o++ = code;\r
- space--;\r
- continue;\r
- }\r
- /* Copy decomposition onto the stack, in reverse\r
- order. */\r
- while(count) {\r
- code = decomp_data[index + (--count)];\r
- stack[stackptr++] = code;\r
- }\r
- }\r
- }\r
-\r
- /* Drop overallocation. Cannot fail. */\r
- PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);\r
-\r
- /* Sort canonically. */\r
- i = PyUnicode_AS_UNICODE(result);\r
- prev = _getrecord_ex(*i)->combining;\r
- end = i + PyUnicode_GET_SIZE(result);\r
- for (i++; i < end; i++) {\r
- cur = _getrecord_ex(*i)->combining;\r
- if (prev == 0 || cur == 0 || prev <= cur) {\r
- prev = cur;\r
- continue;\r
- }\r
- /* Non-canonical order. Need to switch *i with previous. */\r
- o = i - 1;\r
- while (1) {\r
- Py_UNICODE tmp = o[1];\r
- o[1] = o[0];\r
- o[0] = tmp;\r
- o--;\r
- if (o < PyUnicode_AS_UNICODE(result))\r
- break;\r
- prev = _getrecord_ex(*o)->combining;\r
- if (prev == 0 || prev <= cur)\r
- break;\r
- }\r
- prev = _getrecord_ex(*i)->combining;\r
- }\r
- return result;\r
-}\r
-\r
-static int\r
-find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)\r
-{\r
- int index;\r
- for (index = 0; nfc[index].start; index++) {\r
- int start = nfc[index].start;\r
- if (code < start)\r
- return -1;\r
- if (code <= start + nfc[index].count) {\r
- int delta = code - start;\r
- return nfc[index].index + delta;\r
- }\r
- }\r
- return -1;\r
-}\r
-\r
-static PyObject*\r
-nfc_nfkc(PyObject *self, PyObject *input, int k)\r
-{\r
- PyObject *result;\r
- Py_UNICODE *i, *i1, *o, *end;\r
- int f,l,index,index1,comb;\r
- Py_UNICODE code;\r
- Py_UNICODE *skipped[20];\r
- int cskipped = 0;\r
-\r
- result = nfd_nfkd(self, input, k);\r
- if (!result)\r
- return NULL;\r
-\r
- /* We are going to modify result in-place.\r
- If nfd_nfkd is changed to sometimes return the input,\r
- this code needs to be reviewed. */\r
- assert(result != input);\r
-\r
- i = PyUnicode_AS_UNICODE(result);\r
- end = i + PyUnicode_GET_SIZE(result);\r
- o = PyUnicode_AS_UNICODE(result);\r
-\r
- again:\r
- while (i < end) {\r
- for (index = 0; index < cskipped; index++) {\r
- if (skipped[index] == i) {\r
- /* *i character is skipped.\r
- Remove from list. */\r
- skipped[index] = skipped[cskipped-1];\r
- cskipped--;\r
- i++;\r
- goto again; /* continue while */\r
- }\r
- }\r
- /* Hangul Composition. We don't need to check for <LV,T>\r
- pairs, since we always have decomposed data. */\r
- if (LBase <= *i && *i < (LBase+LCount) &&\r
- i + 1 < end &&\r
- VBase <= i[1] && i[1] <= (VBase+VCount)) {\r
- int LIndex, VIndex;\r
- LIndex = i[0] - LBase;\r
- VIndex = i[1] - VBase;\r
- code = SBase + (LIndex*VCount+VIndex)*TCount;\r
- i+=2;\r
- if (i < end &&\r
- TBase <= *i && *i <= (TBase+TCount)) {\r
- code += *i-TBase;\r
- i++;\r
- }\r
- *o++ = code;\r
- continue;\r
- }\r
-\r
- f = find_nfc_index(self, nfc_first, *i);\r
- if (f == -1) {\r
- *o++ = *i++;\r
- continue;\r
- }\r
- /* Find next unblocked character. */\r
- i1 = i+1;\r
- comb = 0;\r
- while (i1 < end) {\r
- int comb1 = _getrecord_ex(*i1)->combining;\r
- if (comb) {\r
- if (comb1 == 0)\r
- break;\r
- if (comb >= comb1) {\r
- /* Character is blocked. */\r
- i1++;\r
- continue;\r
- }\r
- }\r
- l = find_nfc_index(self, nfc_last, *i1);\r
- /* *i1 cannot be combined with *i. If *i1\r
- is a starter, we don't need to look further.\r
- Otherwise, record the combining class. */\r
- if (l == -1) {\r
- not_combinable:\r
- if (comb1 == 0)\r
- break;\r
- comb = comb1;\r
- i1++;\r
- continue;\r
- }\r
- index = f*TOTAL_LAST + l;\r
- index1 = comp_index[index >> COMP_SHIFT];\r
- code = comp_data[(index1<<COMP_SHIFT)+\r
- (index&((1<<COMP_SHIFT)-1))];\r
- if (code == 0)\r
- goto not_combinable;\r
-\r
- /* Replace the original character. */\r
- *i = code;\r
- /* Mark the second character unused. */\r
- assert(cskipped < 20);\r
- skipped[cskipped++] = i1;\r
- i1++;\r
- f = find_nfc_index(self, nfc_first, *i);\r
- if (f == -1)\r
- break;\r
- }\r
- *o++ = *i++;\r
- }\r
- if (o != end)\r
- PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));\r
- return result;\r
-}\r
-\r
-/* Return 1 if the input is certainly normalized, 0 if it might not be. */\r
-static int\r
-is_normalized(PyObject *self, PyObject *input, int nfc, int k)\r
-{\r
- Py_UNICODE *i, *end;\r
- unsigned char prev_combining = 0, quickcheck_mask;\r
-\r
- /* An older version of the database is requested, quickchecks must be\r
- disabled. */\r
- if (self != NULL)\r
- return 0;\r
-\r
- /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,\r
- as described in http://unicode.org/reports/tr15/#Annex8. */\r
- quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));\r
-\r
- i = PyUnicode_AS_UNICODE(input);\r
- end = i + PyUnicode_GET_SIZE(input);\r
- while (i < end) {\r
- const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);\r
- unsigned char combining = record->combining;\r
- unsigned char quickcheck = record->normalization_quick_check;\r
-\r
- if (quickcheck & quickcheck_mask)\r
- return 0; /* this string might need normalization */\r
- if (combining && prev_combining > combining)\r
- return 0; /* non-canonical sort order, not normalized */\r
- prev_combining = combining;\r
- }\r
- return 1; /* certainly normalized */\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_normalize__doc__,\r
-"normalize(form, unistr)\n\\r
-\n\\r
-Return the normal form 'form' for the Unicode string unistr. Valid\n\\r
-values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");\r
-\r
-static PyObject*\r
-unicodedata_normalize(PyObject *self, PyObject *args)\r
-{\r
- char *form;\r
- PyObject *input;\r
-\r
- if(!PyArg_ParseTuple(args, "sO!:normalize",\r
- &form, &PyUnicode_Type, &input))\r
- return NULL;\r
-\r
- if (PyUnicode_GetSize(input) == 0) {\r
- /* Special case empty input strings, since resizing\r
- them later would cause internal errors. */\r
- Py_INCREF(input);\r
- return input;\r
- }\r
-\r
- if (strcmp(form, "NFC") == 0) {\r
- if (is_normalized(self, input, 1, 0)) {\r
- Py_INCREF(input);\r
- return input;\r
- }\r
- return nfc_nfkc(self, input, 0);\r
- }\r
- if (strcmp(form, "NFKC") == 0) {\r
- if (is_normalized(self, input, 1, 1)) {\r
- Py_INCREF(input);\r
- return input;\r
- }\r
- return nfc_nfkc(self, input, 1);\r
- }\r
- if (strcmp(form, "NFD") == 0) {\r
- if (is_normalized(self, input, 0, 0)) {\r
- Py_INCREF(input);\r
- return input;\r
- }\r
- return nfd_nfkd(self, input, 0);\r
- }\r
- if (strcmp(form, "NFKD") == 0) {\r
- if (is_normalized(self, input, 0, 1)) {\r
- Py_INCREF(input);\r
- return input;\r
- }\r
- return nfd_nfkd(self, input, 1);\r
- }\r
- PyErr_SetString(PyExc_ValueError, "invalid normalization form");\r
- return NULL;\r
-}\r
-\r
-/* -------------------------------------------------------------------- */\r
-/* unicode character name tables */\r
-\r
-/* data file generated by Tools/unicode/makeunicodedata.py */\r
-#include "unicodename_db.h"\r
-\r
-/* -------------------------------------------------------------------- */\r
-/* database code (cut and pasted from the unidb package) */\r
-\r
-static unsigned long\r
-_gethash(const char *s, int len, int scale)\r
-{\r
- int i;\r
- unsigned long h = 0;\r
- unsigned long ix;\r
- for (i = 0; i < len; i++) {\r
- h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));\r
- ix = h & 0xff000000;\r
- if (ix)\r
- h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;\r
- }\r
- return h;\r
-}\r
-\r
-static char *hangul_syllables[][3] = {\r
- { "G", "A", "" },\r
- { "GG", "AE", "G" },\r
- { "N", "YA", "GG" },\r
- { "D", "YAE", "GS" },\r
- { "DD", "EO", "N", },\r
- { "R", "E", "NJ" },\r
- { "M", "YEO", "NH" },\r
- { "B", "YE", "D" },\r
- { "BB", "O", "L" },\r
- { "S", "WA", "LG" },\r
- { "SS", "WAE", "LM" },\r
- { "", "OE", "LB" },\r
- { "J", "YO", "LS" },\r
- { "JJ", "U", "LT" },\r
- { "C", "WEO", "LP" },\r
- { "K", "WE", "LH" },\r
- { "T", "WI", "M" },\r
- { "P", "YU", "B" },\r
- { "H", "EU", "BS" },\r
- { 0, "YI", "S" },\r
- { 0, "I", "SS" },\r
- { 0, 0, "NG" },\r
- { 0, 0, "J" },\r
- { 0, 0, "C" },\r
- { 0, 0, "K" },\r
- { 0, 0, "T" },\r
- { 0, 0, "P" },\r
- { 0, 0, "H" }\r
-};\r
-\r
-static int\r
-is_unified_ideograph(Py_UCS4 code)\r
-{\r
- return (\r
- (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */\r
- (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */\r
- (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */\r
- (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */\r
-}\r
-\r
-static int\r
-_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)\r
-{\r
- int offset;\r
- int i;\r
- int word;\r
- unsigned char* w;\r
-\r
- if (code >= 0x110000)\r
- return 0;\r
-\r
- if (self) {\r
- const change_record *old = get_old_record(self, code);\r
- if (old->category_changed == 0) {\r
- /* unassigned */\r
- return 0;\r
- }\r
- }\r
-\r
- if (SBase <= code && code < SBase+SCount) {\r
- /* Hangul syllable. */\r
- int SIndex = code - SBase;\r
- int L = SIndex / NCount;\r
- int V = (SIndex % NCount) / TCount;\r
- int T = SIndex % TCount;\r
-\r
- if (buflen < 27)\r
- /* Worst case: HANGUL SYLLABLE <10chars>. */\r
- return 0;\r
- strcpy(buffer, "HANGUL SYLLABLE ");\r
- buffer += 16;\r
- strcpy(buffer, hangul_syllables[L][0]);\r
- buffer += strlen(hangul_syllables[L][0]);\r
- strcpy(buffer, hangul_syllables[V][1]);\r
- buffer += strlen(hangul_syllables[V][1]);\r
- strcpy(buffer, hangul_syllables[T][2]);\r
- buffer += strlen(hangul_syllables[T][2]);\r
- *buffer = '\0';\r
- return 1;\r
- }\r
-\r
- if (is_unified_ideograph(code)) {\r
- if (buflen < 28)\r
- /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */\r
- return 0;\r
- sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);\r
- return 1;\r
- }\r
-\r
- /* get offset into phrasebook */\r
- offset = phrasebook_offset1[(code>>phrasebook_shift)];\r
- offset = phrasebook_offset2[(offset<<phrasebook_shift) +\r
- (code&((1<<phrasebook_shift)-1))];\r
- if (!offset)\r
- return 0;\r
-\r
- i = 0;\r
-\r
- for (;;) {\r
- /* get word index */\r
- word = phrasebook[offset] - phrasebook_short;\r
- if (word >= 0) {\r
- word = (word << 8) + phrasebook[offset+1];\r
- offset += 2;\r
- } else\r
- word = phrasebook[offset++];\r
- if (i) {\r
- if (i > buflen)\r
- return 0; /* buffer overflow */\r
- buffer[i++] = ' ';\r
- }\r
- /* copy word string from lexicon. the last character in the\r
- word has bit 7 set. the last word in a string ends with\r
- 0x80 */\r
- w = lexicon + lexicon_offset[word];\r
- while (*w < 128) {\r
- if (i >= buflen)\r
- return 0; /* buffer overflow */\r
- buffer[i++] = *w++;\r
- }\r
- if (i >= buflen)\r
- return 0; /* buffer overflow */\r
- buffer[i++] = *w & 127;\r
- if (*w == 128)\r
- break; /* end of word */\r
- }\r
-\r
- return 1;\r
-}\r
-\r
-static int\r
-_cmpname(PyObject *self, int code, const char* name, int namelen)\r
-{\r
- /* check if code corresponds to the given name */\r
- int i;\r
- char buffer[NAME_MAXLEN];\r
- if (!_getucname(self, code, buffer, sizeof(buffer)))\r
- return 0;\r
- for (i = 0; i < namelen; i++) {\r
- if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])\r
- return 0;\r
- }\r
- return buffer[namelen] == '\0';\r
-}\r
-\r
-static void\r
-find_syllable(const char *str, int *len, int *pos, int count, int column)\r
-{\r
- int i, len1;\r
- *len = -1;\r
- for (i = 0; i < count; i++) {\r
- char *s = hangul_syllables[i][column];\r
- len1 = strlen(s);\r
- if (len1 <= *len)\r
- continue;\r
- if (strncmp(str, s, len1) == 0) {\r
- *len = len1;\r
- *pos = i;\r
- }\r
- }\r
- if (*len == -1) {\r
- *len = 0;\r
- }\r
-}\r
-\r
-static int\r
-_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)\r
-{\r
- unsigned int h, v;\r
- unsigned int mask = code_size-1;\r
- unsigned int i, incr;\r
-\r
- /* Check for hangul syllables. */\r
- if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {\r
- int len, L = -1, V = -1, T = -1;\r
- const char *pos = name + 16;\r
- find_syllable(pos, &len, &L, LCount, 0);\r
- pos += len;\r
- find_syllable(pos, &len, &V, VCount, 1);\r
- pos += len;\r
- find_syllable(pos, &len, &T, TCount, 2);\r
- pos += len;\r
- if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {\r
- *code = SBase + (L*VCount+V)*TCount + T;\r
- return 1;\r
- }\r
- /* Otherwise, it's an illegal syllable name. */\r
- return 0;\r
- }\r
-\r
- /* Check for unified ideographs. */\r
- if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {\r
- /* Four or five hexdigits must follow. */\r
- v = 0;\r
- name += 22;\r
- namelen -= 22;\r
- if (namelen != 4 && namelen != 5)\r
- return 0;\r
- while (namelen--) {\r
- v *= 16;\r
- if (*name >= '0' && *name <= '9')\r
- v += *name - '0';\r
- else if (*name >= 'A' && *name <= 'F')\r
- v += *name - 'A' + 10;\r
- else\r
- return 0;\r
- name++;\r
- }\r
- if (!is_unified_ideograph(v))\r
- return 0;\r
- *code = v;\r
- return 1;\r
- }\r
-\r
- /* the following is the same as python's dictionary lookup, with\r
- only minor changes. see the makeunicodedata script for more\r
- details */\r
-\r
- h = (unsigned int) _gethash(name, namelen, code_magic);\r
- i = (~h) & mask;\r
- v = code_hash[i];\r
- if (!v)\r
- return 0;\r
- if (_cmpname(self, v, name, namelen)) {\r
- *code = v;\r
- return 1;\r
- }\r
- incr = (h ^ (h >> 3)) & mask;\r
- if (!incr)\r
- incr = mask;\r
- for (;;) {\r
- i = (i + incr) & mask;\r
- v = code_hash[i];\r
- if (!v)\r
- return 0;\r
- if (_cmpname(self, v, name, namelen)) {\r
- *code = v;\r
- return 1;\r
- }\r
- incr = incr << 1;\r
- if (incr > mask)\r
- incr = incr ^ code_poly;\r
- }\r
-}\r
-\r
-static const _PyUnicode_Name_CAPI hashAPI =\r
-{\r
- sizeof(_PyUnicode_Name_CAPI),\r
- _getucname,\r
- _getcode\r
-};\r
-\r
-/* -------------------------------------------------------------------- */\r
-/* Python bindings */\r
-\r
-PyDoc_STRVAR(unicodedata_name__doc__,\r
-"name(unichr[, default])\n\\r
-Returns the name assigned to the Unicode character unichr as a\n\\r
-string. If no name is defined, default is returned, or, if not\n\\r
-given, ValueError is raised.");\r
-\r
-static PyObject *\r
-unicodedata_name(PyObject* self, PyObject* args)\r
-{\r
- char name[NAME_MAXLEN];\r
- Py_UCS4 c;\r
-\r
- PyUnicodeObject* v;\r
- PyObject* defobj = NULL;\r
- if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))\r
- return NULL;\r
-\r
- c = getuchar(v);\r
- if (c == (Py_UCS4)-1)\r
- return NULL;\r
-\r
- if (!_getucname(self, c, name, sizeof(name))) {\r
- if (defobj == NULL) {\r
- PyErr_SetString(PyExc_ValueError, "no such name");\r
- return NULL;\r
- }\r
- else {\r
- Py_INCREF(defobj);\r
- return defobj;\r
- }\r
- }\r
-\r
- return Py_BuildValue("s", name);\r
-}\r
-\r
-PyDoc_STRVAR(unicodedata_lookup__doc__,\r
-"lookup(name)\n\\r
-\n\\r
-Look up character by name. If a character with the\n\\r
-given name is found, return the corresponding Unicode\n\\r
-character. If not found, KeyError is raised.");\r
-\r
-static PyObject *\r
-unicodedata_lookup(PyObject* self, PyObject* args)\r
-{\r
- Py_UCS4 code;\r
- Py_UNICODE str[2];\r
-\r
- char* name;\r
- int namelen;\r
- if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))\r
- return NULL;\r
-\r
- if (!_getcode(self, name, namelen, &code)) {\r
- PyErr_Format(PyExc_KeyError, "undefined character name '%s'",\r
- name);\r
- return NULL;\r
- }\r
-\r
-#ifndef Py_UNICODE_WIDE\r
- if (code >= 0x10000) {\r
- str[0] = 0xd800 + ((code - 0x10000) >> 10);\r
- str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);\r
- return PyUnicode_FromUnicode(str, 2);\r
- }\r
-#endif\r
- str[0] = (Py_UNICODE) code;\r
- return PyUnicode_FromUnicode(str, 1);\r
-}\r
-\r
-/* XXX Add doc strings. */\r
-\r
-static PyMethodDef unicodedata_functions[] = {\r
- {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},\r
- {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},\r
- {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},\r
- {"category", unicodedata_category, METH_VARARGS,\r
- unicodedata_category__doc__},\r
- {"bidirectional", unicodedata_bidirectional, METH_VARARGS,\r
- unicodedata_bidirectional__doc__},\r
- {"combining", unicodedata_combining, METH_VARARGS,\r
- unicodedata_combining__doc__},\r
- {"mirrored", unicodedata_mirrored, METH_VARARGS,\r
- unicodedata_mirrored__doc__},\r
- {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,\r
- unicodedata_east_asian_width__doc__},\r
- {"decomposition", unicodedata_decomposition, METH_VARARGS,\r
- unicodedata_decomposition__doc__},\r
- {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},\r
- {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},\r
- {"normalize", unicodedata_normalize, METH_VARARGS,\r
- unicodedata_normalize__doc__},\r
- {NULL, NULL} /* sentinel */\r
-};\r
-\r
-static PyTypeObject UCD_Type = {\r
- /* The ob_type field must be initialized in the module init function\r
- * to be portable to Windows without using C++. */\r
- PyVarObject_HEAD_INIT(NULL, 0)\r
- "unicodedata.UCD", /*tp_name*/\r
- sizeof(PreviousDBVersion), /*tp_basicsize*/\r
- 0, /*tp_itemsize*/\r
- /* methods */\r
- (destructor)PyObject_Del, /*tp_dealloc*/\r
- 0, /*tp_print*/\r
- 0, /*tp_getattr*/\r
- 0, /*tp_setattr*/\r
- 0, /*tp_compare*/\r
- 0, /*tp_repr*/\r
- 0, /*tp_as_number*/\r
- 0, /*tp_as_sequence*/\r
- 0, /*tp_as_mapping*/\r
- 0, /*tp_hash*/\r
- 0, /*tp_call*/\r
- 0, /*tp_str*/\r
- PyObject_GenericGetAttr,/*tp_getattro*/\r
- 0, /*tp_setattro*/\r
- 0, /*tp_as_buffer*/\r
- Py_TPFLAGS_DEFAULT, /*tp_flags*/\r
- 0, /*tp_doc*/\r
- 0, /*tp_traverse*/\r
- 0, /*tp_clear*/\r
- 0, /*tp_richcompare*/\r
- 0, /*tp_weaklistoffset*/\r
- 0, /*tp_iter*/\r
- 0, /*tp_iternext*/\r
- unicodedata_functions, /*tp_methods*/\r
- DB_members, /*tp_members*/\r
- 0, /*tp_getset*/\r
- 0, /*tp_base*/\r
- 0, /*tp_dict*/\r
- 0, /*tp_descr_get*/\r
- 0, /*tp_descr_set*/\r
- 0, /*tp_dictoffset*/\r
- 0, /*tp_init*/\r
- 0, /*tp_alloc*/\r
- 0, /*tp_new*/\r
- 0, /*tp_free*/\r
- 0, /*tp_is_gc*/\r
-};\r
-\r
-PyDoc_STRVAR(unicodedata_docstring,\r
-"This module provides access to the Unicode Character Database which\n\\r
-defines character properties for all Unicode characters. The data in\n\\r
-this database is based on the UnicodeData.txt file version\n\\r
-5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\\r
-\n\\r
-The module uses the same names and symbols as defined by the\n\\r
-UnicodeData File Format 5.2.0 (see\n\\r
-http://www.unicode.org/reports/tr44/tr44-4.html).");\r
-\r
-PyMODINIT_FUNC\r
-initunicodedata(void)\r
-{\r
- PyObject *m, *v;\r
-\r
- Py_TYPE(&UCD_Type) = &PyType_Type;\r
-\r
- m = Py_InitModule3(\r
- "unicodedata", unicodedata_functions, unicodedata_docstring);\r
- if (!m)\r
- return;\r
-\r
- PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);\r
- Py_INCREF(&UCD_Type);\r
- PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);\r
-\r
- /* Previous versions */\r
- v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);\r
- if (v != NULL)\r
- PyModule_AddObject(m, "ucd_3_2_0", v);\r
-\r
- /* Export C API */\r
- v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);\r
- if (v != NULL)\r
- PyModule_AddObject(m, "ucnhash_CAPI", v);\r
-}\r
-\r
-/*\r
-Local variables:\r
-c-basic-offset: 4\r
-indent-tabs-mode: nil\r
-End:\r
-*/\r