1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 5.2 data base.
5 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
17 #include "structmember.h"
19 /* character properties */
22 const unsigned char category
; /* index into
23 _PyUnicode_CategoryNames */
24 const unsigned char combining
; /* combining class value 0 - 255 */
25 const unsigned char bidirectional
; /* index into
26 _PyUnicode_BidirectionalNames */
27 const unsigned char mirrored
; /* true if mirrored in bidir mode */
28 const unsigned char east_asian_width
; /* index into
29 _PyUnicode_EastAsianWidth */
30 const unsigned char normalization_quick_check
; /* see is_normalized() */
31 } _PyUnicode_DatabaseRecord
;
/* Describes how one character's properties differed in an older version
   of the database.  A field value of 0xFF means "unchanged" (see the
   `!= 0xFF` checks in the lookup functions below). */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
45 static const _PyUnicode_DatabaseRecord
*
46 _getrecord_ex(Py_UCS4 code
)
52 index
= index1
[(code
>>SHIFT
)];
53 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
56 return &_PyUnicode_Database_Records
[index
];
59 /* ------------- Previous-version API ------------------------------------- */
60 typedef struct previous_version
{
63 const change_record
* (*getrecord
)(Py_UCS4
);
64 Py_UCS4 (*normalization
)(Py_UCS4
);
67 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 static PyMemberDef DB_members
[] = {
70 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
74 /* forward declaration */
75 static PyTypeObject UCD_Type
;
78 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
79 Py_UCS4 (*normalization
)(Py_UCS4
))
81 PreviousDBVersion
*self
;
82 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
86 self
->getrecord
= getrecord
;
87 self
->normalization
= normalization
;
88 return (PyObject
*)self
;
92 static Py_UCS4
getuchar(PyUnicodeObject
*obj
)
94 Py_UNICODE
*v
= PyUnicode_AS_UNICODE(obj
);
96 if (PyUnicode_GET_SIZE(obj
) == 1)
98 #ifndef Py_UNICODE_WIDE
99 else if ((PyUnicode_GET_SIZE(obj
) == 2) &&
100 (0xD800 <= v
[0] && v
[0] <= 0xDBFF) &&
101 (0xDC00 <= v
[1] && v
[1] <= 0xDFFF))
102 return (((v
[0] & 0x3FF)<<10) | (v
[1] & 0x3FF)) + 0x10000;
104 PyErr_SetString(PyExc_TypeError
,
105 "need a single Unicode character as parameter");
109 /* --- Module API --------------------------------------------------------- */
111 PyDoc_STRVAR(unicodedata_decimal__doc__
,
112 "decimal(unichr[, default])\n\
114 Returns the decimal value assigned to the Unicode character unichr\n\
115 as integer. If no such value is defined, default is returned, or, if\n\
116 not given, ValueError is raised.");
119 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
122 PyObject
*defobj
= NULL
;
127 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
130 if (c
== (Py_UCS4
)-1)
134 const change_record
*old
= get_old_record(self
, c
);
135 if (old
->category_changed
== 0) {
140 else if (old
->decimal_changed
!= 0xFF) {
142 rc
= old
->decimal_changed
;
147 rc
= Py_UNICODE_TODECIMAL(c
);
149 if (defobj
== NULL
) {
150 PyErr_SetString(PyExc_ValueError
,
159 return PyInt_FromLong(rc
);
162 PyDoc_STRVAR(unicodedata_digit__doc__
,
163 "digit(unichr[, default])\n\
165 Returns the digit value assigned to the Unicode character unichr as\n\
166 integer. If no such value is defined, default is returned, or, if\n\
167 not given, ValueError is raised.");
170 unicodedata_digit(PyObject
*self
, PyObject
*args
)
173 PyObject
*defobj
= NULL
;
177 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
180 if (c
== (Py_UCS4
)-1)
182 rc
= Py_UNICODE_TODIGIT(c
);
184 if (defobj
== NULL
) {
185 PyErr_SetString(PyExc_ValueError
, "not a digit");
193 return PyInt_FromLong(rc
);
196 PyDoc_STRVAR(unicodedata_numeric__doc__
,
197 "numeric(unichr[, default])\n\
199 Returns the numeric value assigned to the Unicode character unichr\n\
200 as float. If no such value is defined, default is returned, or, if\n\
201 not given, ValueError is raised.");
204 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
207 PyObject
*defobj
= NULL
;
212 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
215 if (c
== (Py_UCS4
)-1)
219 const change_record
*old
= get_old_record(self
, c
);
220 if (old
->category_changed
== 0) {
225 else if (old
->decimal_changed
!= 0xFF) {
227 rc
= old
->decimal_changed
;
232 rc
= Py_UNICODE_TONUMERIC(c
);
234 if (defobj
== NULL
) {
235 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
243 return PyFloat_FromDouble(rc
);
246 PyDoc_STRVAR(unicodedata_category__doc__
,
249 Returns the general category assigned to the Unicode character\n\
253 unicodedata_category(PyObject
*self
, PyObject
*args
)
259 if (!PyArg_ParseTuple(args
, "O!:category",
260 &PyUnicode_Type
, &v
))
263 if (c
== (Py_UCS4
)-1)
265 index
= (int) _getrecord_ex(c
)->category
;
267 const change_record
*old
= get_old_record(self
, c
);
268 if (old
->category_changed
!= 0xFF)
269 index
= old
->category_changed
;
271 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
274 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
275 "bidirectional(unichr)\n\
277 Returns the bidirectional class assigned to the Unicode character\n\
278 unichr as string. If no such value is defined, an empty string is\n\
282 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
288 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
289 &PyUnicode_Type
, &v
))
292 if (c
== (Py_UCS4
)-1)
294 index
= (int) _getrecord_ex(c
)->bidirectional
;
296 const change_record
*old
= get_old_record(self
, c
);
297 if (old
->category_changed
== 0)
298 index
= 0; /* unassigned */
299 else if (old
->bidir_changed
!= 0xFF)
300 index
= old
->bidir_changed
;
302 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
305 PyDoc_STRVAR(unicodedata_combining__doc__
,
306 "combining(unichr)\n\
308 Returns the canonical combining class assigned to the Unicode\n\
309 character unichr as integer. Returns 0 if no combining class is\n\
313 unicodedata_combining(PyObject
*self
, PyObject
*args
)
319 if (!PyArg_ParseTuple(args
, "O!:combining",
320 &PyUnicode_Type
, &v
))
323 if (c
== (Py_UCS4
)-1)
325 index
= (int) _getrecord_ex(c
)->combining
;
327 const change_record
*old
= get_old_record(self
, c
);
328 if (old
->category_changed
== 0)
329 index
= 0; /* unassigned */
331 return PyInt_FromLong(index
);
334 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
337 Returns the mirrored property assigned to the Unicode character\n\
338 unichr as integer. Returns 1 if the character has been identified as\n\
339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
342 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
348 if (!PyArg_ParseTuple(args
, "O!:mirrored",
349 &PyUnicode_Type
, &v
))
352 if (c
== (Py_UCS4
)-1)
354 index
= (int) _getrecord_ex(c
)->mirrored
;
356 const change_record
*old
= get_old_record(self
, c
);
357 if (old
->category_changed
== 0)
358 index
= 0; /* unassigned */
359 else if (old
->mirrored_changed
!= 0xFF)
360 index
= old
->mirrored_changed
;
362 return PyInt_FromLong(index
);
365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
366 "east_asian_width(unichr)\n\
368 Returns the east asian width assigned to the Unicode character\n\
372 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
378 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
379 &PyUnicode_Type
, &v
))
382 if (c
== (Py_UCS4
)-1)
384 index
= (int) _getrecord_ex(c
)->east_asian_width
;
386 const change_record
*old
= get_old_record(self
, c
);
387 if (old
->category_changed
== 0)
388 index
= 0; /* unassigned */
390 return PyString_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
393 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
394 "decomposition(unichr)\n\
396 Returns the character decomposition mapping assigned to the Unicode\n\
397 character unichr as string. An empty string is returned in case no\n\
398 such mapping is defined.");
401 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
405 int code
, index
, count
, i
;
406 unsigned int prefix_index
;
409 if (!PyArg_ParseTuple(args
, "O!:decomposition",
410 &PyUnicode_Type
, &v
))
413 if (c
== (Py_UCS4
)-1)
419 const change_record
*old
= get_old_record(self
, c
);
420 if (old
->category_changed
== 0)
421 return PyString_FromString(""); /* unassigned */
424 if (code
< 0 || code
>= 0x110000)
427 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
428 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
429 (code
&((1<<DECOMP_SHIFT
)-1))];
432 /* high byte is number of hex bytes (usually one or two), low byte
433 is prefix code (from*/
434 count
= decomp_data
[index
] >> 8;
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index
= decomp_data
[index
] & 255;
443 assert(prefix_index
< (sizeof(decomp_prefix
)/sizeof(*decomp_prefix
)));
446 i
= strlen(decomp_prefix
[prefix_index
]);
447 memcpy(decomp
, decomp_prefix
[prefix_index
], i
);
449 while (count
-- > 0) {
452 assert((size_t)i
< sizeof(decomp
));
453 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
454 decomp_data
[++index
]);
455 i
+= strlen(decomp
+ i
);
460 return PyString_FromString(decomp
);
464 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
466 if (code
>= 0x110000) {
468 } else if (self
&& get_old_record(self
, code
)->category_changed
==0) {
469 /* unassigned in old version */
473 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
474 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
475 (code
&((1<<DECOMP_SHIFT
)-1))];
478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count
= decomp_data
[*index
] >> 8;
481 *prefix
= decomp_data
[*index
] & 255;
493 #define NCount (VCount*TCount)
494 #define SCount (LCount*NCount)
497 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
500 Py_UNICODE
*i
, *end
, *o
;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
502 Py_UNICODE stack
[20];
503 Py_ssize_t space
, isize
;
504 int index
, prefix
, count
, stackptr
;
505 unsigned char prev
, cur
;
508 isize
= PyUnicode_GET_SIZE(input
);
510 /* Overallocate at most 10 characters. */
512 if (space
<= PY_SSIZE_T_MAX
- 10)
518 result
= PyUnicode_FromUnicode(NULL
, space
);
521 i
= PyUnicode_AS_UNICODE(input
);
523 o
= PyUnicode_AS_UNICODE(result
);
526 stack
[stackptr
++] = *i
++;
528 Py_UNICODE code
= stack
[--stackptr
];
529 /* Hangul Decomposition adds three characters in
530 a single step, so we need at least that much room. */
532 Py_ssize_t newsize
= PyString_GET_SIZE(result
) + 10;
534 if (PyUnicode_Resize(&result
, newsize
) == -1)
536 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
538 /* Hangul Decomposition. */
539 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
540 int SIndex
= code
- SBase
;
541 int L
= LBase
+ SIndex
/ NCount
;
542 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
543 int T
= TBase
+ SIndex
% TCount
;
553 /* normalization changes */
555 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
557 stack
[stackptr
++] = value
;
562 /* Other decompositions. */
563 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
565 /* Copy character if it is not decomposable, or has a
566 compatibility decomposition, but we do NFD. */
567 if (!count
|| (prefix
&& !k
)) {
572 /* Copy decomposition onto the stack, in reverse
575 code
= decomp_data
[index
+ (--count
)];
576 stack
[stackptr
++] = code
;
581 /* Drop overallocation. Cannot fail. */
582 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
584 /* Sort canonically. */
585 i
= PyUnicode_AS_UNICODE(result
);
586 prev
= _getrecord_ex(*i
)->combining
;
587 end
= i
+ PyUnicode_GET_SIZE(result
);
588 for (i
++; i
< end
; i
++) {
589 cur
= _getrecord_ex(*i
)->combining
;
590 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
594 /* Non-canonical order. Need to switch *i with previous. */
597 Py_UNICODE tmp
= o
[1];
601 if (o
< PyUnicode_AS_UNICODE(result
))
603 prev
= _getrecord_ex(*o
)->combining
;
604 if (prev
== 0 || prev
<= cur
)
607 prev
= _getrecord_ex(*i
)->combining
;
613 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
616 for (index
= 0; nfc
[index
].start
; index
++) {
617 int start
= nfc
[index
].start
;
620 if (code
<= start
+ nfc
[index
].count
) {
621 int delta
= code
- start
;
622 return nfc
[index
].index
+ delta
;
629 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
632 Py_UNICODE
*i
, *i1
, *o
, *end
;
633 int f
,l
,index
,index1
,comb
;
635 Py_UNICODE
*skipped
[20];
638 result
= nfd_nfkd(self
, input
, k
);
642 /* We are going to modify result in-place.
643 If nfd_nfkd is changed to sometimes return the input,
644 this code needs to be reviewed. */
645 assert(result
!= input
);
647 i
= PyUnicode_AS_UNICODE(result
);
648 end
= i
+ PyUnicode_GET_SIZE(result
);
649 o
= PyUnicode_AS_UNICODE(result
);
653 for (index
= 0; index
< cskipped
; index
++) {
654 if (skipped
[index
] == i
) {
655 /* *i character is skipped.
657 skipped
[index
] = skipped
[cskipped
-1];
660 goto again
; /* continue while */
663 /* Hangul Composition. We don't need to check for <LV,T>
664 pairs, since we always have decomposed data. */
665 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
667 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
669 LIndex
= i
[0] - LBase
;
670 VIndex
= i
[1] - VBase
;
671 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
674 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
682 f
= find_nfc_index(self
, nfc_first
, *i
);
687 /* Find next unblocked character. */
691 int comb1
= _getrecord_ex(*i1
)->combining
;
696 /* Character is blocked. */
701 l
= find_nfc_index(self
, nfc_last
, *i1
);
702 /* *i1 cannot be combined with *i. If *i1
703 is a starter, we don't need to look further.
704 Otherwise, record the combining class. */
713 index
= f
*TOTAL_LAST
+ l
;
714 index1
= comp_index
[index
>> COMP_SHIFT
];
715 code
= comp_data
[(index1
<<COMP_SHIFT
)+
716 (index
&((1<<COMP_SHIFT
)-1))];
720 /* Replace the original character. */
722 /* Mark the second character unused. */
723 assert(cskipped
< 20);
724 skipped
[cskipped
++] = i1
;
726 f
= find_nfc_index(self
, nfc_first
, *i
);
733 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
737 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
739 is_normalized(PyObject
*self
, PyObject
*input
, int nfc
, int k
)
742 unsigned char prev_combining
= 0, quickcheck_mask
;
744 /* An older version of the database is requested, quickchecks must be
749 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
750 as described in http://unicode.org/reports/tr15/#Annex8. */
751 quickcheck_mask
= 3 << ((nfc
? 4 : 0) + (k
? 2 : 0));
753 i
= PyUnicode_AS_UNICODE(input
);
754 end
= i
+ PyUnicode_GET_SIZE(input
);
756 const _PyUnicode_DatabaseRecord
*record
= _getrecord_ex(*i
++);
757 unsigned char combining
= record
->combining
;
758 unsigned char quickcheck
= record
->normalization_quick_check
;
760 if (quickcheck
& quickcheck_mask
)
761 return 0; /* this string might need normalization */
762 if (combining
&& prev_combining
> combining
)
763 return 0; /* non-canonical sort order, not normalized */
764 prev_combining
= combining
;
766 return 1; /* certainly normalized */
769 PyDoc_STRVAR(unicodedata_normalize__doc__
,
770 "normalize(form, unistr)\n\
772 Return the normal form 'form' for the Unicode string unistr. Valid\n\
773 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
776 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
781 if(!PyArg_ParseTuple(args
, "sO!:normalize",
782 &form
, &PyUnicode_Type
, &input
))
785 if (PyUnicode_GetSize(input
) == 0) {
786 /* Special case empty input strings, since resizing
787 them later would cause internal errors. */
792 if (strcmp(form
, "NFC") == 0) {
793 if (is_normalized(self
, input
, 1, 0)) {
797 return nfc_nfkc(self
, input
, 0);
799 if (strcmp(form
, "NFKC") == 0) {
800 if (is_normalized(self
, input
, 1, 1)) {
804 return nfc_nfkc(self
, input
, 1);
806 if (strcmp(form
, "NFD") == 0) {
807 if (is_normalized(self
, input
, 0, 0)) {
811 return nfd_nfkd(self
, input
, 0);
813 if (strcmp(form
, "NFKD") == 0) {
814 if (is_normalized(self
, input
, 0, 1)) {
818 return nfd_nfkd(self
, input
, 1);
820 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
824 /* -------------------------------------------------------------------- */
825 /* unicode character name tables */
827 /* data file generated by Tools/unicode/makeunicodedata.py */
828 #include "unicodename_db.h"
830 /* -------------------------------------------------------------------- */
831 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash over `s[0..len)`, folded to 24
   bits; must match the hash used by makeunicodedata.py. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Romanized Hangul jamo names: column 0 = leading consonant (L),
   column 1 = vowel (V), column 2 = trailing consonant (T). */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D", "YAE", "GS" },
    { "DD", "EO", "N", },
    { "R",  "E",   "NJ" },
    { "M", "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C", "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { "",   "YI",  "S"  },
    { "",   "I",   "SS" },
    { "",   "",    "NG" },
    { "",   "",    "J"  },
    { "",   "",    "C"  },
    { "",   "",    "K"  },
    { "",   "",    "T"  },
    { "",   "",    "P"  },
    { "",   "",    "H"  }
};
880 is_unified_ideograph(Py_UCS4 code
)
883 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
884 (0x4E00 <= code
&& code
<= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
885 (0x20000 <= code
&& code
<= 0x2A6D6) || /* CJK Ideograph Extension B */
886 (0x2A700 <= code
&& code
<= 0x2B734)); /* CJK Ideograph Extension C */
890 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
897 if (code
>= 0x110000)
901 const change_record
*old
= get_old_record(self
, code
);
902 if (old
->category_changed
== 0) {
908 if (SBase
<= code
&& code
< SBase
+SCount
) {
909 /* Hangul syllable. */
910 int SIndex
= code
- SBase
;
911 int L
= SIndex
/ NCount
;
912 int V
= (SIndex
% NCount
) / TCount
;
913 int T
= SIndex
% TCount
;
916 /* Worst case: HANGUL SYLLABLE <10chars>. */
918 strcpy(buffer
, "HANGUL SYLLABLE ");
920 strcpy(buffer
, hangul_syllables
[L
][0]);
921 buffer
+= strlen(hangul_syllables
[L
][0]);
922 strcpy(buffer
, hangul_syllables
[V
][1]);
923 buffer
+= strlen(hangul_syllables
[V
][1]);
924 strcpy(buffer
, hangul_syllables
[T
][2]);
925 buffer
+= strlen(hangul_syllables
[T
][2]);
930 if (is_unified_ideograph(code
)) {
932 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
934 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
938 /* get offset into phrasebook */
939 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
940 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
941 (code
&((1<<phrasebook_shift
)-1))];
949 word
= phrasebook
[offset
] - phrasebook_short
;
951 word
= (word
<< 8) + phrasebook
[offset
+1];
954 word
= phrasebook
[offset
++];
957 return 0; /* buffer overflow */
960 /* copy word string from lexicon. the last character in the
961 word has bit 7 set. the last word in a string ends with
963 w
= lexicon
+ lexicon_offset
[word
];
966 return 0; /* buffer overflow */
970 return 0; /* buffer overflow */
971 buffer
[i
++] = *w
& 127;
973 break; /* end of word */
980 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
982 /* check if code corresponds to the given name */
984 char buffer
[NAME_MAXLEN
];
985 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
987 for (i
= 0; i
< namelen
; i
++) {
988 if (Py_TOUPPER(Py_CHARMASK(name
[i
])) != buffer
[i
])
991 return buffer
[namelen
] == '\0';
995 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
999 for (i
= 0; i
< count
; i
++) {
1000 char *s
= hangul_syllables
[i
][column
];
1004 if (strncmp(str
, s
, len1
) == 0) {
1015 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
1018 unsigned int mask
= code_size
-1;
1019 unsigned int i
, incr
;
1021 /* Check for hangul syllables. */
1022 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
1023 int len
, L
= -1, V
= -1, T
= -1;
1024 const char *pos
= name
+ 16;
1025 find_syllable(pos
, &len
, &L
, LCount
, 0);
1027 find_syllable(pos
, &len
, &V
, VCount
, 1);
1029 find_syllable(pos
, &len
, &T
, TCount
, 2);
1031 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
1032 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
1035 /* Otherwise, it's an illegal syllable name. */
1039 /* Check for unified ideographs. */
1040 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1041 /* Four or five hexdigits must follow. */
1045 if (namelen
!= 4 && namelen
!= 5)
1049 if (*name
>= '0' && *name
<= '9')
1051 else if (*name
>= 'A' && *name
<= 'F')
1052 v
+= *name
- 'A' + 10;
1057 if (!is_unified_ideograph(v
))
1063 /* the following is the same as python's dictionary lookup, with
1064 only minor changes. see the makeunicodedata script for more
1067 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
1072 if (_cmpname(self
, v
, name
, namelen
)) {
1076 incr
= (h
^ (h
>> 3)) & mask
;
1080 i
= (i
+ incr
) & mask
;
1084 if (_cmpname(self
, v
, name
, namelen
)) {
1090 incr
= incr
^ code_poly
;
1094 static const _PyUnicode_Name_CAPI hashAPI
=
1096 sizeof(_PyUnicode_Name_CAPI
),
1101 /* -------------------------------------------------------------------- */
1102 /* Python bindings */
1104 PyDoc_STRVAR(unicodedata_name__doc__
,
1105 "name(unichr[, default])\n\
1106 Returns the name assigned to the Unicode character unichr as a\n\
1107 string. If no name is defined, default is returned, or, if not\n\
1108 given, ValueError is raised.");
1111 unicodedata_name(PyObject
* self
, PyObject
* args
)
1113 char name
[NAME_MAXLEN
];
1117 PyObject
* defobj
= NULL
;
1118 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1122 if (c
== (Py_UCS4
)-1)
1125 if (!_getucname(self
, c
, name
, sizeof(name
))) {
1126 if (defobj
== NULL
) {
1127 PyErr_SetString(PyExc_ValueError
, "no such name");
1136 return Py_BuildValue("s", name
);
1139 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1142 Look up character by name. If a character with the\n\
1143 given name is found, return the corresponding Unicode\n\
1144 character. If not found, KeyError is raised.");
1147 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1154 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1157 if (!_getcode(self
, name
, namelen
, &code
)) {
1158 PyErr_Format(PyExc_KeyError
, "undefined character name '%s'",
1163 #ifndef Py_UNICODE_WIDE
1164 if (code
>= 0x10000) {
1165 str
[0] = 0xd800 + ((code
- 0x10000) >> 10);
1166 str
[1] = 0xdc00 + ((code
- 0x10000) & 0x3ff);
1167 return PyUnicode_FromUnicode(str
, 2);
1170 str
[0] = (Py_UNICODE
) code
;
1171 return PyUnicode_FromUnicode(str
, 1);
1174 /* XXX Add doc strings. */
1176 static PyMethodDef unicodedata_functions
[] = {
1177 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1178 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1179 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1180 {"category", unicodedata_category
, METH_VARARGS
,
1181 unicodedata_category__doc__
},
1182 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1183 unicodedata_bidirectional__doc__
},
1184 {"combining", unicodedata_combining
, METH_VARARGS
,
1185 unicodedata_combining__doc__
},
1186 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1187 unicodedata_mirrored__doc__
},
1188 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1189 unicodedata_east_asian_width__doc__
},
1190 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1191 unicodedata_decomposition__doc__
},
1192 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1193 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1194 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1195 unicodedata_normalize__doc__
},
1196 {NULL
, NULL
} /* sentinel */
1199 static PyTypeObject UCD_Type
= {
1200 /* The ob_type field must be initialized in the module init function
1201 * to be portable to Windows without using C++. */
1202 PyVarObject_HEAD_INIT(NULL
, 0)
1203 "unicodedata.UCD", /*tp_name*/
1204 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1207 (destructor
)PyObject_Del
, /*tp_dealloc*/
1214 0, /*tp_as_sequence*/
1215 0, /*tp_as_mapping*/
1219 PyObject_GenericGetAttr
,/*tp_getattro*/
1222 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1226 0, /*tp_richcompare*/
1227 0, /*tp_weaklistoffset*/
1230 unicodedata_functions
, /*tp_methods*/
1231 DB_members
, /*tp_members*/
1237 0, /*tp_dictoffset*/
1245 PyDoc_STRVAR(unicodedata_docstring
,
1246 "This module provides access to the Unicode Character Database which\n\
1247 defines character properties for all Unicode characters. The data in\n\
1248 this database is based on the UnicodeData.txt file version\n\
1249 5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1251 The module uses the same names and symbols as defined by the\n\
1252 UnicodeData File Format 5.2.0 (see\n\
1253 http://www.unicode.org/reports/tr44/tr44-4.html).");
1256 initunicodedata(void)
1260 Py_TYPE(&UCD_Type
) = &PyType_Type
;
1263 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
1267 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1268 Py_INCREF(&UCD_Type
);
1269 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1271 /* Previous versions */
1272 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1274 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1277 v
= PyCapsule_New((void *)&hashAPI
, PyUnicodeData_CAPSULE_NAME
, NULL
);
1279 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1285 indent-tabs-mode: nil