#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"
# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
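
    # Worked example (standard UCD data, shown for illustration): U+00C0
    # decomposes to "0041 0300" with no tag, so the run stored in
    # decomp_data is [0x200, 0x41, 0x300] -- the first word packs the
    # decomp_prefix index in its low byte and the length in its high byte.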
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp, "    { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp, "    {0,0,0}"
    print >>fp, "};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp, "    { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp, "    {0,0,0}"
    print >>fp, "};\n"
    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"
    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)
    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]: 0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself.  Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
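            # Worked example (assuming the usual UCD mapping): for 'a'
            # (U+0061) the uppercase mapping is U+0041, so upper_d is -32
            # and the stored field is -32 & 0xffff == 0xffe0; the C side
            # is then expected to sign-extend the 16-bit delta and add it
            # back to the code point.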
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp, '    }'
    print >>fp, '    return -1.0;'
    print >>fp, '}'
    print >>fp
    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp, '    }'
    print >>fp, '    return 0;'
    print >>fp, '#endif'
    print >>fp, '}'
    print >>fp
    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp, '    }'
    print >>fp, '    return 0;'
    print >>fp, '}'
    print >>fp

    fp.close()
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"
    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"
    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)
    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
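
    # Encoding sketch (illustrative): the word "LATIN" is stored as
    # "LATI" plus chr(ord("N") + 128), while a sentence-ending word such
    # as "A\0" ends in chr(0 + 128) == chr(128), the whole-string
    # terminator mentioned in the comment above.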
    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
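
    # Decoding sketch (mirrors the encoding above): a phrasebook byte b
    # below `short` is a complete word index on its own; otherwise the
    # pair (b, next_byte) decodes to the index ((b - short) << 8) + next_byte.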
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
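
    # Each zipped element is a (bidir, category, decimal, mirrored,
    # numeric) tuple; makeunicodedata() above writes these out as the
    # change_record entries, so the field order matters here.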
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # collect composition exclusions
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
# --------------------------------------------------------------------
# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
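
# Note (illustrative property, not a pinned test vector): the hash folds
# each byte of s.upper() into a 24-bit accumulator, so lookups are
# case-insensitive -- myhash("a", magic) == myhash("A", magic) for any
# magic value.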
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
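
# A small round-trip sketch of the splitbins() invariant (illustrative;
# the shift actually chosen depends on the data):
#
#     t = [17] * 1024 + [23] * 1024
#     t1, t2, shift = splitbins(t)
#     mask = (1 << shift) - 1
#     assert t[100] == t2[(t1[100 >> shift] << shift) + (100 & mask)]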
if __name__ == "__main__":
    maketables(1)