#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200

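# Illustrative sketch (not part of the original tool): how a flags word built
# from the masks above is meant to be queried once it lands in a type record,
# mirroring the macros in Objects/unicodectype.c.
def _example_flag_queries(flags):
    return {
        "alpha": bool(flags & ALPHA_MASK),
        "decimal": bool(flags & DECIMAL_MASK),
        "digit": bool(flags & DIGIT_MASK),
        "lower": bool(flags & LOWER_MASK),
        "linebreak": bool(flags & LINEBREAK_MASK),
        "space": bool(flags & SPACE_MASK),
        "title": bool(flags & TITLE_MASK),
        "upper": bool(flags & UPPER_MASK),
        "numeric": bool(flags & NUMERIC_MASK),
    }
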
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

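    # At this point decomp_data is a flat array of length-prefixed records:
    # each record starts with a word whose low byte indexes decomp_prefix and
    # whose high bits hold the number of code points that follow;
    # decomp_index[char] is the offset of that character's record, or 0 if it
    # has no decomposition.
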
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

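    # comp_data is a dense total_first x total_last matrix stored row-major:
    # the canonical composition of a pair (l, r) sits at
    # comp_data[comp_first[l]*total_last + comp_last[r]], with 0 meaning
    # "no composition".
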
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

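    # Decoding note: unless NODELTA_MASK is set, the upper/lower/title fields
    # above are signed 16-bit deltas from the code point, stored
    # two's-complement in 0..0xffff; a consumer recovers the mapping as
    # ch + (delta - 0x10000 if delta >= 0x8000 else delta).  With
    # NODELTA_MASK set they are absolute code points.
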
    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, ' switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, ' case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, ' return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,' }'
    print >>fp,' return -1.0;'
    print >>fp,'}'
    print >>fp

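    # Note on the #ifdef bracketing above (and in the two generators below):
    # code points >= 0x10000 exist only in wide (UCS-4) builds, so their case
    # labels are wrapped in #ifdef Py_UNICODE_WIDE; haswide/hasnonewide track
    # whether the closing #endif must be emitted before or after the shared
    # return statement.
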
    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, ' return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, ' switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, ' case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, ' return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,' }'
    print >>fp,' return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, ' switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, ' case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, ' return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,' }'
    print >>fp,' return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()

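# Illustrative sketch (not part of the original tool): how the index1/index2
# tables written by makeunicodetype() are consulted at run time to find a
# character's type record; Objects/unicodectype.c performs the same two-level
# lookup in C.  `records`, `index1`, `index2` and `shift` stand for the
# generated _PyUnicode_TypeRecords, index1, index2 and SHIFT values.
def _example_type_record(records, index1, index2, shift, ch):
    mask = (1 << shift) - 1
    i = index2[(index1[ch >> shift] << shift) + (ch & mask)]
    return records[i]   # (upper, lower, title, decimal, digit, flags)
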
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

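    # phrasebook decoding (for reference): a byte b < short is a direct word
    # index into lexicon_offset; any larger byte is an escape, and the word
    # index is ((b - short) << 8) + next_byte.  Modules/unicodedata.c walks
    # the phrasebook this way when reconstructing a character name.
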
    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    # properties] (17)

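    # For reference, a raw UnicodeData.txt line looks like
    #   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
    # the ea-width, properties and quick-check fields are not in this file;
    # __init__ appends them from EastAsianWidth.txt, LineBreak.txt and
    # DerivedNormalizationProps.txt, and Unihan.txt patches the numeric field.
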
    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

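# Illustrative sketch (not part of the original tool): the slot sequence that
# Hash.__init__ above walks for a given key, which is also the probe order a
# consumer of the generated <name>_hash/_magic/_size/_poly tables must follow.
# The table stores only code points, so the caller has to verify each
# candidate (e.g. by regenerating its name) before accepting it.
def _example_probe_sequence(key, magic, size, poly):
    mask = size - 1
    h = myhash(key, magic)
    i = (~h) & mask
    yield i
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while 1:
        i = (i + incr) & mask
        yield i
        incr = incr << 1
        if incr > mask:
            incr = incr ^ poly
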
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift). Split a table to save space.

    t is a sequence of ints. This function can be useful to save space if
    many of the ints are the same. t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr. The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

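# Minimal usage sketch for splitbins() (illustrative only, not called by the
# generator): the round-trip property stated in the docstring can be checked
# for any table of ints like so.
def _example_splitbins_roundtrip(data):
    t1, t2, shift = splitbins(data)
    mask = (1 << shift) - 1
    return all(data[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
               for i in range(len(data)))
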
if __name__ == "__main__":
    maketables(1)