#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"
# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
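
    # Worked example (standard UCD data, shown for illustration): U+00C0
    # decomposes to "0041 0300" with no tag, so the run stored in
    # decomp_data is [0x200, 0x41, 0x300] -- the first word packs the
    # decomp_prefix index in its low byte and the length in its high byte.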
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp, "    { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp, "    {0,0,0}"
    print >>fp, "};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp, "    { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp, "    {0,0,0}"
    print >>fp, "};\n"
    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"
    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)
    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]: 0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself.  Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
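            # Worked example (assuming the usual UCD mapping): for 'a'
            # (U+0061) the uppercase mapping is U+0041, so upper_d is -32
            # and the stored field is -32 & 0xffff == 0xffe0; the C side
            # is then expected to sign-extend the 16-bit delta and add it
            # back to the code point.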
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp, '    }'
    print >>fp, '    return -1.0;'
    print >>fp, '}'
    print >>fp
    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp, '    }'
    print >>fp, '    return 0;'
    print >>fp, '#endif'
    print >>fp, '}'
    print >>fp
    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp, '    }'
    print >>fp, '    return 0;'
    print >>fp, '}'
    print >>fp

    fp.close()
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"
    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"
    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)
    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
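
    # Encoding sketch (illustrative): the word "LATIN" is stored as
    # "LATI" plus chr(ord("N") + 128), while a sentence-ending word such
    # as "A\0" ends in chr(0 + 128) == chr(128), the whole-string
    # terminator mentioned in the comment above.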
    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
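
    # Decoding sketch (mirrors the encoding above): a phrasebook byte b
    # below `short` is a complete word index on its own; otherwise the
    # pair (b, next_byte) decodes to the index ((b - short) << 8) + next_byte.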
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
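
    # Each zipped element is a (bidir, category, decimal, mirrored,
    # numeric) tuple; makeunicodedata() above writes these out as the
    # change_record entries, so the field order matters here.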
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # collect composition exclusions
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
# --------------------------------------------------------------------
# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
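
# Note (illustrative property, not a pinned test vector): the hash folds
# each byte of s.upper() into a 24-bit accumulator, so lookups are
# case-insensitive -- myhash("a", magic) == myhash("A", magic) for any
# magic value.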
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
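
# A small round-trip sketch of the splitbins() invariant (illustrative;
# the shift actually chosen depends on the data):
#
#     t = [17] * 1024 + [23] * 1024
#     t1, t2, shift = splitbins(t)
#     mask = (1 << shift) - 1
#     assert t[100] == t2[(t1[100 >> shift] << shift) + (100 & mask)]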
if __name__ == "__main__":
    maketables(1)