#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200

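# These single-bit masks are OR-ed into the "flags" member of the
# _PyUnicode_TypeRecord entries emitted by makeunicodetype() below; for
# example, a plain uppercase letter ends up with ALPHA_MASK | UPPER_MASK
# (0x81), and NODELTA_MASK is added only when its case mappings do not fit
# the 16-bit delta encoding.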
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

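    # Each distinct (category, combining, bidirectional, mirrored,
    # east-asian-width, quick-check) tuple is stored once in "table";
    # "index" maps every code point to the position of its tuple, so
    # identical records are shared.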
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

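    # comp_data is a flattened total_first x total_last matrix: for a
    # composable pair, cell [first_index*total_last + last_index] holds the
    # composed code point, and 0 means "no canonical composition".  splitbins()
    # later compresses this mostly-empty matrix into two smaller index arrays.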
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
            (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

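    # Each code point gets an (upper, lower, title, decimal, digit, flags)
    # record.  Case mappings are stored as 16-bit deltas from the code point
    # whenever all three fit; otherwise the absolute values are kept and
    # NODELTA_MASK is set.  Identical records are shared through the
    # cache/index pair, exactly as in makeunicodedata() above.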
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

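        # Code points outside the BMP are wrapped in #ifdef Py_UNICODE_WIDE so
        # that the generated switch also compiles on narrow (UCS-2) builds,
        # where Py_UNICODE cannot represent them.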
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

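    # Phrasebook encoding: the "short" most frequent words are referenced by
    # a single byte (their index); the remaining words are referenced by an
    # escape byte (short + high byte of the index) followed by the low byte.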
    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
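    # Every phrasebook entry is below 256 by construction (either a short
    # one-byte index or an escape/low-byte pair), so getsize() reports 1 and
    # Array.dump() can emit the table as unsigned char.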

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)

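    # Fields 15-17 (ea-width, the properties set, and the NFC/NFD/NFKC/NFKD
    # quick-check bits) are not part of UnicodeData.txt itself; __init__
    # appends them while loading the EastAsianWidth, LineBreak and
    # DerivedNormalizationProps files.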
    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

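# Each SIZES entry is (table size, poly): the size is a power of two and poly
# parametrizes the probe-sequence update used on collisions.  Hash.__init__
# picks the first size strictly larger than the number of keys, so the table
# always keeps at least one free slot.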
class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr. The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

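# For example (illustrative), splitbins([1, 1, 1, 1, 2, 2, 2, 2]) picks
# shift 1 and returns t1 = [0, 0, 1, 1], t2 = [1, 1, 2, 2]; entry i is then
# recovered as t2[(t1[i >> 1] << 1) + (i & 1)].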
if __name__ == "__main__":
    maketables(1)
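# Usage note: run this script from the top of a CPython source tree with the
# UCD data files named above (UnicodeData.txt, CompositionExclusions.txt,
# EastAsianWidth.txt, DerivedNormalizationProps.txt, LineBreak.txt, Unihan.txt,
# plus the corresponding *-3.2.0.txt files) in the current directory; it
# rewrites Modules/unicodedata_db.h, Modules/unicodename_db.h and
# Objects/unicodetype_db.h.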