]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | #\r |
2 | # (re)generate unicode property and type databases\r | |
3 | #\r | |
4 | # this script converts a unicode 3.2 database file to\r | |
5 | # Modules/unicodedata_db.h, Modules/unicodename_db.h,\r | |
6 | # and Objects/unicodetype_db.h\r | |
7 | #\r | |
8 | # history:\r | |
9 | # 2000-09-24 fl created (based on bits and pieces from unidb)\r | |
10 | # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table\r | |
11 | # 2000-09-25 fl added character type table\r | |
12 | # 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)\r | |
13 | # 2000-11-03 fl expand first/last ranges\r | |
14 | # 2001-01-19 fl added character name tables (2.1)\r | |
15 | # 2001-01-21 fl added decomp compression; dynamic phrasebook threshold\r | |
16 | # 2002-09-11 wd use string methods\r | |
17 | # 2002-10-18 mvl update to Unicode 3.2\r | |
18 | # 2002-10-22 mvl generate NFC tables\r | |
19 | # 2002-11-24 mvl expand all ranges, sort names version-independently\r | |
20 | # 2002-11-25 mvl add UNIDATA_VERSION\r | |
21 | # 2004-05-29 perky add east asian width information\r | |
22 | # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta\r | |
23 | #\r | |
24 | # written by Fredrik Lundh (fredrik@pythonware.com)\r | |
25 | #\r | |
26 | \r | |
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
# %s is replaced by "" for the current version or "-X.Y.Z" for an
# old version (see maketables / old_versions below).
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Older UCD versions for which delta ("change") tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; list position is the numeric code
# stored in the generated tables (index 0, "Cn", means unassigned).
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position is the stored code.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations; position is the stored code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
67 | \r | |
68 | def maketables(trace=0):\r | |
69 | \r | |
70 | print "--- Reading", UNICODE_DATA % "", "..."\r | |
71 | \r | |
72 | version = ""\r | |
73 | unicode = UnicodeData(UNICODE_DATA % version,\r | |
74 | COMPOSITION_EXCLUSIONS % version,\r | |
75 | EASTASIAN_WIDTH % version,\r | |
76 | UNIHAN % version,\r | |
77 | DERIVEDNORMALIZATION_PROPS % version,\r | |
78 | LINE_BREAK % version)\r | |
79 | \r | |
80 | print len(filter(None, unicode.table)), "characters"\r | |
81 | \r | |
82 | for version in old_versions:\r | |
83 | print "--- Reading", UNICODE_DATA % ("-"+version), "..."\r | |
84 | old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),\r | |
85 | COMPOSITION_EXCLUSIONS % ("-"+version),\r | |
86 | EASTASIAN_WIDTH % ("-"+version),\r | |
87 | UNIHAN % ("-"+version))\r | |
88 | print len(filter(None, old_unicode.table)), "characters"\r | |
89 | merge_old_version(version, unicode, old_unicode)\r | |
90 | \r | |
91 | makeunicodename(unicode, trace)\r | |
92 | makeunicodedata(unicode, trace)\r | |
93 | makeunicodetype(unicode, trace)\r | |
94 | \r | |
95 | # --------------------------------------------------------------------\r | |
96 | # unicode character properties\r | |
97 | \r | |
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character database records plus their
    two-level (splitbins) index, the decomposition data and its index,
    the NFC first/last reindexing tables and composition pair table,
    and per-old-version change records for unicodedata's UCD deltas.
    """

    # Record 0 is the dummy used for unassigned code points
    # (category 0 == "Cn").
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables; identical records are
            # shared via the cache so `table` holds only unique tuples
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility
                # decomposition; canonical ones have no tag
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word packs prefix index (low byte)
                # and decomposition length (high bits), then the chars
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-char decompositions
                # whose char is not excluded and whose second element
                # is a starter (combining class "0")
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the
    # contiguous code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D table: composed char for (first-index, last-index).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
347 | \r | |
348 | # --------------------------------------------------------------------\r | |
349 | # unicode character type tables\r | |
350 | \r | |
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated character type records (case mappings,
    digit/decimal values, flag bits), their two-level index, and the C
    bodies of _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    # record 0 is the dummy entry for unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> list of code points
    spaces = []         # code points with the whitespace flag
    linebreaks = []     # code points with the linebreak flag

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas (stored as 16-bit two's complement)
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (shared via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # non-BMP cases are guarded by #ifdef Py_UNICODE_WIDE so the
        # file also compiles for narrow (UCS-2) builds
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
560 | \r | |
561 | # --------------------------------------------------------------------\r | |
562 | # unicode name database\r | |
563 | \r | |
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h.

    Builds a compressed code->name "phrasebook" (names split into
    words, words stored once in a shared lexicon string) plus a
    name->code hash table.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # "<...>" names are range markers/controls, not real names
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # the list length doubles as a use count
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    # word indexes below `short` fit in one phrasebook byte; the rest
    # are encoded as an escape byte plus a second byte
    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
722 | \r | |
723 | \r | |
def merge_old_version(version, new, old):
    """Record per-character differences between an old UCD and the
    current one on new.changed, as (version, change-record table,
    normalization changes).

    Each change record is (bidir, category, decimal, mirrored, numeric)
    indexed by code point; 0xFF (or 0 for numeric) means "no change".
    Raises NotImplementedError if the composition exclusions differ,
    and an ad-hoc Difference exception on an unhandled field change.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k is the UnicodeData.txt field index (see the record
            # structure comment on class UnicodeData)
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
806 | \r | |
807 | \r | |
808 | # --------------------------------------------------------------------\r | |
809 | # the following support code is taken from the unidb utilities\r | |
810 | # Copyright (c) 1999-2000 by Secret Labs AB\r | |
811 | \r | |
812 | # load a unicode-data file from disk\r | |
813 | \r | |
class UnicodeData:
    """In-memory model of the Unicode Character Database files.

    self.table maps each code point (0..0x10FFFF) to a list of fields,
    or None for unassigned code points.  Record structure:
      [ID, name, category, combining, bidi, decomp,            (6)
       decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
       ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
       properties]                                             (17)
    (a quickcheck integer is appended after 'properties' when
    derivednormalizationprops is given).
    """

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        table = [None] * 0x110000
        # UnicodeData.txt: one semicolon-separated record per line, first
        # field is the code point in hex.  Use "with" so the file object
        # is closed again (it used to be leaked).
        with open(filename) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        # expand first-last ranges: a "<..., First>"/"<..., Last>" record
        # pair stands for every code point in between; clone the First
        # record (with an empty name) onto each of them
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # CompositionExclusions.txt: one code point per non-comment line
        self.exclusions = {}
        with open(exclusions) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # EastAsianWidth.txt: "code;width" or "first..last;width" records;
        # the width is appended as an extra field of each assigned record
        widths = [None] * 0x110000
        with open(eastasianwidth) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = range(first, last+1)
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # properties field: a set of binary property names per record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            with open(linebreakprops) as file:
                for s in file:
                    s = s.partition('#')[0]
                    s = [i.strip() for i in s.split(';')]
                    if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                        continue
                    if '..' not in s[0]:
                        first = last = int(s[0], 16)
                    else:
                        first, last = [int(c, 16) for c in s[0].split('..')]
                    for char in range(first, last+1):
                        table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            # pack the four NFx quick-check answers into one integer per
            # code point, two bits per property (0=Yes, 1=Maybe, 2=No)
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as file:
                for s in file:
                    if '#' in s:
                        s = s[:s.index('#')]
                    s = [i.strip() for i in s.split(';')]
                    if len(s) < 2 or s[1] not in qc_order:
                        continue
                    quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                    quickcheck_shift = qc_order.index(s[1])*2
                    quickcheck <<= quickcheck_shift
                    if '..' not in s[0]:
                        first = last = int(s[0], 16)
                    else:
                        first, last = [int(c, 16) for c in s[0].split('..')]
                    for char in range(first, last+1):
                        # each property may be set at most once per char
                        assert not (quickchecks[char]>>quickcheck_shift)&3
                        quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric tags override the UnicodeData numeric field (8)
        with open(unihan) as file:
            for line in file:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
942 | \r | |
943 | # hash table tools\r | |
944 | \r | |
945 | # this is a straight-forward reimplementation of Python's built-in\r | |
946 | # dictionary type, using a static data structure, and a custom string\r | |
947 | # hash algorithm.\r | |
948 | \r | |
def myhash(s, magic):
    """Case-insensitive string hash for the static name-lookup tables.

    Folds each character of s.upper() into an accumulator; whenever bits
    spill into the top byte (mask 0xff000000) they are XORed back into
    the low bits, so the result always fits in 24 bits.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # no Python-2-only "L" suffix needed on the mask: plain int
        # literals promote to long automatically
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
957 | \r | |
# candidate (size, poly) pairs for the static hash tables: sizes are
# successive powers of two (Hash picks the smallest one larger than the
# data), and poly is the constant that Hash folds into its probe
# increment when the increment overflows the table mask -- presumably an
# irreducible polynomial per table size so the probe sequence covers all
# slots (not verified here)
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
964 | \r | |
965 | class Hash:\r | |
966 | def __init__(self, name, data, magic):\r | |
967 | # turn a (key, value) list into a static hash table structure\r | |
968 | \r | |
969 | # determine table size\r | |
970 | for size, poly in SIZES:\r | |
971 | if size > len(data):\r | |
972 | poly = size + poly\r | |
973 | break\r | |
974 | else:\r | |
975 | raise AssertionError, "ran out of polynominals"\r | |
976 | \r | |
977 | print size, "slots in hash table"\r | |
978 | \r | |
979 | table = [None] * size\r | |
980 | \r | |
981 | mask = size-1\r | |
982 | \r | |
983 | n = 0\r | |
984 | \r | |
985 | hash = myhash\r | |
986 | \r | |
987 | # initialize hash table\r | |
988 | for key, value in data:\r | |
989 | h = hash(key, magic)\r | |
990 | i = (~h) & mask\r | |
991 | v = table[i]\r | |
992 | if v is None:\r | |
993 | table[i] = value\r | |
994 | continue\r | |
995 | incr = (h ^ (h >> 3)) & mask;\r | |
996 | if not incr:\r | |
997 | incr = mask\r | |
998 | while 1:\r | |
999 | n = n + 1\r | |
1000 | i = (i + incr) & mask\r | |
1001 | v = table[i]\r | |
1002 | if v is None:\r | |
1003 | table[i] = value\r | |
1004 | break\r | |
1005 | incr = incr << 1\r | |
1006 | if incr > mask:\r | |
1007 | incr = incr ^ poly\r | |
1008 | \r | |
1009 | print n, "collisions"\r | |
1010 | self.collisions = n\r | |
1011 | \r | |
1012 | for i in range(len(table)):\r | |
1013 | if table[i] is None:\r | |
1014 | table[i] = 0\r | |
1015 | \r | |
1016 | self.data = Array(name + "_hash", table)\r | |
1017 | self.magic = magic\r | |
1018 | self.name = name\r | |
1019 | self.size = size\r | |
1020 | self.poly = poly\r | |
1021 | \r | |
1022 | def dump(self, file, trace):\r | |
1023 | # write data to file, as a C array\r | |
1024 | self.data.dump(file, trace)\r | |
1025 | file.write("#define %s_magic %d\n" % (self.name, self.magic))\r | |
1026 | file.write("#define %s_size %d\n" % (self.name, self.size))\r | |
1027 | file.write("#define %s_poly %d\n" % (self.name, self.poly))\r | |
1028 | \r | |
1029 | # stuff to deal with arrays of unsigned integers\r | |
1030 | \r | |
class Array:
    """A named array of unsigned integers that can be written out as a
    static C array definition."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        width = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", width*len(self.data), "bytes"
        # pick the narrowest C element type that can hold every value
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width,
                                                              "unsigned int")
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for item in self.data:
                piece = str(item) + ", "
                # wrap before the line would run past column 78
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = " " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1062 | \r | |
def getsize(data):
    # return smallest possible integer size (in bytes) for the given array
    peak = max(data)
    if peak < 0x100:
        return 1
    if peak < 0x10000:
        return 2
    return 4
1072 | \r | |
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        # dump is only defined (and only called) when tracing is on
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every candidate bin size 2**shift and keep whichever split has
    # the smallest combined C footprint
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        # bincache maps a bin (a size-long tuple slice of t) to its
        # offset in t2, so identical bins are stored only once
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            # t1 holds the bin index (offset pre-divided by the bin size)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
1133 | \r | |
if __name__ == "__main__":
    # regenerate all database headers, with tracing enabled
    # (maketables is defined earlier in this file)
    maketables(1)