]>
Commit | Line | Data |
---|---|---|
223e47cc | 1 | #!/usr/bin/env python |
223e47cc | 2 | # |
1a4d82fc JJ |
3 | # Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT |
4 | # file at the top-level directory of this distribution and at | |
5 | # http://rust-lang.org/COPYRIGHT. | |
6 | # | |
7 | # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
8 | # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
9 | # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
10 | # option. This file may not be copied, modified, or distributed | |
11 | # except according to those terms. | |
12 | ||
13 | # This script uses the following Unicode tables: | |
14 | # - DerivedCoreProperties.txt | |
9346a6ac | 15 | # - DerivedNormalizationProps.txt |
1a4d82fc | 16 | # - EastAsianWidth.txt |
9346a6ac | 17 | # - auxiliary/GraphemeBreakProperty.txt |
1a4d82fc | 18 | # - PropList.txt |
9346a6ac | 19 | # - ReadMe.txt |
1a4d82fc JJ |
20 | # - Scripts.txt |
21 | # - UnicodeData.txt | |
22 | # | |
23 | # Since this should not require frequent updates, we just store this | |
24 | # out-of-line and check the unicode.rs file into git. | |
223e47cc | 25 | |
1a4d82fc JJ |
import fileinput, re, os, sys, operator

# Running byte totals used to compare table representations:
#   bytes_old -- size if tables were emitted as flat (char, char) ranges
#   bytes_new -- size of the emitted BoolTrie structures
# Both are updated by emit_bool_trie(); reported by the commented-out
# print at the bottom of the script.
bytes_old = 0
bytes_new = 0
# License/header text written verbatim at the top of the generated tables.rs.
preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
45 | ||
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Maps each concrete General_Category value to the grouped categories it
# belongs to.  The concrete category itself is always added separately in
# load_unicode_data(), so it must NOT appear in its own expansion list.
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # BUG FIX: 'No' previously expanded to ['No'] (itself) instead of the
    # grouped category 'N' required by TR44 Table 12.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
59 | ||
# these are the surrogate codepoints, which are not valid rust characters
# (inclusive bounds of the UTF-16 surrogate block; see is_surrogate())
surrogate_codepoints = (0xd800, 0xdfff)
223e47cc LB |
62 | |
def fetch(f):
    """Download Unicode data file `f` into the cwd if it is not present.

    Terminates the process with status 1 if the file still cannot be
    found after the download attempt.
    """
    if not os.path.exists(os.path.basename(f)):
        # NOTE(review): `f` is interpolated unquoted into a shell command;
        # all callers pass hard-coded filenames, so this is tolerable here.
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
        # use sys.exit instead of the site-module `exit` builtin, which is
        # not guaranteed to exist when site initialization is skipped
        sys.exit(1)
71 | ||
def is_surrogate(n):
    """Return True if codepoint `n` lies in the UTF-16 surrogate block.

    Bounds are 0xD800..0xDFFF inclusive (the values of the module-level
    `surrogate_codepoints` pair).
    """
    return 0xd800 <= n <= 0xdfff
223e47cc LB |
74 | |
def load_unicode_data(f):
    """Parse UnicodeData.txt into decomposition, category and case tables.

    Returns a 7-tuple:
      (canon_decomp, compat_decomp, gencats, combines,
       to_upper, to_lower, to_title)
    where the case maps hold (codepoint, 0, 0) triples keyed by codepoint,
    gencats maps category name -> list of (lo, hi) ranges, and combines is
    a sorted list of (lo, hi, combining_class) triples.
    """
    fetch(f)
    gencats = {}
    to_lower = {}
    to_upper = {}
    to_title = {}
    combines = {}
    canon_decomp = {}
    compat_decomp = {}

    udict = {}
    range_start = -1
    for line in fileinput.input(f):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            # a previous "<..., First>" line opened a range; fill every
            # codepoint up to (but excluding) the current one with this
            # line's fields.  range() replaces Python-2-only xrange().
            for i in range(range_start, cp):
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        (code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase) = udict[code]

        # generate char to char direct common and simple conversions
        # uppercase to lowercase
        if lowcase != "" and code_org != lowcase:
            to_lower[code] = (int(lowcase, 16), 0, 0)

        # lowercase to uppercase
        if upcase != "" and code_org != upcase:
            to_upper[code] = (int(upcase, 16), 0, 0)

        # title case
        if titlecase.strip() != "" and code_org != titlecase:
            to_title[code] = (int(titlecase, 16), 0, 0)

        # store decomposition, if given
        if decomp != "":
            if decomp.startswith('<'):
                # compatibility decomposition: "<tag> XXXX YYYY ..."
                seq = []
                for i in decomp.split()[1:]:
                    seq.append(int(i, 16))
                compat_decomp[code] = seq
            else:
                # canonical decomposition: "XXXX YYYY ..."
                seq = []
                for i in decomp.split():
                    seq.append(int(i, 16))
                canon_decomp[code] = seq

        # place letter in categories as appropriate
        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
            if cat not in gencats:
                gencats[cat] = []
            gencats[cat].append(code)

        # record combining class, if any
        if combine != "0":
            if combine not in combines:
                combines[combine] = []
            combines[combine].append(code)

    # generate Not_Assigned from Assigned
    gencats["Cn"] = gen_unassigned(gencats["Assigned"])
    # Assigned is not a real category
    del(gencats["Assigned"])
    # Other contains Not_Assigned
    gencats["C"].extend(gencats["Cn"])
    gencats = group_cats(gencats)
    combines = to_combines(group_cats(combines))

    return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
156 | ||
def load_special_casing(f, to_upper, to_lower, to_title):
    """Merge multi-codepoint mappings from SpecialCasing.txt into the maps.

    Only unconditional mappings are kept; each stored value is padded with
    zeros to exactly three codepoints.  Entries that map a character to
    itself are skipped.
    """
    fetch(f)
    for line in fileinput.input(f):
        fields = line.split('#')[0].split(';')
        if len(fields) == 5:
            code, lower, title, upper, _comment = fields
        elif len(fields) == 6:
            code, lower, title, upper, condition, _comment = fields
            if condition.strip():  # only keep unconditional mappings
                continue
        else:
            continue
        code = code.strip()
        key = int(code, 16)
        for target, raw in ((to_lower, lower), (to_upper, upper), (to_title, title)):
            raw = raw.strip()
            if raw == code:
                continue
            mapped = [int(cp, 16) for cp in raw.split()]
            # pad to exactly three codepoints with NULs
            mapped += [0] * (3 - len(mapped))
            assert len(mapped) == 3
            target[key] = mapped
1a4d82fc JJ |
181 | |
def group_cats(cats):
    """Apply group_cat() to every value of `cats`, keeping the same keys."""
    return {name: group_cat(members) for name, members in cats.items()}
187 | ||
def group_cat(cat):
    """Collapse a collection of codepoints into sorted inclusive ranges.

    Returns a list of (start, end) pairs covering exactly the distinct
    codepoints in `cat`, with consecutive codepoints merged into one range.
    """
    letters = sorted(set(cat))
    # ROBUSTNESS FIX: the original crashed with IndexError on empty input
    # (letters.pop(0) on an empty list); return no ranges instead.
    if not letters:
        return []
    cat_out = []
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
203 | ||
def ungroup_cat(cat):
    """Expand (lo, hi) inclusive ranges back into a flat codepoint list."""
    flat = []
    for lo, hi in cat:
        flat.extend(range(lo, hi + 1))
    return flat
211 | ||
def gen_unassigned(assigned):
    """Return all non-surrogate codepoints that are not in `assigned`.

    The result is ascending and skips the surrogate block 0xD800..0xDFFF.
    """
    assigned = set(assigned)
    return [cp for cp in range(0, 0x110000)
            if cp not in assigned and not (0xd800 <= cp <= 0xdfff)]
216 | ||
def to_combines(combs):
    """Flatten {class: [(lo, hi), ...]} into a list of (lo, hi, class).

    The result is sorted by range start.
    """
    flat = [(lo, hi, cls)
            for cls, ranges in combs.items()
            for (lo, hi) in ranges]
    flat.sort(key=lambda entry: entry[0])
    return flat
224 | ||
def format_table_content(f, content, indent):
    """Write `content` (comma-separated items) to `f`, wrapped near 98 cols.

    Every output line is prefixed with `indent` spaces; the final line is
    written without a trailing newline.
    """
    pad = " " * indent
    line = pad
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if not first:
                line += ", "
            line += chunk
            first = False
        else:
            # flush the current line and start a new one with this chunk
            f.write(line + ",\n")
            line = pad + chunk
    f.write(line)
223e47cc | 239 | |
1a4d82fc | 240 | def load_properties(f, interestingprops): |
223e47cc | 241 | fetch(f) |
1a4d82fc | 242 | props = {} |
d9579d0f AL |
243 | re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)") |
244 | re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") | |
223e47cc | 245 | |
9346a6ac | 246 | for line in fileinput.input(os.path.basename(f)): |
223e47cc LB |
247 | prop = None |
248 | d_lo = 0 | |
249 | d_hi = 0 | |
250 | m = re1.match(line) | |
251 | if m: | |
252 | d_lo = m.group(1) | |
253 | d_hi = m.group(1) | |
254 | prop = m.group(2) | |
255 | else: | |
256 | m = re2.match(line) | |
257 | if m: | |
258 | d_lo = m.group(1) | |
259 | d_hi = m.group(2) | |
260 | prop = m.group(3) | |
261 | else: | |
262 | continue | |
1a4d82fc | 263 | if interestingprops and prop not in interestingprops: |
223e47cc LB |
264 | continue |
265 | d_lo = int(d_lo, 16) | |
266 | d_hi = int(d_hi, 16) | |
1a4d82fc JJ |
267 | if prop not in props: |
268 | props[prop] = [] | |
269 | props[prop].append((d_lo, d_hi)) | |
d9579d0f AL |
270 | |
271 | # optimize if possible | |
272 | for prop in props: | |
273 | props[prop] = group_cat(ungroup_cat(props[prop])) | |
274 | ||
1a4d82fc JJ |
275 | return props |
276 | ||
def escape_char(c):
    """Render codepoint `c` as a Rust char literal; NUL becomes '\\0'."""
    if c == 0:
        return "'\\0'"
    return "'\\u{%x}'" % c
970d7e83 LB |
279 | |
def emit_bsearch_range_table(f):
    # Emits a Rust helper that binary-searches a sorted table of inclusive
    # (char, char) ranges for membership.  Currently unused -- superseded by
    # the BoolTrie lookup (see the commented-out call in __main__).
    f.write("""
fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
    use core::cmp::Ordering::{Equal, Less, Greater};
    r.binary_search_by(|&(lo, hi)| {
        if c < lo {
            Greater
        } else if hi < c {
            Less
        } else {
            Equal
        }
    })
    .is_ok()
}\n
""")
296 | ||
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Emit one Rust `const` table named `name` from the items in `t_data`.

    Each element is rendered by `pfun`; `is_pub` controls the `pub`
    qualifier on the emitted const.
    """
    pub_string = "pub " if is_pub else ""
    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
    rendered = [pfun(dat) for dat in t_data]
    format_table_content(f, ",".join(rendered), 8)
    f.write("\n    ];\n\n")
312 | ||
a7813a04 XL |
def emit_trie_lookup_range_table(f):
    # Writes the fixed Rust scaffolding (the BoolTrie type plus its two
    # lookup helpers) shared by every property module that
    # emit_property_module()/emit_bool_trie() generate afterwards.
    f.write("""

// BoolTrie is a trie for representing a set of Unicode codepoints. It is
// implemented with postfix compression (sharing of identical child nodes),
// which gives both compact size and fast lookup.
//
// The space of Unicode codepoints is divided into 3 subareas, each
// represented by a trie with different depth. In the first (0..0x800), there
// is no trie structure at all; each u64 entry corresponds to a bitvector
// effectively holding 64 bool values.
//
// In the second (0x800..0x10000), each child of the root node represents a
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
// the trie stores an 8-bit index into a shared table of leaf values. This
// exploits the fact that in reasonable sets, many such leaves can be shared.
//
// In the third (0x10000..0x110000), each child of the root node represents a
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
// of a child tree. Each of these 64 bytes represents an index into the table
// of shared 64-bit leaf values. This exploits the sparse structure in the
// non-BMP range of most Unicode sets.
pub struct BoolTrie {
    // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
    r1: [u64; 32], // leaves

    // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
    r2: [u8; 992], // first level
    r3: &'static [u64], // leaves

    // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
    r4: [u8; 256], // first level
    r5: &'static [u8], // second level
    r6: &'static [u64], // leaves
}

fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
    ((bitmap_chunk >> (c & 63)) & 1) != 0
}

fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
    let c = c as usize;
    if c < 0x800 {
        trie_range_leaf(c, r.r1[c >> 6])
    } else if c < 0x10000 {
        let child = r.r2[(c >> 6) - 0x20];
        trie_range_leaf(c, r.r3[child as usize])
    } else {
        let child = r.r4[(c >> 12) - 0x10];
        let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
        trie_range_leaf(c, r.r6[leaf as usize])
    }
}\n
""")
367 | ||
def compute_trie(rawdata, chunksize):
    """Postfix-compress `rawdata` into a (root, child_data) pair.

    `rawdata` is split into consecutive chunks of `chunksize` items;
    identical chunks are stored once in `child_data`, and `root` holds one
    chunk index per position.
    """
    root = []
    childmap = {}
    child_data = []
    # FIX: floor division -- plain `/` yields a float on Python 3, which
    # range() rejects
    for i in range(len(rawdata) // chunksize):
        data = rawdata[i * chunksize: (i + 1) * chunksize]
        child = '|'.join(map(str, data))
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)
380 | ||
def emit_bool_trie(f, name, t_data, is_pub=True):
    """Emit `t_data` ((lo, hi) inclusive char ranges) as a Rust BoolTrie.

    Also updates the module-level bytes_old/bytes_new size counters.
    """
    global bytes_old, bytes_new
    bytes_old += 8 * len(t_data)
    CHUNK = 64
    # one bool per codepoint in 0..0x110000
    rawdata = [False] * 0x110000
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            rawdata[cp] = True

    # convert to bitmap chunks of 64 bits each
    # FIX: floor division throughout -- `/` yields floats on Python 3
    chunks = []
    for i in range(0x110000 // CHUNK):
        chunk = 0
        for j in range(64):
            if rawdata[i * 64 + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
    f.write("        r1: [\n")
    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    # 0x800..0x10000 trie
    (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK)
    f.write("        r2: [\n")
    data = ','.join(str(node) for node in r2)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r3: &[\n")
    data = ','.join('0x%016x' % chunk for chunk in r3)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    # 0x10000..0x110000 trie
    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK)
    (r4, r5) = compute_trie(mid, 64)
    f.write("        r4: [\n")
    data = ','.join(str(node) for node in r4)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r5: &[\n")
    data = ','.join(str(node) for node in r5)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r6: &[\n")
    data = ','.join('0x%016x' % chunk for chunk in r6)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    f.write("    };\n\n")
    # r1 (256) + r2 (992) + r4 (256) are inline arrays; r3/r5/r6 are slices
    bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)
437 | ||
def emit_property_module(f, mod, tbl, emit):
    """Emit Rust module `mod`: a BoolTrie plus a predicate fn per name in `emit`."""
    f.write("pub mod %s {\n" % mod)
    for prop in sorted(emit):
        emit_bool_trie(f, "%s_table" % prop, tbl[prop])
        f.write("    pub fn %s(c: char) -> bool {\n"
                "        super::trie_lookup_range_table(c, %s_table)\n"
                "    }\n\n" % (prop, prop))
    f.write("}\n\n")
446 | ||
def emit_conversions_module(f, to_upper, to_lower, to_title):
    """Emit the Rust `conversions` module: case tables plus lookup fns.

    NOTE(review): `to_title` is accepted for signature compatibility but no
    titlecase table is emitted here.
    """
    f.write("pub mod conversions {")
    f.write("""
    use core::option::Option;
    use core::option::Option::{Some, None};

    pub fn to_lower(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_lowercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_lowercase_table[index].1,
        }
    }

    pub fn to_upper(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_uppercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_uppercase_table[index].1,
        }
    }

    fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
        table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
    }

""")
    t_type = "&'static [(char, [char; 3])]"
    pfun = lambda x: "(%s,[%s,%s,%s])" % (
        escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
    # FIX: dict.iteritems() does not exist on Python 3; items() behaves
    # identically here (we only iterate once, sorted by codepoint)
    emit_table(f, "to_lowercase_table",
        sorted(to_lower.items(), key=operator.itemgetter(0)),
        is_pub=False, t_type = t_type, pfun=pfun)
    emit_table(f, "to_uppercase_table",
        sorted(to_upper.items(), key=operator.itemgetter(0)),
        is_pub=False, t_type = t_type, pfun=pfun)
    f.write("}\n\n")
482 | ||
def emit_norm_module(f, canon, compat, combine, norm_props):
    """Build canonical-composition data from the decomposition tables.

    NOTE(review): only the first half of the historical function survives in
    this file -- it computes `canon_comp` and the sorted key lists but emits
    nothing to `f`.  Those computations (including the unused `compat_keys`)
    are preserved as-is.

    Python 3 fixes: sorted() instead of list.sort() on .keys() views,
    any() instead of a tuple-unpacking lambda (a SyntaxError on py3),
    and `in` instead of the removed dict.has_key().
    """
    canon_keys = sorted(canon.keys())

    compat_keys = sorted(compat.keys())

    canon_comp = {}
    comp_exclusions = norm_props["Full_Composition_Exclusion"]
    for char in canon_keys:
        # skip anything inside a Full_Composition_Exclusion range
        if any(lo <= char <= hi for (lo, hi) in comp_exclusions):
            continue
        decomp = canon[char]
        if len(decomp) == 2:
            # pair decomposition: record (second codepoint, composed char)
            # under the first codepoint
            if decomp[0] not in canon_comp:
                canon_comp[decomp[0]] = []
            canon_comp[decomp[0]].append( (decomp[1], char) )
    canon_comp_keys = sorted(canon_comp.keys())
502 | ||
1a4d82fc JJ |
if __name__ == "__main__":
    # Regenerate tables.rs from scratch in the current directory.
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            # FIX: raw string -- "\d" / "\." are invalid escapes in a plain
            # literal on modern Python
            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            unicode_version = re.search(pattern, readme.read()).groups()
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that the unicode parts of `CharExt` and `UnicodeStrPrelude` traits are based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
        (canon_decomp, compat_decomp, gencats, combines,
         to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
        load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
        want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
                        "Cased", "Case_Ignorable"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        scripts = load_properties("Scripts.txt", [])
        props = load_properties("PropList.txt",
                ["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
        norm_props = load_properties("DerivedNormalizationProps.txt",
                ["Full_Composition_Exclusion"])

        # trie_lookup_table is used in all the property modules below
        emit_trie_lookup_range_table(rf)
        # emit_bsearch_range_table(rf)

        # category tables
        for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
                                  ("derived_property", derived, want_derived), \
                                  ("property", props, ["White_Space", "Pattern_White_Space"]):
            emit_property_module(rf, name, cat, pfuns)

        # normalizations and conversions module
        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
        emit_conversions_module(rf, to_upper, to_lower, to_title)
        #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)