# NOTE: this file was recovered from a git-blame/table export; blame
# prefixes have been stripped from the code below.
#!/usr/bin/env python
#
# Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - DerivedNormalizationProps.txt
# - EastAsianWidth.txt
# - auxiliary/GraphemeBreakProperty.txt
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
# - SpecialCasing.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput
import operator
import os
import re
import sys
d9579d0f | 28 | preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT |
1a4d82fc JJ |
29 | // file at the top-level directory of this distribution and at |
30 | // http://rust-lang.org/COPYRIGHT. | |
31 | // | |
32 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
33 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
34 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
35 | // option. This file may not be copied, modified, or distributed | |
36 | // except according to those terms. | |
223e47cc | 37 | |
1a4d82fc JJ |
38 | // NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly |
39 | ||
40 | #![allow(missing_docs, non_upper_case_globals, non_snake_case)] | |
41 | ''' | |
42 | ||
43 | # Mapping taken from Table 12 from: | |
44 | # http://www.unicode.org/reports/tr44/#General_Category_Values | |
45 | expanded_categories = { | |
46 | 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], | |
47 | 'Lm': ['L'], 'Lo': ['L'], | |
48 | 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], | |
49 | 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], | |
50 | 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], | |
51 | 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], | |
52 | 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], | |
53 | 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], | |
54 | 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], | |
55 | } | |
56 | ||
9346a6ac AL |
57 | # these are the surrogate codepoints, which are not valid rust characters |
58 | surrogate_codepoints = (0xd800, 0xdfff) | |
223e47cc LB |
59 | |
60 | def fetch(f): | |
9346a6ac | 61 | if not os.path.exists(os.path.basename(f)): |
223e47cc LB |
62 | os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" |
63 | % f) | |
64 | ||
9346a6ac | 65 | if not os.path.exists(os.path.basename(f)): |
223e47cc LB |
66 | sys.stderr.write("cannot load %s" % f) |
67 | exit(1) | |
68 | ||
c34b1796 | 69 | def is_surrogate(n): |
9346a6ac | 70 | return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] |
223e47cc LB |
71 | |
72 | def load_unicode_data(f): | |
73 | fetch(f) | |
74 | gencats = {} | |
62682a34 SL |
75 | to_lower = {} |
76 | to_upper = {} | |
77 | to_title = {} | |
1a4d82fc | 78 | combines = {} |
223e47cc LB |
79 | canon_decomp = {} |
80 | compat_decomp = {} | |
1a4d82fc | 81 | |
c34b1796 AL |
82 | udict = {}; |
83 | range_start = -1; | |
223e47cc | 84 | for line in fileinput.input(f): |
c34b1796 AL |
85 | data = line.split(';'); |
86 | if len(data) != 15: | |
223e47cc | 87 | continue |
c34b1796 AL |
88 | cp = int(data[0], 16); |
89 | if is_surrogate(cp): | |
1a4d82fc | 90 | continue |
c34b1796 AL |
91 | if range_start >= 0: |
92 | for i in xrange(range_start, cp): | |
93 | udict[i] = data; | |
94 | range_start = -1; | |
95 | if data[1].endswith(", First>"): | |
96 | range_start = cp; | |
97 | continue; | |
98 | udict[cp] = data; | |
99 | ||
100 | for code in udict: | |
101 | [code_org, name, gencat, combine, bidi, | |
102 | decomp, deci, digit, num, mirror, | |
103 | old, iso, upcase, lowcase, titlecase ] = udict[code]; | |
1a4d82fc JJ |
104 | |
105 | # generate char to char direct common and simple conversions | |
106 | # uppercase to lowercase | |
62682a34 SL |
107 | if lowcase != "" and code_org != lowcase: |
108 | to_lower[code] = (int(lowcase, 16), 0, 0) | |
223e47cc | 109 | |
1a4d82fc | 110 | # lowercase to uppercase |
62682a34 SL |
111 | if upcase != "" and code_org != upcase: |
112 | to_upper[code] = (int(upcase, 16), 0, 0) | |
113 | ||
114 | # title case | |
115 | if titlecase.strip() != "" and code_org != titlecase: | |
116 | to_title[code] = (int(titlecase, 16), 0, 0) | |
223e47cc | 117 | |
1a4d82fc | 118 | # store decomposition, if given |
223e47cc LB |
119 | if decomp != "": |
120 | if decomp.startswith('<'): | |
121 | seq = [] | |
122 | for i in decomp.split()[1:]: | |
123 | seq.append(int(i, 16)) | |
124 | compat_decomp[code] = seq | |
125 | else: | |
126 | seq = [] | |
127 | for i in decomp.split(): | |
128 | seq.append(int(i, 16)) | |
129 | canon_decomp[code] = seq | |
130 | ||
1a4d82fc JJ |
131 | # place letter in categories as appropriate |
132 | for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): | |
133 | if cat not in gencats: | |
134 | gencats[cat] = [] | |
135 | gencats[cat].append(code) | |
136 | ||
137 | # record combining class, if any | |
138 | if combine != "0": | |
139 | if combine not in combines: | |
140 | combines[combine] = [] | |
141 | combines[combine].append(code) | |
142 | ||
143 | # generate Not_Assigned from Assigned | |
144 | gencats["Cn"] = gen_unassigned(gencats["Assigned"]) | |
145 | # Assigned is not a real category | |
146 | del(gencats["Assigned"]) | |
147 | # Other contains Not_Assigned | |
148 | gencats["C"].extend(gencats["Cn"]) | |
149 | gencats = group_cats(gencats) | |
150 | combines = to_combines(group_cats(combines)) | |
151 | ||
62682a34 SL |
152 | return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title) |
153 | ||
154 | def load_special_casing(f, to_upper, to_lower, to_title): | |
155 | fetch(f) | |
156 | for line in fileinput.input(f): | |
157 | data = line.split('#')[0].split(';') | |
158 | if len(data) == 5: | |
159 | code, lower, title, upper, _comment = data | |
160 | elif len(data) == 6: | |
161 | code, lower, title, upper, condition, _comment = data | |
162 | if condition.strip(): # Only keep unconditional mappins | |
163 | continue | |
164 | else: | |
165 | continue | |
166 | code = code.strip() | |
167 | lower = lower.strip() | |
168 | title = title.strip() | |
169 | upper = upper.strip() | |
170 | key = int(code, 16) | |
171 | for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]: | |
172 | if values != code: | |
173 | values = [int(i, 16) for i in values.split()] | |
174 | for _ in range(len(values), 3): | |
175 | values.append(0) | |
176 | assert len(values) == 3 | |
177 | map_[key] = values | |
1a4d82fc JJ |
178 | |
179 | def group_cats(cats): | |
180 | cats_out = {} | |
181 | for cat in cats: | |
182 | cats_out[cat] = group_cat(cats[cat]) | |
183 | return cats_out | |
184 | ||
185 | def group_cat(cat): | |
186 | cat_out = [] | |
187 | letters = sorted(set(cat)) | |
188 | cur_start = letters.pop(0) | |
189 | cur_end = cur_start | |
190 | for letter in letters: | |
191 | assert letter > cur_end, \ | |
192 | "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter)) | |
193 | if letter == cur_end + 1: | |
194 | cur_end = letter | |
223e47cc | 195 | else: |
1a4d82fc JJ |
196 | cat_out.append((cur_start, cur_end)) |
197 | cur_start = cur_end = letter | |
198 | cat_out.append((cur_start, cur_end)) | |
199 | return cat_out | |
200 | ||
201 | def ungroup_cat(cat): | |
202 | cat_out = [] | |
203 | for (lo, hi) in cat: | |
204 | while lo <= hi: | |
205 | cat_out.append(lo) | |
206 | lo += 1 | |
207 | return cat_out | |
208 | ||
209 | def gen_unassigned(assigned): | |
210 | assigned = set(assigned) | |
211 | return ([i for i in range(0, 0xd800) if i not in assigned] + | |
212 | [i for i in range(0xe000, 0x110000) if i not in assigned]) | |
213 | ||
214 | def to_combines(combs): | |
215 | combs_out = [] | |
216 | for comb in combs: | |
217 | for (lo, hi) in combs[comb]: | |
218 | combs_out.append((lo, hi, comb)) | |
219 | combs_out.sort(key=lambda comb: comb[0]) | |
220 | return combs_out | |
221 | ||
222 | def format_table_content(f, content, indent): | |
223 | line = " "*indent | |
224 | first = True | |
225 | for chunk in content.split(","): | |
226 | if len(line) + len(chunk) < 98: | |
227 | if first: | |
228 | line += chunk | |
229 | else: | |
230 | line += ", " + chunk | |
231 | first = False | |
232 | else: | |
233 | f.write(line + ",\n") | |
234 | line = " "*indent + chunk | |
235 | f.write(line) | |
223e47cc | 236 | |
1a4d82fc | 237 | def load_properties(f, interestingprops): |
223e47cc | 238 | fetch(f) |
1a4d82fc | 239 | props = {} |
d9579d0f AL |
240 | re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)") |
241 | re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") | |
223e47cc | 242 | |
9346a6ac | 243 | for line in fileinput.input(os.path.basename(f)): |
223e47cc LB |
244 | prop = None |
245 | d_lo = 0 | |
246 | d_hi = 0 | |
247 | m = re1.match(line) | |
248 | if m: | |
249 | d_lo = m.group(1) | |
250 | d_hi = m.group(1) | |
251 | prop = m.group(2) | |
252 | else: | |
253 | m = re2.match(line) | |
254 | if m: | |
255 | d_lo = m.group(1) | |
256 | d_hi = m.group(2) | |
257 | prop = m.group(3) | |
258 | else: | |
259 | continue | |
1a4d82fc | 260 | if interestingprops and prop not in interestingprops: |
223e47cc LB |
261 | continue |
262 | d_lo = int(d_lo, 16) | |
263 | d_hi = int(d_hi, 16) | |
1a4d82fc JJ |
264 | if prop not in props: |
265 | props[prop] = [] | |
266 | props[prop].append((d_lo, d_hi)) | |
d9579d0f AL |
267 | |
268 | # optimize if possible | |
269 | for prop in props: | |
270 | props[prop] = group_cat(ungroup_cat(props[prop])) | |
271 | ||
1a4d82fc JJ |
272 | return props |
273 | ||
274 | # load all widths of want_widths, except those in except_cats | |
275 | def load_east_asian_width(want_widths, except_cats): | |
276 | f = "EastAsianWidth.txt" | |
277 | fetch(f) | |
278 | widths = {} | |
279 | re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)") | |
280 | re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") | |
281 | ||
282 | for line in fileinput.input(f): | |
283 | width = None | |
284 | d_lo = 0 | |
285 | d_hi = 0 | |
286 | cat = None | |
287 | m = re1.match(line) | |
288 | if m: | |
289 | d_lo = m.group(1) | |
290 | d_hi = m.group(1) | |
291 | width = m.group(2) | |
292 | cat = m.group(3) | |
293 | else: | |
294 | m = re2.match(line) | |
295 | if m: | |
296 | d_lo = m.group(1) | |
297 | d_hi = m.group(2) | |
298 | width = m.group(3) | |
299 | cat = m.group(4) | |
300 | else: | |
301 | continue | |
302 | if cat in except_cats or width not in want_widths: | |
303 | continue | |
304 | d_lo = int(d_lo, 16) | |
305 | d_hi = int(d_hi, 16) | |
306 | if width not in widths: | |
307 | widths[width] = [] | |
308 | widths[width].append((d_lo, d_hi)) | |
309 | return widths | |
223e47cc LB |
310 | |
311 | def escape_char(c): | |
62682a34 | 312 | return "'\\u{%x}'" % c if c != 0 else "'\\0'" |
970d7e83 LB |
313 | |
314 | def emit_bsearch_range_table(f): | |
315 | f.write(""" | |
1a4d82fc JJ |
316 | fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { |
317 | use core::cmp::Ordering::{Equal, Less, Greater}; | |
318 | use core::slice::SliceExt; | |
c34b1796 | 319 | r.binary_search_by(|&(lo,hi)| { |
1a4d82fc JJ |
320 | if lo <= c && c <= hi { Equal } |
321 | else if hi < c { Less } | |
322 | else { Greater } | |
c34b1796 | 323 | }).is_ok() |
1a4d82fc JJ |
324 | }\n |
325 | """) | |
326 | ||
327 | def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, | |
328 | pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))): | |
329 | pub_string = "" | |
330 | if is_pub: | |
331 | pub_string = "pub " | |
c34b1796 | 332 | f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type)) |
1a4d82fc JJ |
333 | data = "" |
334 | first = True | |
335 | for dat in t_data: | |
336 | if not first: | |
337 | data += "," | |
338 | first = False | |
339 | data += pfun(dat) | |
340 | format_table_content(f, data, 8) | |
341 | f.write("\n ];\n\n") | |
342 | ||
9346a6ac | 343 | def emit_property_module(f, mod, tbl, emit): |
970d7e83 | 344 | f.write("pub mod %s {\n" % mod) |
9346a6ac | 345 | for cat in sorted(emit): |
1a4d82fc | 346 | emit_table(f, "%s_table" % cat, tbl[cat]) |
9346a6ac AL |
347 | f.write(" pub fn %s(c: char) -> bool {\n" % cat) |
348 | f.write(" super::bsearch_range_table(c, %s_table)\n" % cat) | |
349 | f.write(" }\n\n") | |
1a4d82fc JJ |
350 | f.write("}\n\n") |
351 | ||
62682a34 | 352 | def emit_conversions_module(f, to_upper, to_lower, to_title): |
1a4d82fc JJ |
353 | f.write("pub mod conversions {") |
354 | f.write(""" | |
355 | use core::cmp::Ordering::{Equal, Less, Greater}; | |
356 | use core::slice::SliceExt; | |
357 | use core::option::Option; | |
358 | use core::option::Option::{Some, None}; | |
c34b1796 | 359 | use core::result::Result::{Ok, Err}; |
1a4d82fc | 360 | |
62682a34 SL |
361 | pub fn to_lower(c: char) -> [char; 3] { |
362 | match bsearch_case_table(c, to_lowercase_table) { | |
363 | None => [c, '\\0', '\\0'], | |
364 | Some(index) => to_lowercase_table[index].1 | |
365 | } | |
366 | } | |
367 | ||
368 | pub fn to_upper(c: char) -> [char; 3] { | |
369 | match bsearch_case_table(c, to_uppercase_table) { | |
370 | None => [c, '\\0', '\\0'], | |
371 | Some(index) => to_uppercase_table[index].1 | |
1a4d82fc JJ |
372 | } |
373 | } | |
374 | ||
62682a34 SL |
375 | pub fn to_title(c: char) -> [char; 3] { |
376 | match bsearch_case_table(c, to_titlecase_table) { | |
377 | None => [c, '\\0', '\\0'], | |
378 | Some(index) => to_titlecase_table[index].1 | |
1a4d82fc JJ |
379 | } |
380 | } | |
381 | ||
62682a34 | 382 | fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> { |
c34b1796 | 383 | match table.binary_search_by(|&(key, _)| { |
1a4d82fc JJ |
384 | if c == key { Equal } |
385 | else if key < c { Less } | |
386 | else { Greater } | |
387 | }) { | |
c34b1796 AL |
388 | Ok(i) => Some(i), |
389 | Err(_) => None, | |
1a4d82fc JJ |
390 | } |
391 | } | |
392 | ||
393 | """) | |
62682a34 SL |
394 | t_type = "&'static [(char, [char; 3])]" |
395 | pfun = lambda x: "(%s,[%s,%s,%s])" % ( | |
396 | escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])) | |
397 | emit_table(f, "to_lowercase_table", | |
398 | sorted(to_lower.iteritems(), key=operator.itemgetter(0)), | |
399 | is_pub=False, t_type = t_type, pfun=pfun) | |
400 | emit_table(f, "to_uppercase_table", | |
401 | sorted(to_upper.iteritems(), key=operator.itemgetter(0)), | |
402 | is_pub=False, t_type = t_type, pfun=pfun) | |
403 | emit_table(f, "to_titlecase_table", | |
404 | sorted(to_title.iteritems(), key=operator.itemgetter(0)), | |
405 | is_pub=False, t_type = t_type, pfun=pfun) | |
1a4d82fc JJ |
406 | f.write("}\n\n") |
407 | ||
408 | def emit_grapheme_module(f, grapheme_table, grapheme_cats): | |
409 | f.write("""pub mod grapheme { | |
1a4d82fc JJ |
410 | use core::slice::SliceExt; |
411 | pub use self::GraphemeCat::*; | |
c34b1796 | 412 | use core::result::Result::{Ok, Err}; |
1a4d82fc JJ |
413 | |
414 | #[allow(non_camel_case_types)] | |
85aaf69f | 415 | #[derive(Clone, Copy)] |
1a4d82fc JJ |
416 | pub enum GraphemeCat { |
417 | """) | |
418 | for cat in grapheme_cats + ["Any"]: | |
419 | f.write(" GC_" + cat + ",\n") | |
420 | f.write(""" } | |
421 | ||
1a4d82fc JJ |
422 | fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat { |
423 | use core::cmp::Ordering::{Equal, Less, Greater}; | |
c34b1796 | 424 | match r.binary_search_by(|&(lo, hi, _)| { |
1a4d82fc JJ |
425 | if lo <= c && c <= hi { Equal } |
426 | else if hi < c { Less } | |
427 | else { Greater } | |
428 | }) { | |
c34b1796 | 429 | Ok(idx) => { |
1a4d82fc JJ |
430 | let (_, _, cat) = r[idx]; |
431 | cat | |
432 | } | |
c34b1796 | 433 | Err(_) => GC_Any |
1a4d82fc JJ |
434 | } |
435 | } | |
436 | ||
437 | pub fn grapheme_category(c: char) -> GraphemeCat { | |
438 | bsearch_range_value_table(c, grapheme_cat_table) | |
439 | } | |
440 | ||
441 | """) | |
442 | ||
443 | emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]", | |
444 | pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]), | |
445 | is_pub=False) | |
223e47cc LB |
446 | f.write("}\n") |
447 | ||
1a4d82fc JJ |
448 | def emit_charwidth_module(f, width_table): |
449 | f.write("pub mod charwidth {\n") | |
450 | f.write(" use core::option::Option;\n") | |
451 | f.write(" use core::option::Option::{Some, None};\n") | |
452 | f.write(" use core::slice::SliceExt;\n") | |
c34b1796 | 453 | f.write(" use core::result::Result::{Ok, Err};\n") |
1a4d82fc JJ |
454 | f.write(""" |
455 | fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 { | |
456 | use core::cmp::Ordering::{Equal, Less, Greater}; | |
c34b1796 | 457 | match r.binary_search_by(|&(lo, hi, _, _)| { |
1a4d82fc JJ |
458 | if lo <= c && c <= hi { Equal } |
459 | else if hi < c { Less } | |
460 | else { Greater } | |
461 | }) { | |
c34b1796 | 462 | Ok(idx) => { |
1a4d82fc JJ |
463 | let (_, _, r_ncjk, r_cjk) = r[idx]; |
464 | if is_cjk { r_cjk } else { r_ncjk } | |
465 | } | |
c34b1796 | 466 | Err(_) => 1 |
1a4d82fc JJ |
467 | } |
468 | } | |
469 | """) | |
470 | ||
471 | f.write(""" | |
85aaf69f SL |
472 | pub fn width(c: char, is_cjk: bool) -> Option<usize> { |
473 | match c as usize { | |
1a4d82fc JJ |
474 | _c @ 0 => Some(0), // null is zero width |
475 | cu if cu < 0x20 => None, // control sequences have no width | |
476 | cu if cu < 0x7F => Some(1), // ASCII | |
477 | cu if cu < 0xA0 => None, // more control sequences | |
85aaf69f | 478 | _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize) |
1a4d82fc JJ |
479 | } |
480 | } | |
481 | ||
482 | """) | |
483 | ||
484 | f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n") | |
485 | f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n") | |
486 | emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False, | |
487 | pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])) | |
488 | f.write("}\n\n") | |
489 | ||
490 | def emit_norm_module(f, canon, compat, combine, norm_props): | |
223e47cc LB |
491 | canon_keys = canon.keys() |
492 | canon_keys.sort() | |
493 | ||
494 | compat_keys = compat.keys() | |
495 | compat_keys.sort() | |
223e47cc | 496 | |
1a4d82fc JJ |
497 | canon_comp = {} |
498 | comp_exclusions = norm_props["Full_Composition_Exclusion"] | |
499 | for char in canon_keys: | |
500 | if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions): | |
501 | continue | |
502 | decomp = canon[char] | |
503 | if len(decomp) == 2: | |
504 | if not canon_comp.has_key(decomp[0]): | |
505 | canon_comp[decomp[0]] = [] | |
506 | canon_comp[decomp[0]].append( (decomp[1], char) ) | |
507 | canon_comp_keys = canon_comp.keys() | |
508 | canon_comp_keys.sort() | |
509 | ||
510 | f.write("pub mod normalization {\n") | |
511 | ||
512 | def mkdata_fun(table): | |
513 | def f(char): | |
514 | data = "(%s,&[" % escape_char(char) | |
515 | first = True | |
516 | for d in table[char]: | |
517 | if not first: | |
518 | data += "," | |
519 | first = False | |
520 | data += escape_char(d) | |
521 | data += "])" | |
522 | return data | |
523 | return f | |
524 | ||
525 | f.write(" // Canonical decompositions\n") | |
526 | emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]", | |
527 | pfun=mkdata_fun(canon)) | |
528 | ||
529 | f.write(" // Compatibility decompositions\n") | |
530 | emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]", | |
531 | pfun=mkdata_fun(compat)) | |
532 | ||
533 | def comp_pfun(char): | |
534 | data = "(%s,&[" % escape_char(char) | |
535 | canon_comp[char].sort(lambda x, y: x[0] - y[0]) | |
536 | first = True | |
537 | for pair in canon_comp[char]: | |
538 | if not first: | |
539 | data += "," | |
540 | first = False | |
541 | data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1])) | |
542 | data += "])" | |
543 | return data | |
544 | ||
545 | f.write(" // Canonical compositions\n") | |
546 | emit_table(f, "composition_table", canon_comp_keys, | |
547 | "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun) | |
970d7e83 | 548 | |
1a4d82fc JJ |
549 | f.write(""" |
550 | fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 { | |
551 | use core::cmp::Ordering::{Equal, Less, Greater}; | |
552 | use core::slice::SliceExt; | |
c34b1796 AL |
553 | use core::result::Result::{Ok, Err}; |
554 | match r.binary_search_by(|&(lo, hi, _)| { | |
1a4d82fc JJ |
555 | if lo <= c && c <= hi { Equal } |
556 | else if hi < c { Less } | |
557 | else { Greater } | |
558 | }) { | |
c34b1796 | 559 | Ok(idx) => { |
1a4d82fc JJ |
560 | let (_, _, result) = r[idx]; |
561 | result | |
562 | } | |
c34b1796 | 563 | Err(_) => 0 |
1a4d82fc JJ |
564 | } |
565 | }\n | |
566 | """) | |
567 | ||
568 | emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False, | |
569 | pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2])) | |
570 | ||
d9579d0f AL |
571 | f.write(""" #[deprecated(reason = "use the crates.io `unicode-normalization` lib instead", |
572 | since = "1.0.0")] | |
573 | #[unstable(feature = "unicode", | |
574 | reason = "this functionality will be moved to crates.io")] | |
575 | pub fn canonical_combining_class(c: char) -> u8 { | |
576 | bsearch_range_value_table(c, combining_class_table) | |
577 | } | |
970d7e83 | 578 | |
1a4d82fc | 579 | } |
970d7e83 | 580 | |
1a4d82fc | 581 | """) |
970d7e83 | 582 | |
1a4d82fc JJ |
583 | def remove_from_wtable(wtable, val): |
584 | wtable_out = [] | |
585 | while wtable: | |
586 | if wtable[0][1] < val: | |
587 | wtable_out.append(wtable.pop(0)) | |
588 | elif wtable[0][0] > val: | |
589 | break | |
590 | else: | |
591 | (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0) | |
592 | if wt_lo == wt_hi == val: | |
593 | continue | |
594 | elif wt_lo == val: | |
595 | wtable_out.append((wt_lo+1, wt_hi, width, width_cjk)) | |
596 | elif wt_hi == val: | |
597 | wtable_out.append((wt_lo, wt_hi-1, width, width_cjk)) | |
598 | else: | |
599 | wtable_out.append((wt_lo, val-1, width, width_cjk)) | |
600 | wtable_out.append((val+1, wt_hi, width, width_cjk)) | |
601 | if wtable: | |
602 | wtable_out.extend(wtable) | |
603 | return wtable_out | |
970d7e83 | 604 | |
223e47cc | 605 | |
970d7e83 | 606 | |
1a4d82fc JJ |
607 | def optimize_width_table(wtable): |
608 | wtable_out = [] | |
609 | w_this = wtable.pop(0) | |
610 | while wtable: | |
611 | if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]: | |
612 | w_tmp = wtable.pop(0) | |
613 | w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3]) | |
614 | else: | |
615 | wtable_out.append(w_this) | |
616 | w_this = wtable.pop(0) | |
617 | wtable_out.append(w_this) | |
618 | return wtable_out | |
619 | ||
620 | if __name__ == "__main__": | |
621 | r = "tables.rs" | |
622 | if os.path.exists(r): | |
623 | os.remove(r) | |
624 | with open(r, "w") as rf: | |
625 | # write the file's preamble | |
626 | rf.write(preamble) | |
627 | ||
628 | # download and parse all the data | |
629 | fetch("ReadMe.txt") | |
630 | with open("ReadMe.txt") as readme: | |
631 | pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" | |
632 | unicode_version = re.search(pattern, readme.read()).groups() | |
633 | rf.write(""" | |
634 | /// The version of [Unicode](http://www.unicode.org/) | |
c34b1796 | 635 | /// that the unicode parts of `CharExt` and `UnicodeStrPrelude` traits are based on. |
85aaf69f | 636 | pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); |
1a4d82fc JJ |
637 | """ % unicode_version) |
638 | (canon_decomp, compat_decomp, gencats, combines, | |
62682a34 SL |
639 | to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt") |
640 | load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title) | |
641 | want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", | |
642 | "Cased", "Case_Ignorable"] | |
9346a6ac | 643 | derived = load_properties("DerivedCoreProperties.txt", want_derived) |
1a4d82fc JJ |
644 | scripts = load_properties("Scripts.txt", []) |
645 | props = load_properties("PropList.txt", | |
646 | ["White_Space", "Join_Control", "Noncharacter_Code_Point"]) | |
647 | norm_props = load_properties("DerivedNormalizationProps.txt", | |
648 | ["Full_Composition_Exclusion"]) | |
649 | ||
1a4d82fc JJ |
650 | # bsearch_range_table is used in all the property modules below |
651 | emit_bsearch_range_table(rf) | |
652 | ||
9346a6ac | 653 | # category tables |
1a4d82fc JJ |
654 | for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \ |
655 | ("derived_property", derived, want_derived), \ | |
1a4d82fc JJ |
656 | ("property", props, ["White_Space"]): |
657 | emit_property_module(rf, name, cat, pfuns) | |
1a4d82fc JJ |
658 | |
659 | # normalizations and conversions module | |
660 | emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props) | |
62682a34 | 661 | emit_conversions_module(rf, to_upper, to_lower, to_title) |
1a4d82fc JJ |
662 | |
663 | ### character width module | |
664 | width_table = [] | |
665 | for zwcat in ["Me", "Mn", "Cf"]: | |
666 | width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat])) | |
667 | width_table.append((4448, 4607, 0, 0)) | |
668 | ||
669 | # get widths, except those that are explicitly marked zero-width above | |
670 | ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"]) | |
671 | # these are doublewidth | |
672 | for dwcat in ["W", "F"]: | |
673 | width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat])) | |
674 | width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"])) | |
675 | ||
676 | width_table.sort(key=lambda w: w[0]) | |
677 | ||
678 | # soft hyphen is not zero width in preformatted text; it's used to indicate | |
679 | # a hyphen inserted to facilitate a linebreak. | |
680 | width_table = remove_from_wtable(width_table, 173) | |
681 | ||
682 | # optimize the width table by collapsing adjacent entities when possible | |
683 | width_table = optimize_width_table(width_table) | |
684 | emit_charwidth_module(rf, width_table) | |
685 | ||
686 | ### grapheme cluster module | |
687 | # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values | |
9346a6ac | 688 | grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", []) |
1a4d82fc JJ |
689 | |
690 | # Control | |
9346a6ac | 691 | # Note 1: |
1a4d82fc JJ |
692 | # This category also includes Cs (surrogate codepoints), but Rust's `char`s are |
693 | # Unicode Scalar Values only, and surrogates are thus invalid `char`s. | |
9346a6ac AL |
694 | # Thus, we have to remove Cs from the Control category |
695 | # Note 2: | |
696 | # 0x0a and 0x0d (CR and LF) are not in the Control category for Graphemes. | |
697 | # However, the Graphemes iterator treats these as a special case, so they | |
698 | # should be included in grapheme_cats["Control"] for our implementation. | |
1a4d82fc | 699 | grapheme_cats["Control"] = group_cat(list( |
9346a6ac AL |
700 | (set(ungroup_cat(grapheme_cats["Control"])) |
701 | | set(ungroup_cat(grapheme_cats["CR"])) | |
702 | | set(ungroup_cat(grapheme_cats["LF"]))) | |
703 | - set(ungroup_cat([surrogate_codepoints])))) | |
704 | del(grapheme_cats["CR"]) | |
705 | del(grapheme_cats["LF"]) | |
1a4d82fc JJ |
706 | |
707 | grapheme_table = [] | |
708 | for cat in grapheme_cats: | |
709 | grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]]) | |
710 | grapheme_table.sort(key=lambda w: w[0]) | |
711 | emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys()) |