# NOTE(review): removed git-blame table residue ("Commit | Line | Data"
# header and commit-hash column) left over from a web extraction of this file.
1 | #!/usr/bin/env python |
2 | # | |
3 | # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT | |
4 | # file at the top-level directory of this distribution and at | |
5 | # http://rust-lang.org/COPYRIGHT. | |
6 | # | |
7 | # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
8 | # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
9 | # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
10 | # option. This file may not be copied, modified, or distributed | |
11 | # except according to those terms. | |
12 | ||
13 | # This script uses the following Unicode tables: | |
14 | # - DerivedNormalizationProps.txt | |
15 | # - ReadMe.txt | |
16 | # - UnicodeData.txt | |
17 | # | |
18 | # Since this should not require frequent updates, we just store this | |
19 | # out-of-line and check the unicode.rs file into git. | |
20 | ||
21 | import fileinput, re, os, sys, collections | |
22 | ||
# Header emitted verbatim at the top of the generated tables.rs file:
# license boilerplate, a "generated file" notice, and lint allowances.
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
37 | ||
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Maps each concrete General_Category value to the grouped categories that
# contain it (e.g. 'Lu' belongs to both 'LC' "cased letter" and 'L').
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # BUG FIX: 'No' previously mapped to ['No']; per Table 12 the grouped
    # category for Nd/Nl/No is 'N'.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)
54 | ||
def fetch(f):
    # Download the Unicode data file `f` into the working directory unless a
    # local copy already exists; abort the whole script if it is still absent
    # after the download attempt.
    local = os.path.basename(f)
    if not os.path.exists(local):
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)
    if not os.path.exists(local):
        sys.stderr.write("cannot load %s" % f)
        exit(1)
63 | ||
def is_surrogate(n):
    """Return True if codepoint `n` lies in the UTF-16 surrogate range."""
    lo, hi = surrogate_codepoints
    return lo <= n <= hi
66 | ||
def load_unicode_data(f):
    """Parse UnicodeData.txt into decomposition and combining-class tables.

    Returns (canon_decomp, compat_decomp, combines, general_category_mark):
    the decomposition dicts map a codepoint to a list of codepoints,
    `combines` is a sorted list of (lo, hi, class) ranges, and
    `general_category_mark` is a list of (lo, hi) ranges of codepoints
    whose General_Category is a Mark.
    """
    fetch(f)
    combines = {}
    canon_decomp = {}
    compat_decomp = {}
    general_category_mark = []

    udict = {}
    range_start = -1
    for line in fileinput.input(f):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        # Surrogates are not valid Rust `char`s; skip them entirely.
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            # A "<..., First>" line opened a range; fill every codepoint up
            # to (but excluding) the current one with this line's fields.
            # NOTE: `range` replaces Python-2-only `xrange`.
            for i in range(range_start, cp):
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        [code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase] = udict[code]

        # store decomposition, if given
        if decomp != "":
            if decomp.startswith('<'):
                # "<tag> XXXX YYYY ..." marks a compatibility decomposition.
                seq = []
                for i in decomp.split()[1:]:
                    seq.append(int(i, 16))
                compat_decomp[code] = seq
            else:
                seq = []
                for i in decomp.split():
                    seq.append(int(i, 16))
                canon_decomp[code] = seq

        # record combining class, if any
        if combine != "0":
            if combine not in combines:
                combines[combine] = []
            combines[combine].append(code)

        # Track every codepoint whose category is (or expands to) Mark.
        if 'M' in [gencat] + expanded_categories.get(gencat, []):
            general_category_mark.append(code)
    general_category_mark = group_cat(general_category_mark)

    combines = to_combines(group_cats(combines))

    return (canon_decomp, compat_decomp, combines, general_category_mark)
123 | ||
def group_cats(cats):
    """Apply group_cat to every value of a {category: [codepoint]} dict."""
    return {cat: group_cat(members) for cat, members in cats.items()}
129 | ||
def group_cat(cat):
    """Collapse a collection of codepoints into sorted, inclusive
    (start, end) ranges of consecutive values.

    Duplicates are tolerated (deduplicated by the set); an empty input
    yields an empty list.
    """
    cat_out = []
    letters = sorted(set(cat))
    if not letters:
        # ROBUSTNESS FIX: previously crashed with IndexError on pop(0);
        # an empty category legitimately produces no ranges.
        return cat_out
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        # sorted(set(...)) guarantees strictly increasing values
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
145 | ||
def ungroup_cat(cat):
    """Expand inclusive (lo, hi) ranges back into a flat list of codepoints."""
    expanded = []
    for lo, hi in cat:
        expanded.extend(range(lo, hi + 1))
    return expanded
153 | ||
def to_combines(combs):
    """Flatten {class: [(lo, hi)]} into one list of (lo, hi, class)
    triples, sorted by range start."""
    triples = [(lo, hi, comb) for comb in combs for (lo, hi) in combs[comb]]
    triples.sort(key=lambda t: t[0])
    return triples
161 | ||
def format_table_content(f, content, indent):
    """Write each entry of `content` to `f` on its own line, prefixed by
    `indent` spaces and suffixed with a trailing comma."""
    pad = " " * indent
    for entry in content:
        f.write("%s%s,\n" % (pad, entry))
166 | ||
def load_properties(f, interestingprops):
    """Parse a UCD property file into {prop_name: [(lo, hi)]} range tables.

    Only properties named in `interestingprops` are kept (all of them when
    the list is empty/falsy). Ranges are normalized through
    group_cat(ungroup_cat(...)) so adjacent entries merge.
    """
    fetch(f)
    props = {}
    # FIX: use raw strings — "\w" and "\." are invalid escape sequences in
    # plain string literals and raise warnings/errors on modern Python.
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        # Single-codepoint entries ("XXXX ; Prop") first, then range
        # entries ("XXXX..YYYY ; Prop"); anything else is skipped.
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if prop not in props:
            props[prop] = []
        props[prop].append((d_lo, d_hi))

    # optimize if possible: merge adjacent/overlapping ranges
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props
203 | ||
def escape_char(c):
    """Render codepoint `c` as a Rust char literal, e.g. 0x41 -> '\\u{41}'."""
    literal = "'\\u{%x}'" % c
    return literal
206 | ||
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Emit a Rust `const` slice named `name`, one formatted entry per line.

    `pfun` converts each element of `t_data` to its Rust source text and
    `is_pub` controls the `pub` visibility qualifier.
    """
    visibility = "pub " if is_pub else ""
    f.write("    %sconst %s: %s = &[\n" % (visibility, name, t_type))
    format_table_content(f, [pfun(entry) for entry in t_data], 8)
    f.write("\n    ];\n\n")
215 | ||
def emit_strtab_table(f, name, keys, vfun, is_pub=True,
                      tab_entry_type='char', slice_element_sfun=escape_char):
    """Emit a pair of Rust tables: `name`, mapping each key char to a
    `Slice { offset, length }` into a shared value table, and
    `name`_STRTAB, the deduplicated concatenation of all values.

    `vfun` maps a key to its value sequence; identical sequences share a
    single strtab entry. `slice_element_sfun` renders one strtab element.
    """
    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %s const %s: &'static [(char, Slice)] = &[\n"
            % (pub_string, name))

    # OrderedDict so the strtab is emitted in first-use order, matching the
    # offsets handed out below.
    strtab = collections.OrderedDict()
    strtab_offset = 0

    # TODO: a more sophisticated algorithm here would not only check for the
    # existence of v in the strtab, but also v in contiguous substrings of
    # strtab, if that's possible.
    for k in keys:
        v = tuple(vfun(k))
        if v in strtab:
            item_slice = strtab[v]
        else:
            value_len = len(v)
            item_slice = (strtab_offset, value_len)
            strtab[v] = item_slice
            strtab_offset += value_len

        f.write("%s(%s, Slice { offset: %d, length: %d }),\n"
                % (" "*8, escape_char(k), item_slice[0], item_slice[1]))

    f.write("\n    ];\n\n")

    f.write("    %s const %s_STRTAB: &'static [%s] = &[\n"
            % (pub_string, name, tab_entry_type))

    # FIX: dict.iteritems() is Python 2 only; the values were discarded
    # anyway, so iterate the keys directly.
    for v in strtab:
        f.write("%s%s,\n" % (" "*8, ', '.join(slice_element_sfun(c) for c in v)))

    f.write("\n    ];\n\n")
252 | ||
def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
    """Write the complete Rust `normalization` module: decomposition,
    composition, combining-class and Mark-category tables plus their
    binary-search accessor functions.

    Python 3 compatibility fixes relative to the original:
    - dict.keys().sort() -> sorted(...) (dict views have no .sort()),
    - `lambda (lo, hi): ...` tuple-parameter unpacking (SyntaxError on
      Python 3, PEP 3113) -> any() over a generator,
    - dict.has_key(k) -> `k in dict`,
    - sorted(seq, cmp_fn) -> sorted(seq, key=...).
    """
    canon_keys = sorted(canon.keys())

    compat_keys = sorted(compat.keys())

    # Invert canonical decompositions of length 2 into a composition table,
    # skipping anything listed in Full_Composition_Exclusion.
    canon_comp = {}
    comp_exclusions = norm_props["Full_Composition_Exclusion"]
    for char in canon_keys:
        if any(lo <= char <= hi for (lo, hi) in comp_exclusions):
            continue
        decomp = canon[char]
        if len(decomp) == 2:
            if decomp[0] not in canon_comp:
                canon_comp[decomp[0]] = []
            canon_comp[decomp[0]].append( (decomp[1], char) )
    canon_comp_keys = sorted(canon_comp.keys())

    f.write("pub mod normalization {\n")
    f.write("""
pub struct Slice {
    pub offset: u16,
    pub length: u16,
}
""")

    # Adapter: turn a lookup table into the vfun shape emit_strtab_table wants.
    def mkdata_fun(table):
        def f(char):
            return table[char]
        return f

    # TODO: should the strtab of these two tables be of type &'static str, for
    # smaller data?
    f.write("    // Canonical decompositions\n")
    emit_strtab_table(f, "canonical_table", canon_keys,
                      vfun=mkdata_fun(canon))

    f.write("    // Compatibility decompositions\n")
    emit_strtab_table(f, "compatibility_table", compat_keys,
                      vfun=mkdata_fun(compat))

    def comp_vfun(char):
        # Sort composition pairs by their first char (stable, deterministic
        # output); cmp-style lambdas were Python 2 only.
        return sorted(canon_comp[char], key=lambda pair: pair[0])

    f.write("    // Canonical compositions\n")
    emit_strtab_table(f, "composition_table", canon_comp_keys,
                      vfun=comp_vfun,
                      tab_entry_type="(char, char)",
                      slice_element_sfun=lambda pair: "(%s,%s)" % (escape_char(pair[0]),
                                                                   escape_char(pair[1])))

    f.write("""
    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
        use std::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, result) = r[idx];
                result
            }
            Err(_) => 0
        }
    }\n
""")

    emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
               pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))

    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
            + "        bsearch_range_value_table(c, combining_class_table)\n"
            + "    }\n")

    f.write("""
    fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
        use std::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo, hi)| {
            if lo <= c && c <= hi {
                Equal
            } else if hi < c {
                Less
            } else {
                Greater
            }
        })
        .is_ok()
    }

    /// Return whether the given character is a combining mark (`General_Category=Mark`)
    pub fn is_combining_mark(c: char) -> bool {
        bsearch_range_table(c, general_category_mark)
    }

""")

    emit_table(f, "general_category_mark", general_category_mark, "&'static [(char, char)]", is_pub=False,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))

    f.write("""
}

""")
360 | ||
361 | if __name__ == "__main__": | |
362 | r = "tables.rs" | |
363 | if os.path.exists(r): | |
364 | os.remove(r) | |
365 | with open(r, "w") as rf: | |
366 | # write the file's preamble | |
367 | rf.write(preamble) | |
368 | ||
369 | # download and parse all the data | |
370 | fetch("ReadMe.txt") | |
371 | with open("ReadMe.txt") as readme: | |
372 | pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" | |
373 | unicode_version = re.search(pattern, readme.read()).groups() | |
374 | rf.write(""" | |
375 | /// The version of [Unicode](http://www.unicode.org/) | |
376 | /// that this version of unicode-normalization is based on. | |
377 | pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); | |
378 | ||
379 | """ % unicode_version) | |
380 | (canon_decomp, compat_decomp, combines, general_category_mark) = \ | |
381 | load_unicode_data("UnicodeData.txt") | |
382 | norm_props = load_properties("DerivedNormalizationProps.txt", | |
383 | ["Full_Composition_Exclusion"]) | |
384 | ||
385 | # normalizations and conversions module | |
386 | emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props, | |
387 | general_category_mark) |