#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.
import collections
import urllib.request

UNICODE_VERSION = "9.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
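            # A data line looks roughly like this (from memory, abbreviated and
            # illustrative only):
            #   00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;...
            # We only use fields 0 (code point), 2 (general category),
            # 3 (canonical combining class), and 5 (decomposition mapping).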
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

    def _load_norm_props(self):
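        # DerivedNormalizationProps.txt lines look roughly like (shown from
        # memory, illustrative only):
        #   0340..0341    ; NFC_QC; N     # ...
        # i.e. a code point or range, a property name, and an optional value,
        # with everything after '#' being a comment.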
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
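        # NormalizationTest.txt rows have five ';'-separated columns of
        # space-separated code points (roughly: source;NFC;NFD;NFKC;NFKD;),
        # plus '@Part' section headers and '#' comments, which the length
        # check below skips. Format recalled from memory; treat as illustrative.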
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
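        # Build the canonical composition map: a (starter, combining char) pair
        # maps back to its primary composite, e.g. (0x0041, 0x0300) -> 0x00C0
        # ('A' plus combining grave -> 'À'), minus the Full_Composition_Exclusion
        # set. (The example pair is from memory; the actual data comes from
        # UnicodeData.txt.)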
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. Then, for these precomputed tables, we note
        that 1) the canonical fully-decomposed table is a subset of the
        compatibility one and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatibility table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code size cost.
        """
        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
        S_COUNT = L_COUNT * V_COUNT * T_COUNT
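        # Hangul syllables are skipped below because their decomposition is
        # purely arithmetic. A sketch of that algorithm, for reference only
        # (not used by this script; the L/V/T bases 0x1100/0x1161/0x11A7 are
        # taken from Section 3.12):
        #
        #     def _decompose_hangul(s):
        #         s_index = s - S_BASE
        #         l_char = 0x1100 + s_index // (V_COUNT * T_COUNT)
        #         v_char = 0x1161 + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
        #         t_index = s_index % T_COUNT
        #         return [l_char, v_char] + ([0x11A7 + t_index] if t_index else [])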

        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically (see the sketch above).
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table, since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

def gen_mph_data(name, d, kv_type, kv_callback):
    # Note: unlike the other gen_* functions, this relies on the module-level
    # `out` opened in the __main__ block at the bottom of this file.
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")
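# For illustration, a call like gen_mph_data('example', table, 'u32', fmt)
# (the name and values below are hypothetical placeholders) emits Rust of
# roughly this shape:
#
#     pub(crate) const EXAMPLE_SALT: &[u16] = &[
#         0x1,
#         0x8,
#     ];
#     pub(crate) const EXAMPLE_KV: &[u32] = &[
#         0x30022,
#         0x4900e6,
#     ];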

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    (salt, keys) = minimal_perfect_hash(table)
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash, but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    # Each KV entry packs the trailing-nonstarter count into the low 8 bits
    # and the code point into the bits above, as in the combining-class table.
    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Returns a value guaranteed to be less than n.
def my_hash(x, salt, n):
    # This is a hash based on the theory that multiplication is efficient;
    # 2654435769 is the 32-bit Knuth multiplicative constant (roughly
    # 2**32 divided by the golden ratio).
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)
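# For reference, a sketch (not used by this script) of how a consumer would
# probe the generated SALT/KV data with the same two-level scheme:
#
#     def mph_lookup(x, salts, keys):
#         n = len(salts)
#         salt = salts[my_hash(x, 0, n)]   # first-level hash picks the salt
#         idx = my_hash(x, salt, n)        # second-level hash picks the slot
#         return idx if keys[idx] == x else None  # verify; misses can land anywhere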

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)