#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we keep this script
# out-of-line and check the generated tables.rs and normalization_tests.rs
# files into git.
import collections
import sys
import urllib.request

UNICODE_VERSION = "9.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. For these precomputed tables, we then note
        that 1) every character with a canonical decomposition also has a
        compatibility decomposition, so the canonical table's keys are a
        subset of the compatibility table's keys, and 2) the two tables
        mostly agree on their intersection. Therefore, we don't store
        entries in the compatibility table for characters that decompose
        the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code-size cost.
        """
        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
        S_COUNT = L_COUNT * V_COUNT * T_COUNT

        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

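# Both _compute_fully_decomposed and _compute_stream_safe_tables skip the Hangul
# syllable block (S_BASE..S_BASE+S_COUNT) because its decompositions are derived
# arithmetically rather than stored in tables. The function below is a minimal
# sketch of that arithmetic from Unicode 9.0.0 Section 3.12; the L/V/T base
# constants come from that section rather than from elsewhere in this script.
# It is not called by the generator -- the real implementation lives on the
# Rust side of the crate.
def _hangul_decompose_sketch(s):
    S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
    L_COUNT, V_COUNT, T_COUNT = 19, 21, 28
    N_COUNT = V_COUNT * T_COUNT
    s_index = s - S_BASE
    assert 0 <= s_index < L_COUNT * N_COUNT, "not a precomposed Hangul syllable"
    l_jamo = L_BASE + s_index // N_COUNT                # leading consonant jamo
    v_jamo = V_BASE + (s_index % N_COUNT) // T_COUNT    # vowel jamo
    t_jamo = T_BASE + s_index % T_COUNT                 # trailing consonant jamo, if any
    # An LV syllable (t_jamo == T_BASE) decomposes to two jamo, an LVT syllable to three.
    return [l_jamo, v_jamo] if t_jamo == T_BASE else [l_jamo, v_jamo, t_jamo]
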
hexify = lambda c: '{:04X}'.format(c)

# Note: unlike the other gen_* helpers, this takes no `out` parameter; it writes
# through the module-level `out` handle opened in the __main__ block below.
def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

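# gen_combining_class above (and the trailing-nonstarter table in gen_stream_safe
# below) packs each table entry as `value | (codepoint << 8)`, so a single u32
# carries both the key and its payload. The helper below is only an illustrative
# sketch of that encoding; the actual decoding is done by the crate's Rust code.
def _unpack_packed_entry_sketch(entry):
    codepoint = entry >> 8   # the character the entry describes
    value = entry & 0xFF     # its combining class (or trailing-nonstarter count)
    return codepoint, value
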
def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

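# For the composition table, gen_composition_table packs each BMP pair into a single
# u32 key as `(c1 << 16) | c2`; pairs of astral (non-BMP) characters are emitted into
# the generated composition_table_astral match instead. A hypothetical lookup of a
# BMP pair would form its key as below (a sketch only; the real lookup lives in the
# crate's Rust code).
def _composition_key_sketch(c1, c2):
    assert c1 < 0x10000 and c2 < 0x10000, "astral pairs go through composition_table_astral"
    return (c1 << 16) | c2
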
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

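# The qc_nfc/qc_nfkc/qc_nfd/qc_nfkd functions generated above expose the per-character
# Quick_Check property (Yes/No/Maybe) from DerivedNormalizationProps.txt. The sketch
# below shows roughly how UAX #15 describes combining such a property with the
# canonical combining class to quick-check a string; `qc` and `combining_class` are
# hypothetical stand-ins, and the crate's actual logic lives in its quick_check module.
def _quick_check_sketch(text, qc, combining_class):
    result = "Yes"
    last_ccc = 0
    for ch in text:
        ccc = combining_class(ch)
        if ccc != 0 and last_ccc > ccc:
            return "No"          # combining marks out of canonical order
        check = qc(ch)           # "Yes", "No", or "Maybe"
        if check == "No":
            return "No"
        if check == "Maybe":
            result = "Maybe"     # only a full normalization pass can decide
        last_ccc = ccc
    return result
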
def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

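# _compute_stream_safe_tables explains that these leading/trailing counts exist to
# drive the Stream-Safe Text Process (UAX15-D4): a run of more than 30 consecutive
# non-starters must be broken up by inserting U+034F COMBINING GRAPHEME JOINER.
# The sketch below is one illustrative reading of that process; it assumes "complete"
# leading/trailing maps and a decomposed_len callback (the generated table is sparse),
# and the crate's real implementation is on the Rust side.
def _stream_safe_sketch(chars, leading_nonstarters, trailing_nonstarters, decomposed_len):
    CGJ = 0x034F              # COMBINING GRAPHEME JOINER
    MAX_NONSTARTERS = 30
    pending = 0               # non-starters currently at the end of the output
    out_chars = []
    for c in chars:
        num_leading = leading_nonstarters.get(c, 0)
        if pending + num_leading > MAX_NONSTARTERS:
            out_chars.append(CGJ)   # break the run before it grows past the limit
            pending = 0
        # If c decomposes entirely into non-starters the run keeps growing;
        # otherwise only its trailing non-starters carry over to the next step.
        if num_leading > 0 and num_leading == decomposed_len(c):
            pending += num_leading
        else:
            pending = trailing_nonstarters.get(c, 0)
        out_chars.append(c)
    return out_chars
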
def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to return a value less than n.
def my_hash(x, salt, n):
    # This is a hash based on the theory that multiplication is efficient.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we don't do that here, so the
        # lookup side can avoid that branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
        if salts[h] == 0:
            print("minimal perfect hashing failed")
            # Note: if this happens (because of unfortunate data), then there are
            # a few things that could be done. First, the hash function could be
            # tweaked. Second, the bucket order could be scrambled (especially the
            # singletons). Right now, the buckets are sorted, which has the advantage
            # of being deterministic.
            #
            # As a more extreme approach, the singleton bucket optimization could be
            # applied (give the direct address for singleton buckets, rather than
            # relying on a rehash). That is definitely the more standard approach in
            # the minimal perfect hashing literature, but in testing the branch was a
            # significant slowdown.
            sys.exit(1)
    return (salts, keys)

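# The comments above describe how the tables are built: keys are first bucketed by
# my_hash(key, 0, n), then each bucket gets a displacement salt so that
# my_hash(key, salt, n) lands every key in a distinct slot. A lookup against the two
# tables therefore takes exactly two hashes. The sketch below illustrates this with
# the Python-level (salts, keys) pair returned above; the crate performs the
# equivalent lookup in Rust against the emitted _SALT and _KV arrays, where each KV
# entry also packs its payload.
def _mph_lookup_sketch(key, salts, keys):
    n = len(salts)
    salt = salts[my_hash(key, 0, n)]   # displacement salt assigned to this key's bucket
    idx = my_hash(key, salt, n)        # final slot, guaranteed < n by my_hash
    # A miss shows up as a slot holding some other key, so confirm before trusting it.
    return idx if keys[idx] == key else None
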
if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)