]>
git.proxmox.com Git - rustc.git/blob - vendor/unicode-normalization/scripts/unicode.py
3 # Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
4 # file at the top-level directory of this distribution and at
5 # http://rust-lang.org/COPYRIGHT.
7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10 # option. This file may not be copied, modified, or distributed
11 # except according to those terms.
13 # This script uses the following Unicode tables:
14 # - DerivedNormalizationProps.txt
15 # - NormalizationTest.txt
18 # Since this should not require frequent updates, we just store this
19 # out-of-line and check the unicode.rs file into git.
23 UNICODE_VERSION
= "9.0.0"
24 UCD_URL
= "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
26 PREAMBLE
= """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
27 // file at the top-level directory of this distribution and at
28 // http://rust-lang.org/COPYRIGHT.
30 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
31 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
32 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
33 // option. This file may not be copied, modified, or distributed
34 // except according to those terms.
36 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
38 #![allow(missing_docs)]
41 NormalizationTest
= collections
.namedtuple(
43 ["source", "nfc", "nfd", "nfkc", "nfkd"],
46 # Mapping taken from Table 12 from:
47 # http://www.unicode.org/reports/tr44/#General_Category_Values
48 expanded_categories
= {
49 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
50 'Lm': ['L'], 'Lo': ['L'],
51 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
52 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
53 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
54 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
55 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
56 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
57 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
60 class UnicodeData(object):
62 self
._load
_unicode
_data
()
63 self
.norm_props
= self
._load
_norm
_props
()
64 self
.norm_tests
= self
._load
_norm
_tests
()
66 self
.canon_comp
= self
._compute
_canonical
_comp
()
67 self
.canon_fully_decomp
, self
.compat_fully_decomp
= self
._compute
_fully
_decomposed
()
69 def stats(name
, table
):
70 count
= sum(len(v
) for v
in table
.values())
71 print("%s: %d chars => %d decomposed chars" % (name
, len(table
), count
))
73 print("Decomposition table stats:")
74 stats("Canonical decomp", self
.canon_decomp
)
75 stats("Compatible decomp", self
.compat_decomp
)
76 stats("Canonical fully decomp", self
.canon_fully_decomp
)
77 stats("Compatible fully decomp", self
.compat_fully_decomp
)
79 self
.ss_leading
, self
.ss_trailing
= self
._compute
_stream
_safe
_tables
()
81 def _fetch(self
, filename
):
82 resp
= urllib
.request
.urlopen(UCD_URL
+ filename
)
83 return resp
.read().decode('utf-8')
85 def _load_unicode_data(self
):
86 self
.combining_classes
= {}
87 self
.compat_decomp
= {}
88 self
.canon_decomp
= {}
89 self
.general_category_mark
= []
91 for line
in self
._fetch
("UnicodeData.txt").splitlines():
92 # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
93 pieces
= line
.split(';')
94 assert len(pieces
) == 15
95 char
, category
, cc
, decomp
= pieces
[0], pieces
[2], pieces
[3], pieces
[5]
96 char_int
= int(char
, 16)
99 self
.combining_classes
[char_int
] = cc
101 if decomp
.startswith('<'):
102 self
.compat_decomp
[char_int
] = [int(c
, 16) for c
in decomp
.split()[1:]]
104 self
.canon_decomp
[char_int
] = [int(c
, 16) for c
in decomp
.split()]
106 if category
== 'M' or 'M' in expanded_categories
.get(category
, []):
107 self
.general_category_mark
.append(char_int
)
109 def _load_norm_props(self
):
110 props
= collections
.defaultdict(list)
112 for line
in self
._fetch
("DerivedNormalizationProps.txt").splitlines():
113 (prop_data
, _
, _
) = line
.partition("#")
114 prop_pieces
= prop_data
.split(";")
116 if len(prop_pieces
) < 2:
119 assert len(prop_pieces
) <= 3
120 (low
, _
, high
) = prop_pieces
[0].strip().partition("..")
122 prop
= prop_pieces
[1].strip()
125 if len(prop_pieces
) == 3:
126 data
= prop_pieces
[2].strip()
128 props
[prop
].append((low
, high
, data
))
132 def _load_norm_tests(self
):
134 for line
in self
._fetch
("NormalizationTest.txt").splitlines():
135 (test_data
, _
, _
) = line
.partition("#")
136 test_pieces
= test_data
.split(";")
138 if len(test_pieces
) < 5:
141 source
, nfc
, nfd
, nfkc
, nfkd
= [[c
.strip() for c
in p
.split()] for p
in test_pieces
[:5]]
142 tests
.append(NormalizationTest(source
, nfc
, nfd
, nfkc
, nfkd
))
146 def _compute_canonical_comp(self
):
149 (int(low
, 16), int(high
or low
, 16))
150 for low
, high
, _
in self
.norm_props
["Full_Composition_Exclusion"]
152 for char_int
, decomp
in self
.canon_decomp
.items():
153 if any(lo
<= char_int
<= hi
for lo
, hi
in comp_exclusions
):
156 assert len(decomp
) == 2
157 assert (decomp
[0], decomp
[1]) not in canon_comp
158 canon_comp
[(decomp
[0], decomp
[1])] = char_int
162 def _compute_fully_decomposed(self
):
164 Even though the decomposition algorithm is recursive, it is possible
165 to precompute the recursion at table generation time with modest
166 increase to the table size. Then, for these precomputed tables, we
167 note that 1) compatible decomposition is a subset of canonical
168 decomposition and 2) they mostly agree on their intersection.
169 Therefore, we don't store entries in the compatible table for
170 characters that decompose the same way under canonical decomposition.
172 Decomposition table stats:
173 Canonical decomp: 2060 chars => 3085 decomposed chars
174 Compatible decomp: 3662 chars => 5440 decomposed chars
175 Canonical fully decomp: 2060 chars => 3404 decomposed chars
176 Compatible fully decomp: 3678 chars => 5599 decomposed chars
178 The upshot is that decomposition code is very simple and easy to inline
179 at mild code size cost.
181 # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
182 # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
183 S_BASE
, L_COUNT
, V_COUNT
, T_COUNT
= 0xAC00, 19, 21, 28
184 S_COUNT
= L_COUNT
* V_COUNT
* T_COUNT
186 def _decompose(char_int
, compatible
):
187 # 7-bit ASCII never decomposes
192 # Assert that we're handling Hangul separately.
193 assert not (S_BASE
<= char_int
< S_BASE
+ S_COUNT
)
195 decomp
= self
.canon_decomp
.get(char_int
)
196 if decomp
is not None:
197 for decomposed_ch
in decomp
:
198 for fully_decomposed_ch
in _decompose(decomposed_ch
, compatible
):
199 yield fully_decomposed_ch
202 if compatible
and char_int
in self
.compat_decomp
:
203 for decomposed_ch
in self
.compat_decomp
[char_int
]:
204 for fully_decomposed_ch
in _decompose(decomposed_ch
, compatible
):
205 yield fully_decomposed_ch
212 max(self
.canon_decomp
.keys()),
213 max(self
.compat_decomp
.keys()),
216 canon_fully_decomp
= {}
217 compat_fully_decomp
= {}
219 for char_int
in range(0, end_codepoint
+ 1):
220 # Always skip Hangul, since it's more efficient to represent its
221 # decomposition programmatically.
222 if S_BASE
<= char_int
< S_BASE
+ S_COUNT
:
225 canon
= list(_decompose(char_int
, False))
226 if not (len(canon
) == 1 and canon
[0] == char_int
):
227 canon_fully_decomp
[char_int
] = canon
229 compat
= list(_decompose(char_int
, True))
230 if not (len(compat
) == 1 and compat
[0] == char_int
):
231 compat_fully_decomp
[char_int
] = compat
233 # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
234 # need to store their overlap when they agree. When they don't agree,
235 # store the decomposition in the compatibility table since we'll check
236 # that first when normalizing to NFKD.
237 assert set(canon_fully_decomp
) <= set(compat_fully_decomp
)
239 for ch
in set(canon_fully_decomp
) & set(compat_fully_decomp
):
240 if canon_fully_decomp
[ch
] == compat_fully_decomp
[ch
]:
241 del compat_fully_decomp
[ch
]
243 return canon_fully_decomp
, compat_fully_decomp
245 def _compute_stream_safe_tables(self
):
247 To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
248 we need to be able to know the number of contiguous non-starters *after*
249 applying compatibility decomposition to each character.
251 We can do this incrementally by computing the number of leading and
252 trailing non-starters for each character's compatibility decomposition
253 with the following rules:
255 1) If a character is not affected by compatibility decomposition, look
256 up its canonical combining class to find out if it's a non-starter.
257 2) All Hangul characters are starters, even under decomposition.
258 3) Otherwise, very few decomposing characters have a nonzero count
259 of leading or trailing non-starters, so store these characters
260 with their associated counts in a separate table.
262 leading_nonstarters
= {}
263 trailing_nonstarters
= {}
265 for c
in set(self
.canon_fully_decomp
) |
set(self
.compat_fully_decomp
):
266 decomposed
= self
.compat_fully_decomp
.get(c
) or self
.canon_fully_decomp
[c
]
270 if d
not in self
.combining_classes
:
275 for d
in reversed(decomposed
):
276 if d
not in self
.combining_classes
:
281 leading_nonstarters
[c
] = num_leading
283 trailing_nonstarters
[c
] = num_trailing
285 return leading_nonstarters
, trailing_nonstarters
287 hexify
= lambda c
: '{:04X}'.format(c
)
289 def gen_mph_data(name
, d
, kv_type
, kv_callback
):
290 (salt
, keys
) = minimal_perfect_hash(d
)
291 out
.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name
.upper())
293 out
.write(" 0x{:x},\n".format(s
))
295 out
.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name
.upper(), kv_type
))
297 out
.write(" {},\n".format(kv_callback(k
)))
300 def gen_combining_class(combining_classes
, out
):
301 gen_mph_data('canonical_combining_class', combining_classes
, 'u32',
302 lambda k
: "0x{:X}".format(int(combining_classes
[k
]) |
(k
<< 8)))
304 def gen_composition_table(canon_comp
, out
):
306 for (c1
, c2
), c3
in canon_comp
.items():
307 if c1
< 0x10000 and c2
< 0x10000:
308 table
[(c1
<< 16) | c2
] = c3
309 (salt
, keys
) = minimal_perfect_hash(table
)
310 gen_mph_data('COMPOSITION_TABLE', table
, '(u32, char)',
311 lambda k
: "(0x%s, '\\u{%s}')" % (hexify(k
), hexify(table
[k
])))
313 out
.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
314 out
.write(" match (c1, c2) {\n")
315 for (c1
, c2
), c3
in sorted(canon_comp
.items()):
316 if c1
>= 0x10000 and c2
>= 0x10000:
317 out
.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1
), hexify(c2
), hexify(c3
)))
319 out
.write(" _ => None,\n")
323 def gen_decomposition_tables(canon_decomp
, compat_decomp
, out
):
324 tables
= [(canon_decomp
, 'canonical'), (compat_decomp
, 'compatibility')]
325 for table
, name
in tables
:
326 gen_mph_data(name
+ '_decomposed', table
, "(u32, &'static [char])",
327 lambda k
: "(0x{:x}, &[{}])".format(k
,
328 ", ".join("'\\u{%s}'" % hexify(c
) for c
in table
[k
])))
330 def gen_qc_match(prop_table
, out
):
331 out
.write(" match c {\n")
333 for low
, high
, data
in prop_table
:
334 assert data
in ('N', 'M')
335 result
= "No" if data
== 'N' else "Maybe"
337 out
.write(r
" '\u{%s}'...'\u{%s}' => %s," % (low
, high
, result
))
339 out
.write(r
" '\u{%s}' => %s," % (low
, result
))
342 out
.write(" _ => Yes,\n")
345 def gen_nfc_qc(prop_tables
, out
):
346 out
.write("#[inline]\n")
347 out
.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
348 out
.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
349 gen_qc_match(prop_tables
['NFC_QC'], out
)
352 def gen_nfkc_qc(prop_tables
, out
):
353 out
.write("#[inline]\n")
354 out
.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
355 out
.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
356 gen_qc_match(prop_tables
['NFKC_QC'], out
)
359 def gen_nfd_qc(prop_tables
, out
):
360 out
.write("#[inline]\n")
361 out
.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
362 out
.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
363 gen_qc_match(prop_tables
['NFD_QC'], out
)
366 def gen_nfkd_qc(prop_tables
, out
):
367 out
.write("#[inline]\n")
368 out
.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
369 out
.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
370 gen_qc_match(prop_tables
['NFKD_QC'], out
)
373 def gen_combining_mark(general_category_mark
, out
):
374 gen_mph_data('combining_mark', general_category_mark
, 'u32',
375 lambda k
: '0x{:04x}'.format(k
))
377 def gen_stream_safe(leading
, trailing
, out
):
378 # This could be done as a hash but the table is very small.
379 out
.write("#[inline]\n")
380 out
.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
381 out
.write(" match c {\n")
383 for char
, num_leading
in sorted(leading
.items()):
384 out
.write(" '\\u{%s}' => %d,\n" % (hexify(char
), num_leading
))
386 out
.write(" _ => 0,\n")
391 gen_mph_data('trailing_nonstarters', trailing
, 'u32',
392 lambda k
: "0x{:X}".format(int(trailing
[k
]) |
(k
<< 8)))
394 def gen_tests(tests
, out
):
395 out
.write("""#[derive(Debug)]
396 pub struct NormalizationTest {
397 pub source: &'static str,
398 pub nfc: &'static str,
399 pub nfd: &'static str,
400 pub nfkc: &'static str,
401 pub nfkd: &'static str,
406 out
.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
407 str_literal
= lambda s
: '"%s"' % "".join("\\u{%s}" % c
for c
in s
)
410 out
.write(" NormalizationTest {\n")
411 out
.write(" source: %s,\n" % str_literal(test
.source
))
412 out
.write(" nfc: %s,\n" % str_literal(test
.nfc
))
413 out
.write(" nfd: %s,\n" % str_literal(test
.nfd
))
414 out
.write(" nfkc: %s,\n" % str_literal(test
.nfkc
))
415 out
.write(" nfkd: %s,\n" % str_literal(test
.nfkd
))
420 # Guaranteed to be less than n.
421 def my_hash(x
, salt
, n
):
422 # This is hash based on the theory that multiplication is efficient
424 y
= ((x
+ salt
) * 2654435769) & mask_32
425 y ^
= (x
* 0x31415926) & mask_32
428 # Compute minimal perfect hash function, d can be either a dict or list of keys.
429 def minimal_perfect_hash(d
):
431 buckets
= dict((h
, []) for h
in range(n
))
433 h
= my_hash(key
, 0, n
)
434 buckets
[h
].append(key
)
435 bsorted
= [(len(buckets
[h
]), h
) for h
in range(n
)]
436 bsorted
.sort(reverse
= True)
437 claimed
= [False] * n
440 for (bucket_size
, h
) in bsorted
:
441 # Note: the traditional perfect hashing approach would also special-case
442 # bucket_size == 1 here and assign any empty slot, rather than iterating
443 # until rehash finds an empty slot. But we're not doing that so we can
448 for salt
in range(1, 32768):
449 rehashes
= [my_hash(key
, salt
, n
) for key
in buckets
[h
]]
450 # Make sure there are no rehash collisions within this bucket.
451 if all(not claimed
[hash] for hash in rehashes
):
452 if len(set(rehashes
)) < bucket_size
:
455 for key
in buckets
[h
]:
456 rehash
= my_hash(key
, salt
, n
)
457 claimed
[rehash
] = True
461 print("minimal perfect hashing failed")
462 # Note: if this happens (because of unfortunate data), then there are
463 # a few things that could be done. First, the hash function could be
464 # tweaked. Second, the bucket order could be scrambled (especially the
465 # singletons). Right now, the buckets are sorted, which has the advantage
466 # of being deterministic.
468 # As a more extreme approach, the singleton bucket optimization could be
469 # applied (give the direct address for singleton buckets, rather than
470 # relying on a rehash). That is definitely the more standard approach in
471 # the minimal perfect hashing literature, but in testing the branch was a
472 # significant slowdown.
476 if __name__
== '__main__':
478 with
open("tables.rs", "w", newline
= "\n") as out
:
480 out
.write("use crate::quick_check::IsNormalized;\n")
481 out
.write("use crate::quick_check::IsNormalized::*;\n")
484 version
= "(%s, %s, %s)" % tuple(UNICODE_VERSION
.split("."))
485 out
.write("#[allow(unused)]\n")
486 out
.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version
)
488 gen_combining_class(data
.combining_classes
, out
)
491 gen_composition_table(data
.canon_comp
, out
)
494 gen_decomposition_tables(data
.canon_fully_decomp
, data
.compat_fully_decomp
, out
)
496 gen_combining_mark(data
.general_category_mark
, out
)
499 gen_nfc_qc(data
.norm_props
, out
)
502 gen_nfkc_qc(data
.norm_props
, out
)
505 gen_nfd_qc(data
.norm_props
, out
)
508 gen_nfkd_qc(data
.norm_props
, out
)
511 gen_stream_safe(data
.ss_leading
, data
.ss_trailing
, out
)
514 with
open("normalization_tests.rs", "w", newline
= "\n") as out
:
516 gen_tests(data
.norm_tests
, out
)