#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we keep this script
# out-of-line and check the generated tables.rs and normalization_tests.rs
# files into git.
import collections
import sys
import urllib.request

UNICODE_VERSION = "9.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. For these precomputed tables, we then note
        that 1) every character with a canonical decomposition also has a
        compatibility decomposition, so the canonical table's keys are a
        subset of the compatibility table's keys, and 2) the two tables
        mostly agree on their intersection. Therefore, we don't store
        entries in the compatibility table for characters that decompose
        the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code-size cost.
        """
        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
        S_COUNT = L_COUNT * V_COUNT * T_COUNT

        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

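# Both _compute_fully_decomposed and _compute_stream_safe_tables skip the Hangul
# syllable block (S_BASE..S_BASE+S_COUNT) because its decompositions are derived
# arithmetically rather than stored in tables. The function below is a minimal
# sketch of that arithmetic from Unicode 9.0.0 Section 3.12; the L/V/T base
# constants come from that section rather than from elsewhere in this script.
# It is not called by the generator -- the real implementation lives on the
# Rust side of the crate.
def _hangul_decompose_sketch(s):
    S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
    L_COUNT, V_COUNT, T_COUNT = 19, 21, 28
    N_COUNT = V_COUNT * T_COUNT
    s_index = s - S_BASE
    assert 0 <= s_index < L_COUNT * N_COUNT, "not a precomposed Hangul syllable"
    l_jamo = L_BASE + s_index // N_COUNT                # leading consonant jamo
    v_jamo = V_BASE + (s_index % N_COUNT) // T_COUNT    # vowel jamo
    t_jamo = T_BASE + s_index % T_COUNT                 # trailing consonant jamo, if any
    # An LV syllable (t_jamo == T_BASE) decomposes to two jamo, an LVT syllable to three.
    return [l_jamo, v_jamo] if t_jamo == T_BASE else [l_jamo, v_jamo, t_jamo]
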
hexify = lambda c: '{:04X}'.format(c)

# Note: unlike the other gen_* helpers, this takes no `out` parameter; it writes
# through the module-level `out` handle opened in the __main__ block below.
def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

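# gen_combining_class above (and the trailing-nonstarter table in gen_stream_safe
# below) packs each table entry as `value | (codepoint << 8)`, so a single u32
# carries both the key and its payload. The helper below is only an illustrative
# sketch of that encoding; the actual decoding is done by the crate's Rust code.
def _unpack_packed_entry_sketch(entry):
    codepoint = entry >> 8   # the character the entry describes
    value = entry & 0xFF     # its combining class (or trailing-nonstarter count)
    return codepoint, value
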
def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

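# For the composition table, gen_composition_table packs each BMP pair into a single
# u32 key as `(c1 << 16) | c2`; pairs of astral (non-BMP) characters are emitted into
# the generated composition_table_astral match instead. A hypothetical lookup of a
# BMP pair would form its key as below (a sketch only; the real lookup lives in the
# crate's Rust code).
def _composition_key_sketch(c1, c2):
    assert c1 < 0x10000 and c2 < 0x10000, "astral pairs go through composition_table_astral"
    return (c1 << 16) | c2
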
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

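# The qc_nfc/qc_nfkc/qc_nfd/qc_nfkd functions generated above expose the per-character
# Quick_Check property (Yes/No/Maybe) from DerivedNormalizationProps.txt. The sketch
# below shows roughly how UAX #15 describes combining such a property with the
# canonical combining class to quick-check a string; `qc` and `combining_class` are
# hypothetical stand-ins, and the crate's actual logic lives in its quick_check module.
def _quick_check_sketch(text, qc, combining_class):
    result = "Yes"
    last_ccc = 0
    for ch in text:
        ccc = combining_class(ch)
        if ccc != 0 and last_ccc > ccc:
            return "No"          # combining marks out of canonical order
        check = qc(ch)           # "Yes", "No", or "Maybe"
        if check == "No":
            return "No"
        if check == "Maybe":
            result = "Maybe"     # only a full normalization pass can decide
        last_ccc = ccc
    return result
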
def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

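# _compute_stream_safe_tables explains that these leading/trailing counts exist to
# drive the Stream-Safe Text Process (UAX15-D4): a run of more than 30 consecutive
# non-starters must be broken up by inserting U+034F COMBINING GRAPHEME JOINER.
# The sketch below is one illustrative reading of that process; it assumes "complete"
# leading/trailing maps and a decomposed_len callback (the generated table is sparse),
# and the crate's real implementation is on the Rust side.
def _stream_safe_sketch(chars, leading_nonstarters, trailing_nonstarters, decomposed_len):
    CGJ = 0x034F              # COMBINING GRAPHEME JOINER
    MAX_NONSTARTERS = 30
    pending = 0               # non-starters currently at the end of the output
    out_chars = []
    for c in chars:
        num_leading = leading_nonstarters.get(c, 0)
        if pending + num_leading > MAX_NONSTARTERS:
            out_chars.append(CGJ)   # break the run before it grows past the limit
            pending = 0
        # If c decomposes entirely into non-starters the run keeps growing;
        # otherwise only its trailing non-starters carry over to the next step.
        if num_leading > 0 and num_leading == decomposed_len(c):
            pending += num_leading
        else:
            pending = trailing_nonstarters.get(c, 0)
        out_chars.append(c)
    return out_chars
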
def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to return a value less than n.
def my_hash(x, salt, n):
    # This is a hash based on the theory that multiplication is efficient.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we don't do that here, so the
        # lookup side can avoid that branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
        if salts[h] == 0:
            print("minimal perfect hashing failed")
            # Note: if this happens (because of unfortunate data), then there are
            # a few things that could be done. First, the hash function could be
            # tweaked. Second, the bucket order could be scrambled (especially the
            # singletons). Right now, the buckets are sorted, which has the advantage
            # of being deterministic.
            #
            # As a more extreme approach, the singleton bucket optimization could be
            # applied (give the direct address for singleton buckets, rather than
            # relying on a rehash). That is definitely the more standard approach in
            # the minimal perfect hashing literature, but in testing the branch was a
            # significant slowdown.
            sys.exit(1)
    return (salts, keys)

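# The comments above describe how the tables are built: keys are first bucketed by
# my_hash(key, 0, n), then each bucket gets a displacement salt so that
# my_hash(key, salt, n) lands every key in a distinct slot. A lookup against the two
# tables therefore takes exactly two hashes. The sketch below illustrates this with
# the Python-level (salts, keys) pair returned above; the crate performs the
# equivalent lookup in Rust against the emitted _SALT and _KV arrays, where each KV
# entry also packs its payload.
def _mph_lookup_sketch(key, salts, keys):
    n = len(salts)
    salt = salts[my_hash(key, 0, n)]   # displacement salt assigned to this key's bucket
    idx = my_hash(key, salt, n)        # final slot, guaranteed < n by my_hash
    # A miss shows up as a slot holding some other key, so confirm before trusting it.
    return idx if keys[idx] == key else None
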
if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)