vendor/idna/src/make_uts46_mapping_table.py

   1 # Copyright 2013-2014 The rust-url developers.
   2 #
   3 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   4 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   5 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   6 # option. This file may not be copied, modified, or distributed
   7 # except according to those terms.
   8
   9 # Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
  10 # You can get the latest idna table from
  11 # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
  12
import collections
import itertools
import sys
  15
  16 print('''\
  17 // Copyright 2013-2020 The rust-url developers.
  18 //
  19 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  20 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  21 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  22 // option. This file may not be copied, modified, or distributed
  23 // except according to those terms.
  24
  25 // Generated by make_idna_table.py
  26 ''')
  27
  28 txt = open("IdnaMappingTable.txt")
  29
  30 def escape_char(c):
  31     return "\\u{%x}" % ord(c[0])
  32
  33 def char(s):
  34     return chr(int(s, 16))
  35
  36 strtab = collections.OrderedDict()
  37 strtab_offset = 0
  38
  39 def strtab_slice(s):
  40     global strtab, strtab_offset
  41
  42     if s in strtab:
  43         return strtab[s]
  44     else:
  45         utf8_len = len(s.encode('utf8'))
  46         c = (strtab_offset, utf8_len)
  47         strtab[s] = c
  48         strtab_offset += utf8_len
  49         return c
  50
  51 def rust_slice(s):
  52     start = s[0]
  53     length = s[1]
  54     start_lo = start & 0xff
  55     start_hi = start >> 8
  56     assert length <= 255
  57     assert start_hi <= 255
  58     return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)
  59
  60 ranges = []
  61
  62 for line in txt:
  63     # remove comments
  64     line, _, _ = line.partition('#')
  65     # skip empty lines
  66     if len(line.strip()) == 0:
  67         continue
  68     fields = line.split(';')
  69     if fields[0].strip() == 'D800..DFFF':
  70         continue  # Surrogates don't occur in Rust strings.
  71     first, _, last = fields[0].strip().partition('..')
  72     if not last:
  73         last = first
  74     mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
  75     unicode_str = None
  76     if len(fields) > 2:
  77         if fields[2].strip():
  78             unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
  79         elif mapping == "Deviation":
  80             unicode_str = u''
  81
  82     if len(fields) > 3:
  83         assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
  84         assert mapping == 'Valid', mapping
  85         mapping = 'DisallowedIdna2008'
  86
  87     ranges.append((first, last, mapping, unicode_str))
  88
  89 def mergeable_key(r):
  90     mapping = r[2]
  91
  92     # These types have associated data, so we should not merge them.
  93     if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
  94         return r
  95     assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
  96     return mapping
  97
  98 grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
  99
 100 optimized_ranges = []
 101
 102 for (k, g) in grouped_ranges:
 103     group = list(g)
 104     if len(group) == 1:
 105         optimized_ranges.append(group[0])
 106         continue
 107     # Assert that nothing in the group has an associated unicode string.
 108     for g in group:
 109         if g[3] is not None and len(g[3]) > 2:
 110             assert not g[3][2].strip()
 111     # Assert that consecutive members of the group don't leave gaps in
 112     # the codepoint space.
 113     a, b = itertools.tee(group)
 114     next(b, None)
 115     for (g1, g2) in zip(a, b):
 116         last_char = int(g1[1], 16)
 117         next_char = int(g2[0], 16)
 118         if last_char + 1 == next_char:
 119             continue
 120         # There's a gap where surrogates would appear, but we don't have to
 121         # worry about that gap, as surrogates never appear in Rust strings.
 122         # Assert we're seeing the surrogate case here.
 123         assert last_char == 0xd7ff
 124         assert next_char == 0xe000
 125     optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
 126
 127 def is_single_char_range(r):
 128     (first, last, _, _) = r
 129     return first == last
 130
 131 # We can reduce the size of the character range table and the index table to about 1/4
 132 # by merging runs of single character ranges and using character offsets from the start
 133 # of that range to retrieve the correct `Mapping` value
 134 def merge_single_char_ranges(ranges):
 135     current = []
 136     for r in ranges:
 137         if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
 138             current.append(r)
 139             continue
 140         if len(current) != 0:
 141             ret = current
 142             current = [r]
 143             yield ret
 144             continue
 145         current.append(r)
 146         ret = current
 147         current = []
 148         yield ret
 149     yield current
 150
 151 optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
 152
 153 SINGLE_MARKER = 1 << 15
 154
 155 print("static TABLE: &[(char, u16)] = &[")
 156
 157 offset = 0
 158 for ranges in optimized_ranges:
 159     assert offset < SINGLE_MARKER
 160
 161     block_len = len(ranges)
 162     single = SINGLE_MARKER if block_len == 1 else 0
 163     index = offset | single
 164     offset += block_len
 165
 166     start = escape_char(char(ranges[0][0]))
 167     print("    ('%s', %s)," % (start, index))
 168
 169 print("];\n")
 170
 171 print("static MAPPING_TABLE: &[Mapping] = &[")
 172
 173 for ranges in optimized_ranges:
 174     for (first, last, mapping, unicode_str) in ranges:
 175         if unicode_str is not None:
 176             mapping += rust_slice(strtab_slice(unicode_str))
 177         print("    %s," % mapping)
 178
 179 print("];\n")
 180
 181 def escape_str(s):
 182     return [escape_char(c) for c in s]
 183
 184 print("static STRING_TABLE: &str = \"%s\";"
 185       % '\\\n  '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()])))