vendor/idna-0.1.5/src/make_uts46_mapping_table.py

   1 # Copyright 2013-2014 The rust-url developers.
   2 #
   3 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   4 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   5 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   6 # option. This file may not be copied, modified, or distributed
   7 # except according to those terms.
   8
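# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# Note: the input path is hard-coded; the script always reads
# IdnaMappingTable.txt from the current working directory, regardless of the
# command-line argument.
# You can get the latest IDNA table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt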
  12
  13 from __future__ import print_function
  14 import collections
  15 import itertools
  16
  17 print('''\
  18 // Copyright 2013-2014 The rust-url developers.
  19 //
  20 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  21 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  22 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  23 // option. This file may not be copied, modified, or distributed
  24 // except according to those terms.
  25
  26 // Generated by make_idna_table.py
  27 ''')
  28
  29 txt = open("IdnaMappingTable.txt")
  30
  31 def escape_char(c):
  32     return "\\u{%x}" % ord(c[0])
  33
  34 def char(s):
  35     return unichr(int(s, 16))
  36
  37 strtab = collections.OrderedDict()
  38 strtab_offset = 0
  39
  40 def strtab_slice(s):
  41     global strtab, strtab_offset
  42
  43     if s in strtab:
  44         return strtab[s]
  45     else:
  46         utf8_len = len(s.encode('utf8'))
  47         c = (strtab_offset, utf8_len)
  48         strtab[s] = c
  49         strtab_offset += utf8_len
  50         return c
  51
  52 def rust_slice(s):
  53     start = s[0]
  54     length = s[1]
  55     start_lo = start & 0xff
  56     start_hi = start >> 8
  57     assert length <= 255
  58     assert start_hi <= 255
  59     return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)
  60
  61 ranges = []
  62
  63 for line in txt:
  64     # remove comments
  65     line, _, _ = line.partition('#')
  66     # skip empty lines
  67     if len(line.strip()) == 0:
  68         continue
  69     fields = line.split(';')
  70     if fields[0].strip() == 'D800..DFFF':
  71         continue  # Surrogates don't occur in Rust strings.
  72     first, _, last = fields[0].strip().partition('..')
  73     if not last:
  74         last = first
  75     mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
  76     unicode_str = None
  77     if len(fields) > 2:
  78         if fields[2].strip():
  79             unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
  80         elif mapping == "Deviation":
  81             unicode_str = u''
  82     ranges.append((first, last, mapping, unicode_str))
  83
  84 def mergeable_key(r):
  85     mapping = r[2]
  86
  87     # These types have associated data, so we should not merge them.
  88     if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
  89         return r
  90     assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
  91     return mapping
  92
  93 grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
  94
  95 optimized_ranges = []
  96
  97 for (k, g) in grouped_ranges:
  98     group = list(g)
  99     if len(group) == 1:
 100         optimized_ranges.append(group[0])
 101         continue
 102     # Assert that nothing in the group has an associated unicode string.
 103     for g in group:
 104         if g[3] is not None and len(g[3]) > 2:
 105             assert not g[3][2].strip()
 106     # Assert that consecutive members of the group don't leave gaps in
 107     # the codepoint space.
 108     a, b = itertools.tee(group)
 109     next(b, None)
 110     for (g1, g2) in itertools.izip(a, b):
 111         last_char = int(g1[1], 16)
 112         next_char = int(g2[0], 16)
 113         if last_char + 1 == next_char:
 114             continue
 115         # There's a gap where surrogates would appear, but we don't have to
 116         # worry about that gap, as surrogates never appear in Rust strings.
 117         # Assert we're seeing the surrogate case here.
 118         assert last_char == 0xd7ff
 119         assert next_char == 0xe000
 120     first = group[0][0]
 121     last = group[-1][1]
 122     mapping = group[0][2]
 123     unicode_str = group[0][3]
 124     optimized_ranges.append((first, last, mapping, unicode_str))
 125
 126 def is_single_char_range(r):
 127     (first, last, _, _) = r
 128     return first == last
 129
 130 # We can reduce the size of the character range table and the index table to about 1/4
 131 # by merging runs of single character ranges and using character offsets from the start
 132 # of that range to retrieve the correct `Mapping` value
 133 def merge_single_char_ranges(ranges):
 134     current = []
 135     for r in ranges:
 136         if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
 137             current.append(r)
 138             continue
 139         if len(current) != 0:
 140             ret = current
 141             current = [r]
 142             yield ret
 143             continue
 144         current.append(r)
 145         ret = current
 146         current = []
 147         yield ret
 148     yield current
 149
 150 optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
 151
 152
 153 print("static TABLE: &'static [Range] = &[")
 154
 155 for ranges in optimized_ranges:
 156     first = ranges[0][0]
 157     last = ranges[-1][1]
 158     print("    Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
 159                                                             escape_char(char(last))))
 160
 161 print("];\n")
 162
 163 print("static INDEX_TABLE: &'static [u16] = &[")
 164
 165 SINGLE_MARKER = 1 << 15
 166
 167 offset = 0
 168 for ranges in optimized_ranges:
 169     assert offset < SINGLE_MARKER
 170
 171     block_len = len(ranges)
 172     single = SINGLE_MARKER if block_len == 1 else 0
 173     print("    %s," % (offset | single))
 174     offset += block_len
 175
 176 print("];\n")
 177
 178 print("static MAPPING_TABLE: &'static [Mapping] = &[")
 179
 180 for ranges in optimized_ranges:
 181     for (first, last, mapping, unicode_str) in ranges:
 182         if unicode_str is not None:
 183             mapping += rust_slice(strtab_slice(unicode_str))
 184         print("    %s," % mapping)
 185
 186 print("];\n")
 187
 188 def escape_str(s):
 189     return [escape_char(c) for c in s]
 190
 191 print("static STRING_TABLE: &'static str = \"%s\";"
 192       % '\\\n  '.join(itertools.chain(*[escape_str(s) for s in strtab.iterkeys()])))