]> git.proxmox.com Git - rustc.git/blob - vendor/idna/src/make_uts46_mapping_table.py
New upstream version 1.52.0~beta.3+dfsg1
[rustc.git] / vendor / idna / src / make_uts46_mapping_table.py
1 # Copyright 2013-2014 The rust-url developers.
2 #
3 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 # option. This file may not be copied, modified, or distributed
7 # except according to those terms.
8
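# Run as: python make_uts46_mapping_table.py > uts46_mapping_table.rs
# The script reads "IdnaMappingTable.txt" from the current directory; it does
# not take the table path as a command-line argument.
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt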
12
13 import collections
14 import itertools
15
# Emit the license banner of the generated Rust file.  The "Generated by"
# line names this script so readers of the output can find its generator.
print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_uts46_mapping_table.py
''')
27
# Read every line up front so the file handle is closed right away; the
# parsing loop below only needs to iterate over the lines.  The encoding is
# given explicitly so the result does not depend on the locale default.
with open("IdnaMappingTable.txt", encoding="utf-8") as table_file:
    txt = table_file.readlines()
29
def escape_char(c):
    """Return the Rust `\\u{...}` escape for the first character of *c*."""
    codepoint = ord(c[0])
    return "\\u{" + ("%x" % codepoint) + "}"
32
def char(s):
    """Convert a hexadecimal code point string such as "0041" to its character."""
    codepoint = int(s, 16)
    return chr(codepoint)
35
# Interned strings, in first-seen order, mapped to their (byte offset,
# byte length) within the concatenated string table.
strtab = collections.OrderedDict()
# Byte offset at which the next new string will be placed.
strtab_offset = 0

def strtab_slice(s):
    """Intern *s* in the string table and return its (offset, UTF-8 length).

    A string seen before returns the slice recorded the first time; a new
    string is appended at the current end of the table.
    """
    global strtab_offset

    known = strtab.get(s)
    if known is not None:
        return known

    byte_len = len(s.encode('utf8'))
    entry = (strtab_offset, byte_len)
    strtab[s] = entry
    strtab_offset = strtab_offset + byte_len
    return entry
50
def rust_slice(s):
    """Render a (byte offset, byte length) pair as a `StringTableSlice` literal.

    The offset is split into low and high bytes; the length and the high
    byte must each fit in a single byte.
    """
    start, length = s[0], s[1]
    start_hi = start >> 8
    start_lo = start & 0xff
    assert length <= 255
    assert start_hi <= 255
    template = "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })"
    return template % (start_lo, start_hi, length)
59
# Parsed table rows as (first, last, mapping, unicode_str) tuples, where
# first/last are hexadecimal code point strings.
ranges = []

for raw_line in txt:
    # Everything from '#' onwards is a comment; skip lines with nothing else.
    line = raw_line.partition('#')[0]
    if not line.strip():
        continue

    fields = line.split(';')
    codepoints = fields[0].strip()
    if codepoints == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.

    # A lone code point is treated as a range of length one.
    first, _, last = codepoints.partition('..')
    last = last or first

    # e.g. "disallowed_STD3_valid" -> "DisallowedStd3Valid"
    status_words = fields[1].strip().replace('_', ' ')
    mapping = status_words.title().replace(' ', '')

    unicode_str = None
    if len(fields) > 2:
        target = fields[2].strip()
        if target:
            unicode_str = u''.join(char(c) for c in target.split(' '))
        elif mapping == "Deviation":
            # A deviation with no target maps to the empty string.
            unicode_str = u''

    if len(fields) > 3:
        # A fourth field is only expected on valid rows; such rows are
        # emitted as DisallowedIdna2008.
        assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
        assert mapping == 'Valid', mapping
        mapping = 'DisallowedIdna2008'

    ranges.append((first, last, mapping, unicode_str))
88
def mergeable_key(r):
    """Return the grouping key that decides whether adjacent rows may merge.

    Rows whose mapping carries associated data use the whole row as the key,
    so they only group with identical rows; every other row is keyed by its
    mapping name alone.
    """
    kind = r[2]
    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
        return kind
    # These types have associated data, so we should not merge them.
    return r
97
# Collapse runs of adjacent rows that share a data-free mapping into a single
# row spanning the whole run.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (_key, members) in grouped_ranges:
    group = list(members)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # The merged row keeps only the first member's mapping data, so assert
    # that nothing in the group has an associated unicode string that would
    # be lost.
    for member in group:
        assert member[3] is None, member
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
126
def is_single_char_range(r):
    """Return True when the range tuple *r* starts and ends on the same code point."""
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield lists of ranges, grouping each run of single-character ranges.

    Consecutive single-character ranges are collected into one list; a range
    spanning several characters is always yielded as a list of its own.
    Nothing is yielded for empty input.
    """
    current = []
    for r in ranges:
        # A block can only grow while both its last member and the incoming
        # range are single characters; otherwise flush it and start afresh.
        if current and not (is_single_char_range(current[-1]) and is_single_char_range(r)):
            yield current
            current = []
        current.append(r)
    if current:
        yield current
150
# Regroup the rows into blocks: each run of single-character ranges becomes
# one block, every other range is a block of its own.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))

# Flag bit set on the index of any block that holds exactly one row.
SINGLE_MARKER = 1 << 15

print("static TABLE: &[(char, u16)] = &[")

# Running position of each block's first row in MAPPING_TABLE.
offset = 0
for block in optimized_ranges:
    # The offset must stay clear of the flag bit.
    assert offset < SINGLE_MARKER

    if len(block) == 1:
        index = offset | SINGLE_MARKER
    else:
        index = offset
    offset += len(block)

    first_codepoint = block[0][0]
    start = escape_char(char(first_codepoint))
    print(" ('%s', %s)," % (start, index))

print("];\n")
170
print("static MAPPING_TABLE: &[Mapping] = &[")

# One entry per row, in block order.
for block in optimized_ranges:
    for (_first, _last, variant, text) in block:
        if text is None:
            entry = variant
        else:
            # Rows with a mapped string reference it via a string table slice.
            entry = variant + rust_slice(strtab_slice(text))
        print(" %s," % entry)

print("];\n")
180
def escape_str(s):
    """Return a list with the Rust escape for every character of *s*."""
    return list(map(escape_char, s))

# Emit every interned string, in insertion order, as one Rust string literal
# with each escaped character on a continuation line of its own.
escaped_chars = itertools.chain.from_iterable(escape_str(s) for s in strtab)
string_table_body = '\\\n '.join(escaped_chars)
print("static STRING_TABLE: &str = \"%s\";" % string_table_body)