]> git.proxmox.com Git - rustc.git/blob - vendor/idna-0.1.5/src/make_uts46_mapping_table.py
New upstream version 1.41.1+dfsg1
[rustc.git] / vendor / idna-0.1.5 / src / make_uts46_mapping_table.py
1 # Copyright 2013-2014 The rust-url developers.
2 #
3 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 # option. This file may not be copied, modified, or distributed
7 # except according to those terms.
8
9 # Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
10 # You can get the latest idna table from
11 # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
12
from __future__ import print_function
import collections
import itertools
import sys
16
# Emit the license header and provenance note that open the generated Rust
# file.  The provenance line names this script (make_uts46_mapping_table.py)
# so that readers of the generated file can find its generator.
print('''\
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_uts46_mapping_table.py
''')
28
# Load the IDNA mapping table.  The path may be passed as the first
# command-line argument (matching the "Run as" note at the top of this
# script); without one, IdnaMappingTable.txt in the current directory is
# used.  All lines are read eagerly so the file handle is closed right away;
# `txt` remains an iterable of lines for the parsing loop.
_table_path = sys.argv[1] if len(sys.argv) > 1 else "IdnaMappingTable.txt"
with open(_table_path) as _table_file:
    txt = _table_file.readlines()
30
def escape_char(c):
    """Return the Rust `\\u{...}` escape for the first character of *c*.

    The code point is written in lowercase hexadecimal without padding.
    """
    code_point = ord(c[0])
    return "\\u{%x}" % code_point
33
def char(s):
    """Return the one-character string for the hexadecimal code point *s*.

    *s* is a string of hex digits such as "00DF".  Works on both Python 2
    and Python 3.
    """
    code_point = int(s, 16)
    try:
        # Python 2: chr() only covers 0-255, so unichr() is required there.
        return unichr(code_point)
    except NameError:
        # Python 3 has no unichr(); chr() accepts any Unicode code point.
        return chr(code_point)
36
# String table under construction: maps each distinct string to its
# (byte offset, UTF-8 byte length) pair, in order of first insertion.
strtab = collections.OrderedDict()
# Sum of the UTF-8 lengths of every string added so far; this is the offset
# the next new string will receive.
strtab_offset = 0


def strtab_slice(s):
    """Return the (offset, length) slice of *s* in the string table.

    A string seen before gets its previously assigned slice back.  A new
    string is appended: it is recorded at the current end offset with its
    UTF-8 encoded length, and the end offset advances by that length.
    """
    global strtab_offset

    try:
        return strtab[s]
    except KeyError:
        pass

    byte_len = len(s.encode('utf8'))
    entry = (strtab_offset, byte_len)
    strtab[s] = entry
    strtab_offset += byte_len
    return entry
51
def rust_slice(s):
    """Format an (offset, length) pair as a Rust `StringTableSlice` literal.

    The offset is split into a low byte and a high byte.  Both the high byte
    and the length must fit in 8 bits; an AssertionError is raised otherwise.
    The result is wrapped in parentheses so it can be appended directly to a
    mapping variant name.
    """
    offset = s[0]
    byte_len = s[1]
    hi, lo = offset >> 8, offset & 0xff
    assert byte_len <= 255
    assert hi <= 255
    template = "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })"
    return template % (lo, hi, byte_len)
60
# Parse the mapping table into a list of
# (first, last, mapping, unicode_str) tuples, one per data line.  `first`
# and `last` stay as the hexadecimal strings found in the table.
ranges = []

for raw_line in txt:
    # Everything from '#' onwards is a comment.
    content = raw_line.partition('#')[0]
    if not content.strip():
        continue  # blank or comment-only line
    fields = content.split(';')
    code_points = fields[0].strip()
    if code_points == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    first, _, last = code_points.partition('..')
    if not last:
        # A single code point is stored as a range of length one.
        last = first
    # Turn the status into a CamelCase name,
    # e.g. "disallowed_STD3_mapped" -> "DisallowedStd3Mapped".
    status_words = fields[1].strip().replace('_', ' ').title()
    mapping = status_words.replace(' ', '')
    unicode_str = None
    if len(fields) > 2:
        target = fields[2].strip()
        if target:
            # Space-separated hex code points making up the mapped string.
            unicode_str = u''.join(char(cp) for cp in target.split(' '))
        elif mapping == "Deviation":
            # A deviation with an empty target gets an empty string, not None.
            unicode_str = u''
    ranges.append((first, last, mapping, unicode_str))
83
def mergeable_key(r):
    """Return the grouping key used to merge adjacent ranges.

    Ranges whose mapping carries associated data are keyed by the whole
    tuple, so they are not merged with their neighbours.  All other ranges
    are keyed by the mapping name alone.  An unrecognised mapping name
    raises an AssertionError.
    """
    kind = r[2]

    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
        return kind

    # These types have associated data, so we should not merge them.
    return r
92
# Collapse runs of adjacent ranges that share the same data-free mapping
# into a single range.  groupby only groups neighbouring items, so the
# table order is preserved.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string:
    # the merged range below keeps only the first member's (absent) string.
    for member in group:
        assert member[3] is None
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.  The builtin zip is used so that this runs on
    # both Python 2 and Python 3.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
125
def is_single_char_range(r):
    """Return True when the range tuple *r* starts and ends on the same value.

    *r* must be a 4-tuple; only its first two items are compared.
    """
    start, end, _mapping, _text = r
    return start == end
129
# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield lists of ranges, grouping runs of single-character ranges.

    Consecutive single-character ranges are collected into one list.  Every
    multi-character range is yielded as a list containing only itself.
    Nothing is yielded for empty input, so consumers never see an empty
    group.
    """
    current = []
    for r in ranges:
        # Close the pending group unless both its last member and `r` are
        # single-character ranges that can share a group.
        if current and not (is_single_char_range(current[-1])
                            and is_single_char_range(r)):
            yield current
            current = []
        current.append(r)
    if current:
        yield current
149
# Regroup the ranges into blocks: each block is a list of ranges as produced
# by merge_single_char_ranges.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


# Emit one `Range` per block, spanning from the start of the block's first
# range to the end of its last range.
print("static TABLE: &'static [Range] = &[")

for block in optimized_ranges:
    range_from = escape_char(char(block[0][0]))
    range_to = escape_char(char(block[-1][1]))
    print(" Range { from: '%s', to: '%s', }," % (range_from, range_to))

print("];\n")
162
# Emit one u16 per block: the running count of ranges in all preceding
# blocks, with the top bit set when the block holds exactly one range.
print("static INDEX_TABLE: &'static [u16] = &[")

SINGLE_MARKER = 1 << 15

offset = 0
for block in optimized_ranges:
    # The offset must leave the marker bit free.
    assert offset < SINGLE_MARKER

    block_len = len(block)
    if block_len == 1:
        entry = offset | SINGLE_MARKER
    else:
        entry = offset
    print(" %s," % entry)
    offset += block_len

print("];\n")
177
# Emit one `Mapping` per range, block by block.  A range that carries a
# string gets the string-table slice of that string appended to its variant.
print("static MAPPING_TABLE: &'static [Mapping] = &[")

for block in optimized_ranges:
    for (_first, _last, variant, target) in block:
        if target is None:
            print(" %s," % variant)
        else:
            print(" %s," % (variant + rust_slice(strtab_slice(target))))

print("];\n")
187
188 def escape_str(s):
189 return [escape_char(c) for c in s]
190
# Emit the string table as a single Rust string literal: the escaped
# characters of every key in `strtab`, in the dict's iteration order, joined
# by a backslash-newline.  Iterating the dict directly (instead of the
# Python 2 only `iterkeys()`) works on both Python 2 and Python 3.
print("static STRING_TABLE: &'static str = \"%s\";"
      % '\\\n '.join(itertools.chain.from_iterable(escape_str(s) for s in strtab)))