]>
git.proxmox.com Git - rustc.git/blob - vendor/idna-0.1.5/src/make_uts46_mapping_table.py
1 # Copyright 2013-2014 The rust-url developers.
3 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 # option. This file may not be copied, modified, or distributed
7 # except according to those terms.
9 # Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
10 # You can get the latest idna table from
11 # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
13 from __future__
import print_function
18 // Copyright 2013-2014 The rust-url developers.
20 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
21 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
22 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
23 // option. This file may not be copied, modified, or distributed
24 // except according to those terms.
26 // Generated by make_idna_table.py
29 txt
= open("IdnaMappingTable.txt")
32 return "\\u{%x}" % ord(c
[0])
35 return unichr(int(s
, 16))
37 strtab
= collections
.OrderedDict()
41 global strtab
, strtab_offset
46 utf8_len
= len(s
.encode('utf8'))
47 c
= (strtab_offset
, utf8_len
)
49 strtab_offset
+= utf8_len
55 start_lo
= start
& 0xff
58 assert start_hi
<= 255
59 return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo
, start_hi
, length
)
65 line
, _
, _
= line
.partition('#')
67 if len(line
.strip()) == 0:
69 fields
= line
.split(';')
70 if fields
[0].strip() == 'D800..DFFF':
71 continue # Surrogates don't occur in Rust strings.
72 first
, _
, last
= fields
[0].strip().partition('..')
75 mapping
= fields
[1].strip().replace('_', ' ').title().replace(' ', '')
79 unicode_str
= u
''.join(char(c
) for c
in fields
[2].strip().split(' '))
80 elif mapping
== "Deviation":
82 ranges
.append((first
, last
, mapping
, unicode_str
))
87 # These types have associated data, so we should not merge them.
88 if mapping
in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
90 assert mapping
in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
93 grouped_ranges
= itertools
.groupby(ranges
, key
=mergeable_key
)
97 for (k
, g
) in grouped_ranges
:
100 optimized_ranges
.append(group
[0])
102 # Assert that nothing in the group has an associated unicode string.
104 if g
[3] is not None and len(g
[3]) > 2:
105 assert not g
[3][2].strip()
106 # Assert that consecutive members of the group don't leave gaps in
107 # the codepoint space.
108 a
, b
= itertools
.tee(group
)
110 for (g1
, g2
) in itertools
.izip(a
, b
):
111 last_char
= int(g1
[1], 16)
112 next_char
= int(g2
[0], 16)
113 if last_char
+ 1 == next_char
:
115 # There's a gap where surrogates would appear, but we don't have to
116 # worry about that gap, as surrogates never appear in Rust strings.
117 # Assert we're seeing the surrogate case here.
118 assert last_char
== 0xd7ff
119 assert next_char
== 0xe000
122 mapping
= group
[0][2]
123 unicode_str
= group
[0][3]
124 optimized_ranges
.append((first
, last
, mapping
, unicode_str
))
126 def is_single_char_range(r
):
127 (first
, last
, _
, _
) = r
130 # We can reduce the size of the character range table and the index table to about 1/4
131 # by merging runs of single character ranges and using character offsets from the start
132 # of that range to retrieve the correct `Mapping` value
133 def merge_single_char_ranges(ranges
):
136 if not current
or is_single_char_range(current
[-1]) and is_single_char_range(r
):
139 if len(current
) != 0:
150 optimized_ranges
= list(merge_single_char_ranges(optimized_ranges
))
153 print("static TABLE: &'static [Range] = &[")
155 for ranges
in optimized_ranges
:
158 print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first
)),
159 escape_char(char(last
))))
163 print("static INDEX_TABLE: &'static [u16] = &[")
165 SINGLE_MARKER
= 1 << 15
168 for ranges
in optimized_ranges
:
169 assert offset
< SINGLE_MARKER
171 block_len
= len(ranges
)
172 single
= SINGLE_MARKER
if block_len
== 1 else 0
173 print(" %s," % (offset | single
))
178 print("static MAPPING_TABLE: &'static [Mapping] = &[")
180 for ranges
in optimized_ranges
:
181 for (first
, last
, mapping
, unicode_str
) in ranges
:
182 if unicode_str
is not None:
183 mapping
+= rust_slice(strtab_slice(unicode_str
))
184 print(" %s," % mapping
)
189 return [escape_char(c
) for c
in s
]
191 print("static STRING_TABLE: &'static str = \"%s\";"
192 % '\\\n '.join(itertools
.chain(*[escape_str(s
) for s
in strtab
.iterkeys()])))