]> git.proxmox.com Git - rustc.git/blob - src/etc/char_private.py
New upstream version 1.26.2+dfsg1
[rustc.git] / src / etc / char_private.py
1 #!/usr/bin/env python
2 #
3 # Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
4 # file at the top-level directory of this distribution and at
5 # http://rust-lang.org/COPYRIGHT.
6 #
7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10 # option. This file may not be copied, modified, or distributed
11 # except according to those terms.
12
13 # This script uses the following Unicode tables:
14 # - UnicodeData.txt
15
16
17 from collections import namedtuple
18 import csv
19 import os
20 import subprocess
21
# Exclusive upper bound of the code point space (one past U+10FFFF).
NUM_CODEPOINTS = 0x110000
23
def to_ranges(iter):
    """Collapse runs of consecutive integers into half-open ranges.

    Yields ``(start, end)`` tuples, where ``end`` is one past the last value
    of the run. A value extends the current run only when it equals the
    run's ``end``; anything else starts a new run. The values 0x10000 and
    0x20000 always start a new run, so no yielded range continues across
    either of those boundaries.
    """
    start = end = None
    for value in iter:
        extends_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if extends_run:
            end += 1
            continue
        # Flush the finished run (if any) before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
35
def get_escaped(codepoints):
    """Yield the values of the code points whose class is in the escaped set.

    Each item of *codepoints* must expose ``value`` and ``class_``. A falsy
    ``class_`` is treated as "Cn". U+0020 (the ASCII space) is never
    yielded, even though its class may be in the set.
    """
    escaped_classes = "Cc Cf Cs Co Cn Zl Zp Zs".split()
    space = ord(' ')
    for cp in codepoints:
        category = cp.class_ or "Cn"
        if category in escaped_classes and cp.value != space:
            yield cp.value
40
def get_file(f):
    """Return an open text-mode file for the last path component of *f*.

    The file is looked up in the current working directory under the
    basename of *f*. If it does not exist, ``curl -O`` is invoked on *f*
    (raising ``subprocess.CalledProcessError`` on a non-zero exit) and the
    open is retried. The caller owns the returned file object.
    """
    local_name = os.path.basename(f)
    try:
        return open(local_name)
    except FileNotFoundError:
        # No local copy yet: fetch it, then open what curl is expected to
        # have written under the same basename.
        subprocess.run(["curl", "-O", f], check=True)
        return open(local_name)
47
# One code point: its integer value and its class string as read from the
# data file (None for code points that have no row of their own).
Codepoint = namedtuple('Codepoint', 'value class_')
49
def get_codepoints(f):
    """Yield a ``Codepoint`` for every code point below ``NUM_CODEPOINTS``.

    *f* is an iterable of lines with semicolon-separated fields: the code
    point in hexadecimal (field 0), its name (field 1) and its class
    (field 2). A row whose name ends in ``First>`` must be followed
    directly by a row whose name ends in ``Last>``; the code points
    strictly between the two are yielded with the class of the ``First>``
    row. Code points that fall in any other gap, before the first row, or
    after the last row are yielded with class ``None``.

    Raises:
        ValueError: if a ``First>`` row is not followed by a ``Last>`` row.
    """
    reader = csv.reader(f, delimiter=";")
    # Start one below U+0000 so the gap before the first row is filled from
    # code point 0 even when the data does not begin there.
    prev_codepoint = -1
    class_first = None
    for row in reader:
        if not row:
            # csv.reader produces an empty list for a blank line.
            continue
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row. Inside a First/Last pair the
        # gap inherits the range's class; otherwise class_first is None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    # Everything after the last row has no class.
    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
78
def compress_singletons(singletons):
    """Split values into run-length-grouped high parts and a flat low-byte list.

    Returns ``(uppers, lowers)``. ``lowers`` holds ``value & 0xff`` for each
    input value, in order. ``uppers`` holds one ``(value >> 8, count)`` pair
    per run of adjacent inputs sharing the same high part, where ``count``
    is how many entries of ``lowers`` belong to that run.
    """
    uppers = []  # (upper, number of entries in lowers)
    lowers = []

    for value in singletons:
        high, low = value >> 8, value & 0xff
        if uppers and uppers[-1][0] == high:
            # Same high part as the previous value: extend the current run.
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))
        lowers.append(low)

    return uppers, lowers
94
def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating gap/length byte lists.

    For each range the result contains one list: the encoded distance from
    the end of the previous range (or from 0) to ``start``, followed by the
    encoded ``count``. Lengths up to 0x7f take one byte; longer lengths take
    two bytes, the first with its top bit set and carrying the high bits,
    the second carrying the low eight bits. Both lengths are asserted to be
    below 0x8000.
    """
    def encode_length(length):
        # One byte for short lengths, two bytes (top bit flagged) otherwise.
        if length > 0x7f:
            return [0x80 | (length >> 8), length & 0xff]
        return [length & 0x7f]

    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]
    cursor = 0
    for start, count in normal:
        gap = start - cursor
        cursor = start + count
        assert gap < 0x8000 and count < 0x8000
        compressed.append(encode_length(gap) + encode_length(count))

    return compressed
122
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two Rust ``const`` slices for a compressed singleton table.

    *uppers* is written as a ``&[(u8, u8)]`` named *uppersname*, one pair
    per line. *lowers* is written as a ``&[u8]`` named *lowersname*, eight
    values per line. Values are formatted as two-digit hex literals, except
    the second element of each pair, which is printed in decimal.
    """
    print("const {}: &'static [(u8, u8)] = &[".format(uppersname))
    for upper, count in uppers:
        print(" ({:#04x}, {}),".format(upper, count))
    print("];")

    print("const {}: &'static [u8] = &[".format(lowersname))
    for offset in range(0, len(lowers), 8):
        chunk = lowers[offset:offset + 8]
        cells = ["{:#04x},".format(low) for low in chunk]
        print(" {}".format(" ".join(cells)))
    print("];")
132
def print_normal(normal, normalname):
    """Print a Rust ``const`` ``&[u8]`` slice named *normalname*.

    *normal* is a list of byte lists; each inner list is written on its own
    line as two-digit hex literals.
    """
    print("const {}: &'static [u8] = &[".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        print(" {}".format(" ".join(cells)))
    print("];")
138
def main():
    """Print the Rust source for ``is_printable`` and its lookup tables.

    Reads UnicodeData.txt (via ``get_file``), collects the ranges of escaped
    code points, and sorts them into three groups:

    * ranges starting at or above 0x20000 become explicit range checks in
      the generated ``is_printable``;
    * runs of one or two code points below 0x20000 become "singletons";
    * longer runs below 0x20000 become "normal" ranges.

    Singletons and normal ranges are kept separately for code points below
    0x10000 (suffix 0) and for 0x10000..0x1ffff (suffix 1, stored with bit
    16 cleared). Everything is written to standard output.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # Hold the data file open only while the lazy generator chain is being
    # consumed, and close it afterwards.
    with get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as data:
        codepoints = get_codepoints(data)

        for a, b in to_ranges(get_escaped(codepoints)):
            if a >= 2 * CUTOFF:
                # Includes a range starting exactly at 0x20000: its high
                # part would not fit the u8 tables emitted below.
                extra.append((a, b - a))
            elif b - a <= 2:
                # Runs of one or two code points are stored individually.
                target = singletons1 if a & CUTOFF else singletons0
                target.extend(c & ~CUTOFF for c in range(a, b))
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, b - a))
            else:
                normal0.append((a, b - a))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/char_private.py",
// do not edit directly!

fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8],
normal: &[u8]) -> bool {
let xupper = (x >> 8) as u8;
let mut lowerstart = 0;
for &(upper, lowercount) in singletonuppers {
let lowerend = lowerstart + lowercount as usize;
if xupper == upper {
for &lower in &singletonlowers[lowerstart..lowerend] {
if lower == x as u8 {
return false;
}
}
} else if xupper < upper {
break;
}
lowerstart = lowerend;
}

let mut x = x as i32;
let mut normal = normal.iter().cloned();
let mut current = true;
while let Some(v) = normal.next() {
let len = if v & 0x80 != 0 {
((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
} else {
v as i32
};
x -= len;
if x < 0 {
break;
}
current = !current;
}
current
}

pub(crate) fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
} else {\
""")
    # One explicit range test per escaped range at or above 0x20000.
    for start, count in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(start, start + count))
        print(" return false;")
        print(" }")
    print("""\
true
}
}\
""")
    print()
    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
    print_normal(normal0, 'NORMAL0')
    print_normal(normal1, 'NORMAL1')
252
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()