]>
Commit | Line | Data |
---|---|---|
5bcae85e | 1 | #!/usr/bin/env python |
5bcae85e SL |
2 | |
3 | # This script uses the following Unicode tables: | |
476ff2be | 4 | # - UnicodeData.txt |
5bcae85e | 5 | |
476ff2be SL |
6 | |
7 | from collections import namedtuple | |
8 | import csv | |
5bcae85e SL |
9 | import os |
10 | import subprocess | |
11 | ||
476ff2be SL |
# One past the highest Unicode codepoint (U+10FFFF); upper bound of the sweep
# performed by get_codepoints().
NUM_CODEPOINTS = 0x110000
13 | ||
5bcae85e SL |
def to_ranges(iter):
    """Collapse a stream of integers into half-open ``(start, end)`` ranges.

    A value extends the running range only when it equals the range's current
    end (i.e. it is the next consecutive integer). A new range is always
    started at 0x10000 and 0x20000, so no yielded range begins below one of
    those values and continues through it.

    Yields nothing for an empty input.
    """
    start = end = None
    for value in iter:
        continues_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if continues_run:
            end += 1
            continue
        # Flush the finished range (if any) before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
25 | ||
476ff2be SL |
def get_escaped(codepoints):
    """Yield the ``value`` of every codepoint that must be escaped.

    A codepoint is escaped when its general category is one of
    Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs. A missing/empty category is treated as
    "Cn". The ASCII space character is never yielded.
    """
    escaped_categories = ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
    space = ord(' ')
    for cp in codepoints:
        category = cp.class_ or "Cn"
        if category not in escaped_categories:
            continue
        if cp.value == space:
            continue
        yield cp.value
5bcae85e SL |
30 | |
def get_file(f):
    """Return an open text-mode handle for the local copy of URL ``f``.

    The local copy is looked up by the URL's basename in the current working
    directory. If it does not exist, ``curl -O`` is run to download it first
    (raising ``subprocess.CalledProcessError`` if curl fails), and the freshly
    downloaded file is opened instead.
    """
    local_name = os.path.basename(f)
    try:
        handle = open(local_name)
    except FileNotFoundError:
        subprocess.run(["curl", "-O", f], check=True)
        handle = open(local_name)
    return handle
37 | ||
476ff2be SL |
# A single codepoint: its integer value and its general-category string
# (None when UnicodeData.txt has no entry for it).
Codepoint = namedtuple('Codepoint', ['value', 'class_'])
39 | ||
def get_codepoints(f):
    """Yield a ``Codepoint`` for each codepoint, in order, from ``f``.

    ``f`` is an iterable of ``;``-separated lines whose first three fields are
    the hexadecimal codepoint, its name, and its general category.

    Codepoints skipped between two consecutive rows are filled in:
    * between a ``... First>`` row and its ``... Last>`` row they receive the
      category of the ``First`` row;
    * anywhere else they receive ``None``.

    After the last row, every remaining codepoint below ``NUM_CODEPOINTS`` is
    yielded with category ``None``.

    Raises ``ValueError`` if a ``First>`` row is not immediately followed by a
    ``Last>`` row (including when input ends right after a ``First>`` row).
    """
    last_seen = 0
    open_range_class = None  # category of a pending "First>" row, if any
    for fields in csv.reader(f, delimiter=";"):
        value = int(fields[0], 16)
        label = fields[1]
        category = fields[2]

        if open_range_class is not None and not label.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row.
        for gap in range(last_seen + 1, value):
            yield Codepoint(gap, open_range_class)

        open_range_class = category if label.endswith("First>") else None

        yield Codepoint(value, category)
        last_seen = value

    if open_range_class is not None:
        raise ValueError("Missing Last after First")

    for tail in range(last_seen + 1, NUM_CODEPOINTS):
        yield Codepoint(tail, None)
68 | ||
cc61c64b XL |
def compress_singletons(singletons):
    """Split singleton values into a run-length table of high bytes plus low bytes.

    Returns ``(uppers, lowers)`` where ``lowers`` holds the low 8 bits of every
    input value in order, and ``uppers`` is a list of ``(upper, count)`` pairs:
    ``upper`` is the value shifted right by 8 and ``count`` is how many
    consecutive input values share it.
    """
    uppers = []  # (upper, number of entries in lowers belonging to it)
    lowers = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if uppers and uppers[-1][0] == high:
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))
        lowers.append(low)

    return uppers, lowers
84 | ||
def _encode_length(length):
    """Encode one run length as a list of one or two byte values."""
    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
    if length > 0x7f:
        return [0x80 | (length >> 8), length & 0xff]
    return [length & 0x7f]


def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating run lengths.

    For each range, an entry is produced holding the encoded distance from the
    end of the previous range to ``start`` (``truelen``) followed by the encoded
    ``count`` (``falselen``). Each length occupies one byte when it is at most
    0x7f and two bytes otherwise.

    Returns a list of entries, one list of byte values per input range.
    Asserts that every length is below 0x8000.
    """
    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]

    cursor = 0
    for start, count in normal:
        truelen = start - cursor
        falselen = count
        cursor = start + count

        assert truelen < 0x8000 and falselen < 0x8000
        compressed.append(_encode_length(truelen) + _encode_length(falselen))

    return compressed
112 | ||
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two Rust constant tables describing the singleton codepoints.

    ``uppers`` is printed as a ``&[(u8, u8)]`` constant named ``uppersname``,
    one ``(upper, count)`` pair per line. ``lowers`` is printed as a ``&[u8]``
    constant named ``lowersname``, eight values per line. Both tables are
    preceded by ``#[rustfmt::skip]``.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[(u8, u8)] = &[".format(uppersname))
    for high, count in uppers:
        print(" ({:#04x}, {}),".format(high, count))
    print("];")

    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(lowersname))
    rows = [lowers[offset:offset + 8] for offset in range(0, len(lowers), 8)]
    for row in rows:
        cells = ["{:#04x},".format(byte) for byte in row]
        print(" {}".format(" ".join(cells)))
    print("];")
124 | ||
def print_normal(normal, normalname):
    """Print the run-length entries in ``normal`` as a Rust ``&[u8]`` constant.

    The constant is named ``normalname`` and preceded by ``#[rustfmt::skip]``;
    each entry (a sequence of byte values) is printed on its own line.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        print(" {}".format(" ".join(cells)))
    print("];")
131 | ||
def main():
    """Print the Rust source of ``is_printable`` and its tables to stdout.

    Reads UnicodeData.txt (fetched by ``get_file`` if it is not already in the
    current directory), computes the ranges of escaped codepoints, and sorts
    each range into one of three groups:

    * ranges starting at or above 0x20000 go to ``extra`` and are emitted as
      explicit ``if lo <= x && x < hi`` checks;
    * ranges of one or two codepoints become singletons, stored per 64K block
      (``singletons0`` below 0x10000, ``singletons1`` for 0x10000..0x1ffff
      with the 0x10000 bit cleared);
    * longer ranges become ``(start, length)`` pairs in ``normal0`` /
      ``normal1``, again per 64K block.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # The codepoint generator reads the file lazily, so the whole loop has to
    # run while the file is still open.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)

        for a, b in to_ranges(get_escaped(codepoints)):
            if a >= 2 * CUTOFF:
                # `>=`, not `>`: a short range starting exactly at 0x20000 must
                # not reach the singleton branch, where it would be stored with
                # an upper byte of 0x200 that does not fit the u8 table.
                extra.append((a, b - a))
            elif b - a <= 2:
                # Ranges of one or two codepoints are stored individually.
                if a & CUTOFF:
                    singletons1.extend(cp & ~CUTOFF for cp in range(a, b))
                else:
                    singletons0.extend(range(a, b))
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, b - a))
            else:
                normal0.append((a, b - a))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
// do not edit directly!

fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
let xupper = (x >> 8) as u8;
let mut lowerstart = 0;
for &(upper, lowercount) in singletonuppers {
let lowerend = lowerstart + lowercount as usize;
if xupper == upper {
for &lower in &singletonlowers[lowerstart..lowerend] {
if lower == x as u8 {
return false;
}
}
} else if xupper < upper {
break;
}
lowerstart = lowerend;
}

let mut x = x as i32;
let mut normal = normal.iter().cloned();
let mut current = true;
while let Some(v) = normal.next() {
let len = if v & 0x80 != 0 {
((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
} else {
v as i32
};
x -= len;
if x < 0 {
break;
}
current = !current;
}
current
}

pub(crate) fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
} else {\
""")
    for a, b in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
        print(" return false;")
        print(" }")
    print("""\
true
}
}\
""")
    print()
    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
    print_normal(normal0, 'NORMAL0')
    print_normal(normal1, 'NORMAL1')
5bcae85e SL |
234 | |
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()