]> git.proxmox.com Git - rustc.git/blame - library/core/src/unicode/printable.py
New upstream version 1.55.0+dfsg1
[rustc.git] / library / core / src / unicode / printable.py
CommitLineData
5bcae85e 1#!/usr/bin/env python
5bcae85e
SL
2
3# This script uses the following Unicode tables:
476ff2be 4# - UnicodeData.txt
5bcae85e 5
476ff2be
SL
6
7from collections import namedtuple
8import csv
5bcae85e
SL
9import os
10import subprocess
11
476ff2be
SL
# Exclusive upper bound used when get_codepoints() pads out the tail of the
# codepoint space: one past U+10FFFF.
NUM_CODEPOINTS = 0x10FFFF + 1
13
5bcae85e
SL
def to_ranges(iter):
    """Collapse a sequence of integers into half-open ``(start, end)`` runs.

    Consecutive values (each one equal to the previous run's ``end``) extend
    the current run. Any other value starts a new run, and so do the values
    0x10000 and 0x20000 even when they directly follow the previous value,
    which means no run ever extends across either of those two values.

    Yields tuples ``(start, end)`` with ``end`` exclusive; yields nothing
    for an empty input.
    """
    start = end = None
    for value in iter:
        extends_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if extends_run:
            end += 1
            continue
        # Flush the finished run (if any) before opening the next one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
25
476ff2be
SL
def get_escaped(codepoints):
    """Yield the ``value`` of every codepoint that must be escaped.

    A codepoint is escaped when its ``class_`` is one of
    ``Cc Cf Cs Co Cn Zl Zp Zs``; a missing or empty class is treated as
    ``Cn``. The ASCII space character is never yielded even though its
    class is in that list.
    """
    escaped_classes = "Cc Cf Cs Co Cn Zl Zp Zs".split()
    space = ord(' ')
    for point in codepoints:
        category = point.class_ or "Cn"
        if category not in escaped_classes:
            continue
        if point.value == space:
            continue
        yield point.value
5bcae85e
SL
30
def get_file(f):
    """Return an open text file for the data file named by URL ``f``.

    The file is looked up in the current working directory under the last
    path component of ``f``. If it is not there yet, it is downloaded with
    ``curl -O`` first and then opened.

    The file is opened as UTF-8 with ``newline=""``: the explicit encoding
    keeps decoding independent of the platform default, and the csv module
    documentation asks for ``newline=""`` on files handed to ``csv.reader``.

    Raises ``subprocess.CalledProcessError`` if the download fails. The
    caller is responsible for closing the returned file.
    """
    local_name = os.path.basename(f)
    try:
        return open(local_name, encoding="utf-8", newline="")
    except FileNotFoundError:
        # --fail makes curl exit non-zero on HTTP errors instead of saving
        # the server's error page under the data file's name, where it would
        # be picked up as a cached "data file" on every later run.
        subprocess.run(["curl", "--fail", "-O", f], check=True)
        return open(local_name, encoding="utf-8", newline="")
37
476ff2be
SL
# One codepoint: its integer value and the class string read from the data
# file (None for codepoints that have no row of their own and are not inside
# a First/Last range).
Codepoint = namedtuple("Codepoint", "value class_")


def get_codepoints(f):
    """Yield a ``Codepoint`` for every value covered by the data in ``f``.

    ``f`` is read as ``;``-separated rows whose first three fields are the
    hexadecimal codepoint, the name and the class. Values skipped between
    two rows are filled in: with the range's class when the earlier row's
    name ends in ``First>`` (the later row's name must then end in
    ``Last>``), otherwise with class ``None``. After the last row, the
    remaining values up to ``NUM_CODEPOINTS`` are yielded with class ``None``.

    Raises ``ValueError`` when a ``First>`` row is not followed by a
    ``Last>`` row, including when the input ends right after it.
    """
    reader = csv.reader(f, delimiter=";")
    last_seen = 0
    open_range_class = None
    for fields in reader:
        value = int(fields[0], 16)
        label = fields[1]
        category = fields[2]

        if open_range_class is not None and not label.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row; inside a First/Last pair the
        # gap inherits the range's class, elsewhere the class is None.
        yield from (
            Codepoint(filler, open_range_class)
            for filler in range(last_seen + 1, value)
        )

        open_range_class = category if label.endswith("First>") else None

        yield Codepoint(value, category)
        last_seen = value

    if open_range_class is not None:
        raise ValueError("Missing Last after First")

    yield from (
        Codepoint(filler, None)
        for filler in range(last_seen + 1, NUM_CODEPOINTS)
    )
68
cc61c64b
XL
def compress_singletons(singletons):
    """Split values into a run-length table of high parts plus low bytes.

    Each value is split into ``value >> 8`` and ``value & 0xff``. The low
    bytes are collected in input order; consecutive values sharing the same
    high part are grouped into a single ``(high, count)`` entry.

    Returns ``(uppers, lowers)`` where ``uppers`` is the list of
    ``(high, count)`` pairs and ``lowers`` the list of low bytes.
    """
    uppers = []  # (high part, number of entries it owns in lowers)
    lowers = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if uppers and uppers[-1][0] == high:
            # Same high part as the previous value: bump its count.
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))
        lowers.append(low)

    return uppers, lowers
84
def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating run lengths.

    For each range, two lengths are emitted: the distance from the end of
    the previous range to ``start``, then ``count`` itself. Each length is
    encoded as:

    * 0x00..0x7f      -> one byte: 00, 01, ..., 7e, 7f
    * 0x80..0x7fff    -> two bytes: 80 80, 80 81, ..., ff fe, ff ff

    Returns a list with one entry (a list of 2 to 4 byte values) per range.
    Both lengths of every range must be below 0x8000 (checked by assert).
    """
    def encode(length):
        # Two-byte form sets the top bit of the first byte as a marker.
        if length > 0x7f:
            return [0x80 | (length >> 8), length & 0xff]
        return [length & 0x7f]

    compressed = []
    cursor = 0  # end of the previously processed range
    for start, count in normal:
        gap = start - cursor
        cursor = start + count

        assert gap < 0x8000 and count < 0x8000
        compressed.append(encode(gap) + encode(count))

    return compressed
112
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two Rust constant tables for a compressed singleton set.

    ``uppers`` is printed as a ``&[(u8, u8)]`` constant named
    ``uppersname``, one pair per line. ``lowers`` is printed as a ``&[u8]``
    constant named ``lowersname``, eight values per line. Both are preceded
    by ``#[rustfmt::skip]``.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[(u8, u8)] = &[".format(uppersname))
    for high, count in uppers:
        print(" ({:#04x}, {}),".format(high, count))
    print("];")

    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(lowersname))
    offset = 0
    while offset < len(lowers):
        cells = ["{:#04x},".format(byte) for byte in lowers[offset:offset + 8]]
        print(" {}".format(" ".join(cells)))
        offset += 8
    print("];")
124
def print_normal(normal, normalname):
    """Print a Rust ``&[u8]`` constant holding the encoded run lengths.

    ``normal`` is a list of entries, each a list of byte values; every entry
    is printed on its own line as comma-terminated hex literals. The
    constant is named ``normalname`` and preceded by ``#[rustfmt::skip]``.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(normalname))
    for entry in normal:
        row = " ".join(map("{:#04x},".format, entry))
        print(" {}".format(row))
    print("];")
131
def main():
    """Print the generated Rust source for ``is_printable`` to stdout.

    Reads UnicodeData.txt (downloaded on first use via ``get_file``), turns
    the escaped codepoints into ranges and sorts each range into one of:

    * ``extra``       - ranges starting at or above 0x20000, emitted as
                        explicit comparisons in ``is_printable``;
    * ``singletons0`` / ``singletons1`` - members of ranges of length 1 or 2
                        below 0x10000 / in 0x10000..0x1ffff;
    * ``normal0`` / ``normal1`` - longer ranges in the same two blocks.

    Values stored in the ``*1`` tables have the 0x10000 bit cleared so they
    fit the 16-bit lookup performed by the generated ``check`` function.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # get_codepoints() is a lazy generator, so the file has to stay open for
    # the whole loop; the `with` block closes it afterwards.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)

        for a, b in to_ranges(get_escaped(codepoints)):
            length = b - a
            if a >= 2 * CUTOFF:
                # `>=`, not `>`: a short range starting exactly at 0x20000
                # must not fall through to the singleton tables, whose upper
                # byte cannot represent it.
                extra.append((a, length))
            elif length in (1, 2):
                if a & CUTOFF:
                    singletons1.extend(c & ~CUTOFF for c in range(a, b))
                else:
                    singletons0.extend(range(a, b))
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, length))
            else:
                normal0.append((a, length))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
// do not edit directly!

fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
 let xupper = (x >> 8) as u8;
 let mut lowerstart = 0;
 for &(upper, lowercount) in singletonuppers {
 let lowerend = lowerstart + lowercount as usize;
 if xupper == upper {
 for &lower in &singletonlowers[lowerstart..lowerend] {
 if lower == x as u8 {
 return false;
 }
 }
 } else if xupper < upper {
 break;
 }
 lowerstart = lowerend;
 }

 let mut x = x as i32;
 let mut normal = normal.iter().cloned();
 let mut current = true;
 while let Some(v) = normal.next() {
 let len = if v & 0x80 != 0 {
 ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
 } else {
 v as i32
 };
 x -= len;
 if x < 0 {
 break;
 }
 current = !current;
 }
 current
}

pub(crate) fn is_printable(x: char) -> bool {
 let x = x as u32;
 let lower = x as u16;
 if x < 0x10000 {
 check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
 } else if x < 0x20000 {
 check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
 } else {\
""")
    # Ranges at or above 0x20000 become explicit comparisons in the final
    # `else` branch of is_printable.
    for a, b in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
        print(" return false;")
        print(" }")
    print("""\
 true
 }
}\
""")
    print()
    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
    print_normal(normal0, 'NORMAL0')
    print_normal(normal1, 'NORMAL1')


if __name__ == '__main__':
    main()