]> git.proxmox.com Git - rustc.git/blob - src/tools/unicode-table-generator/src/raw_emitter.rs
New upstream version 1.65.0+dfsg1
[rustc.git] / src / tools / unicode-table-generator / src / raw_emitter.rs
1 use crate::fmt_list;
2 use std::collections::{BTreeMap, BTreeSet, HashMap};
3 use std::convert::TryFrom;
4 use std::fmt::{self, Write};
5 use std::ops::Range;
6
/// Accumulates the generated Rust source for one Unicode property table,
/// together with a running count of the bytes its static tables will occupy.
#[derive(Clone)]
pub struct RawEmitter {
    /// The generated Rust source emitted so far.
    pub file: String,
    /// Human-readable label of the encoding strategy that produced `file`
    /// (set by callers to "bitset", "skiplist", or "cascading").
    pub desc: String,
    /// Total size in bytes of the emitted static tables; used to pick the
    /// smallest encoding.
    pub bytes_used: usize,
}
13
impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    /// Appends one blank line, unless the file is empty or already ends
    /// with a blank line (avoids runs of consecutive blank lines).
    fn blank_line(&mut self) {
        if self.file.is_empty() || self.file.ends_with("\n\n") {
            return;
        }
        writeln!(&mut self.file).unwrap();
    }

    /// Emits the bitset encoding for `ranges`: a deduplicated, canonicalized
    /// word table plus a two-level chunk index, and a `lookup` function over
    /// them. Errs if the deduplicated words cannot be indexed by a `u8`.
    fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
        let last_code_point = ranges.last().unwrap().end;
        // bitset for every bit in the codepoint range
        //
        // + 2 to ensure an all zero word to use for padding
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);
        // Deduplicate via a BTreeSet; iteration order is ascending, so the
        // zero word pushed above always sorts first.
        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::MAX as usize {
            return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");
        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = canonicalized.unique_mapping.clone();
        // Replace each 64-bit word with its u8 index into the canonical tables.
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();

        // Try every chunk length from 1 to 64 on a throwaway clone and keep
        // whichever minimizes total bytes; only the winner is emitted for real.
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

        // Debug-formats a word as a full-width binary literal so the emitted
        // table is readable as a bitset.
        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }

        writeln!(
            &mut self.file,
            "const BITSET_CANONICAL: &'static [u64; {}] = &[{}];",
            canonicalized.canonical_words.len(),
            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
        )
        .unwrap();
        self.bytes_used += 8 * canonicalized.canonical_words.len();
        writeln!(
            &mut self.file,
            "const BITSET_MAPPING: &'static [(u8, u8); {}] = &[{}];",
            canonicalized.canonicalized_words.len(),
            fmt_list(&canonicalized.canonicalized_words),
        )
        .unwrap();
        // 8 bit index into shifted words, 7 bits for shift + optional flip
        // We only need it for the words that we removed by applying a shift and
        // flip to them.
        self.bytes_used += 2 * canonicalized.canonicalized_words.len();

        self.blank_line();

        writeln!(
            &mut self.file,
            r#"#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]"#
        )
        .unwrap();
        writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
        writeln!(&mut self.file, "        c as u32,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();

        Ok(())
    }

    /// Emits the two-level chunk index: BITSET_CHUNKS_MAP maps a chunk number
    /// to a deduplicated chunk, and BITSET_INDEX_CHUNKS holds the distinct
    /// chunks themselves. `zero_at` is the word-index of the all-zero word,
    /// used as padding.
    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        // NOTE(review): when the length is already a multiple of
        // chunk_length, this pads a full extra chunk of zeros rather than
        // none — looks intentional-or-harmless, but worth confirming.
        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
            // pad out bitset index with zero words so we have all chunks of
            // chunk_length
            compressed_words.push(zero_at);
        }

        // Deduplicate chunks; BTreeSet iteration order fixes each chunk's index.
        let mut chunks = BTreeSet::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }
        let chunk_map =
            chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }

        writeln!(
            &mut self.file,
            "const BITSET_CHUNKS_MAP: &'static [u8; {}] = &[{}];",
            chunk_indices.len(),
            fmt_list(&chunk_indices),
        )
        .unwrap();
        self.bytes_used += chunk_indices.len();
        writeln!(
            &mut self.file,
            "const BITSET_INDEX_CHUNKS: &'static [[u8; {}]; {}] = &[{}];",
            chunk_length,
            chunks.len(),
            fmt_list(chunks.iter()),
        )
        .unwrap();
        self.bytes_used += chunk_length * chunks.len();
    }
}
155
156 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
157 emitter.blank_line();
158
159 let mut bitset = emitter.clone();
160 let bitset_ok = bitset.emit_bitset(&ranges).is_ok();
161
162 let mut skiplist = emitter.clone();
163 skiplist.emit_skiplist(&ranges);
164
165 if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
166 *emitter = bitset;
167 emitter.desc = String::from("bitset");
168 } else {
169 *emitter = skiplist;
170 emitter.desc = String::from("skiplist");
171 }
172 }
173
174 pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
175 emitter.blank_line();
176
177 let mut cascading = emitter.clone();
178 cascading.emit_cascading_map(&ranges);
179 *emitter = cascading;
180 emitter.desc = String::from("cascading");
181 }
182
/// Result of compressing a set of unique bitset words: words derivable from
/// another word via a cheap bit operation (rotate / invert / shift) are stored
/// as a (canonical-word index, packed operation) pair instead of verbatim.
struct Canonicalized {
    /// Words stored verbatim as `u64`s in the emitted table.
    canonical_words: Vec<u64>,
    /// Derived words: (index into `canonical_words`, packed operation byte —
    /// bit 7 = shift-right, bit 6 = invert, low 6 bits = rotate/shift amount).
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to the associated index (u8) which is into
    /// canonical_words or canonicalized_words (in order).
    unique_mapping: HashMap<u64, u8>,
}
191
impl Canonicalized {
    /// Partitions `unique_words` into words stored raw (`canonical_words`)
    /// and words reconstructible from a raw word via rotate/invert/shift
    /// (`canonicalized_words`), greedily picking as canonical the word from
    /// which the most other words can be derived.
    fn canonicalize(unique_words: &[u64]) -> Self {
        // The cheap bit operation that turns a canonical word back into a
        // derived word at lookup time.
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // key is the word being mapped to
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
        for &a in unique_words {
            'b: for &b in unique_words {
                // skip self
                if a == b {
                    continue;
                }

                // All possible distinct rotations
                for rotation in 1..64 {
                    if a.rotate_right(rotation) == b {
                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                if (!a) == b {
                    mappings.entry(b).or_default().push((a, Mapping::Invert));
                    // We're not interested in further mappings between a and b
                    continue 'b;
                }

                // All possible distinct rotations, inverted
                for rotation in 1..64 {
                    if (!a.rotate_right(rotation)) == b {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::RotateAndInvert(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                // All possible shifts
                for shift_by in 1..64 {
                    if a == (b >> shift_by) {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::ShiftRight(shift_by as u32)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }
            }
        }
        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are mapped words, which will be represented by an index into
        // the canonical_words and a Mapping; u16 when encoded.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        // Interim classification; converted to flat u8 indices once the
        // canonical table's final length is known.
        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but would be wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero.
        //
        // FIXME: Experiment with choosing most common words in overall data set
        // for canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Get the mapping with the most entries. Currently, no mapping can
            // only exist transitively (i.e., there is no A, B, C such that A
            // does not map to C and but A maps to B maps to C), so this is
            // guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();
            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one.
                // Noting that it should be associated with the Nth canonical word.
                //
                // We do not assert that this is present, because there may be
                // no mappings to the `from` word; that's fine.
                mappings.remove(from);
                // Each word must be classified exactly once.
                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for mapped in mappings.values_mut() {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }
            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for mapped in mappings.values_mut() {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink, just stick into the canonical
        // words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.)
        // We'll probably always have some slack though so this loop will still
        // be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }
        // Every input word is accounted for exactly once across both tables.
        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());

        // Flatten to u8 indices: canonical words keep their index, while
        // canonicalized words are offset past the canonical table.
        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (
                    key,
                    match value {
                        UniqueMapping::Canonicalized(idx) => {
                            u8::try_from(canonical_words.len() + idx).unwrap()
                        }
                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                    },
                )
            })
            .collect::<HashMap<_, _>>();

        // Sanity check: every unique word received its own distinct index.
        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }

        const LOWER_6: u32 = (1 << 6) - 1;

        // Pack each Mapping into one byte: bit 7 selects shift-right, bit 6
        // selects invert, the low 6 bits carry the rotate/shift amount.
        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (
                    u8::try_from(v.0).unwrap(),
                    match v.1 {
                        Mapping::RotateAndInvert(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            1 << 6 | (amount as u8)
                        }
                        Mapping::Rotate(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            amount as u8
                        }
                        Mapping::Invert => 1 << 6,
                        Mapping::ShiftRight(shift_by) => {
                            assert_eq!(shift_by, shift_by & LOWER_6);
                            1 << 7 | (shift_by as u8)
                        }
                    },
                )
            })
            .collect::<Vec<(u8, u8)>>();
        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}