use crate::fmt_list;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::convert::TryFrom;
use std::fmt::{self, Write};
use std::ops::Range;
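
/// Accumulates the generated table source in `file` and tracks how many bytes
/// the encoded tables will occupy, so that competing encodings can be compared.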
#[derive(Clone)]
pub struct RawEmitter {
    pub file: String,
    pub desc: String,
    pub bytes_used: usize,
}

impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
        if self.file.is_empty() || self.file.ends_with("\n\n") {
            return;
        }
        writeln!(&mut self.file).unwrap();
    }
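
    /// Emits the codepoint set as a compressed bitset: the raw `u64` bitset
    /// words are deduplicated and canonicalized (most words are rotations,
    /// inversions, or shifts of another word), then addressed through a
    /// two-level chunk map. Fails if more than `u8::MAX` unique words remain.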
    fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
        let last_code_point = ranges.last().unwrap().end;
        // bitset for every bit in the codepoint range
        //
        // + 2 to ensure an all zero word to use for padding
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);

        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::MAX as usize {
            return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");

        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
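
        // Try every chunk length from 1 to 64 and keep whichever produces the
        // smallest output; each candidate is emitted into a throwaway clone of
        // `self`, and only the winner is re-emitted for real below.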
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }
79 "const BITSET_CANONICAL: &'static [u64; {}] = &[{}];",
80 canonicalized
.canonical_words
.len(),
81 fmt_list(canonicalized
.canonical_words
.iter().map(|v
| Bits(*v
))),
84 self.bytes_used
+= 8 * canonicalized
.canonical_words
.len();
87 "const BITSET_MAPPING: &'static [(u8, u8); {}] = &[{}];",
88 canonicalized
.canonicalized_words
.len(),
89 fmt_list(&canonicalized
.canonicalized_words
),
92 // 8 bit index into shifted words, 7 bits for shift + optional flip
93 // We only need it for the words that we removed by applying a shift and
95 self.bytes_used
+= 2 * canonicalized
.canonicalized_words
.len();

        writeln!(
            &mut self.file,
            r#"#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]"#
        )
        .unwrap();
        writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
        writeln!(&mut self.file, "        c as u32,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();

        Ok(())
    }
    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
            // pad out bitset index with zero words so we have all chunks of
            // uniform length
            compressed_words.push(zero_at);
        }

        let mut chunks = BTreeSet::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }

        let chunk_map =
            chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }
138 "const BITSET_CHUNKS_MAP: &'static [u8; {}] = &[{}];",
140 fmt_list(&chunk_indices
),
143 self.bytes_used
+= chunk_indices
.len();
146 "const BITSET_INDEX_CHUNKS: &'static [[u8; {}]; {}] = &[{}];",
149 fmt_list(chunks
.iter()),
152 self.bytes_used
+= chunk_length
* chunks
.len();
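
/// Emits the lookup tables for a codepoint set, keeping whichever of the
/// bitset and skiplist encodings is smaller (the bitset can fail outright
/// when it has too many unique words to index with a u8).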
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut bitset = emitter.clone();
    let bitset_ok = bitset.emit_bitset(&ranges).is_ok();

    let mut skiplist = emitter.clone();
    skiplist.emit_skiplist(&ranges);

    if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
        *emitter = bitset;
        emitter.desc = String::from("bitset");
    } else {
        *emitter = skiplist;
        emitter.desc = String::from("skiplist");
    }
}
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut cascading = emitter.clone();
    cascading.emit_cascading_map(&ranges);
    *emitter = cascading;
    emitter.desc = String::from("cascading");
}
struct Canonicalized {
    canonical_words: Vec<u64>,
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to the associated index (u8) which is into
    /// canonical_words or canonicalized_words (in order).
    unique_mapping: HashMap<u64, u8>,
}
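
// Canonicalization greedily selects target words (0 first, then whichever
// word has the most incoming mappings) and rewrites each source word as
// (index of its canonical word, transformation).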
impl Canonicalized {
    fn canonicalize(unique_words: &[u64]) -> Self {
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // key is the word being mapped to
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
        for &a in unique_words {
            'b: for &b in unique_words {
                // A word trivially maps to itself; skip those pairs.
                if a == b {
                    continue;
                }
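
                // Record, keyed by the target word `b`, every transformation
                // that rebuilds `b` from `a`.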

                // All possible distinct rotations
                for rotation in 1..64 {
                    if a.rotate_right(rotation) == b {
                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                if (!a) == b {
                    mappings.entry(b).or_default().push((a, Mapping::Invert));
                    // We're not interested in further mappings between a and b
                    continue 'b;
                }

                // All possible distinct rotations, inverted
                for rotation in 1..64 {
                    if (!a.rotate_right(rotation)) == b {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::RotateAndInvert(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                // All possible shifts
                for shift_by in 1..64 {
                    if a == (b >> shift_by) {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::ShiftRight(shift_by as u32)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }
            }
        }

        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are mapped words, which will be represented by an index into
        // the canonical_words and a Mapping; u16 when encoded.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but would be wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero word.
        //
        // FIXME: Experiment with choosing most common words in overall data set
        // for canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Get the mapping with the most entries. Currently, no mapping can
            // only exist transitively (i.e., there is no A, B, C such that A
            // does not map to C but A maps to B maps to C), so this is
            // guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();

            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one.
                // Noting that it should be associated with the Nth canonical word.
                //
                // We do not assert that this is present, because there may be
                // no mappings to the `from` word; that's fine.
                mappings.remove(from);

                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for mapped in mappings.values_mut() {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }

            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for mapped in mappings.values_mut() {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink, just stick into the canonical
        // words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.)
        // We'll probably always have some slack though so this loop will still
        // be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }

        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());
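
        // Flatten to u8 indices: canonical words occupy indices
        // 0..canonical_words.len(), and canonicalized words are numbered
        // directly after them.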
        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (
                    key,
                    match value {
                        UniqueMapping::Canonicalized(idx) => {
                            u8::try_from(canonical_words.len() + idx).unwrap()
                        }
                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                    },
                )
            })
            .collect::<HashMap<_, _>>();
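
        // Sanity check: the flattened mapping must be injective, i.e. every
        // unique word gets a distinct u8 index.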
        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }
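
        // Encode each mapping into one byte: bit 7 selects shift-right,
        // bit 6 selects inversion, and the low six bits hold the rotate or
        // shift amount (plain `Invert` is rotate-by-zero plus invert).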
        const LOWER_6: u32 = (1 << 6) - 1;

        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (
                    u8::try_from(v.0).unwrap(),
                    match v.1 {
                        Mapping::RotateAndInvert(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            1 << 6 | (amount as u8)
                        }
                        Mapping::Rotate(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            amount as u8
                        }
                        Mapping::Invert => 1 << 6,
                        Mapping::ShiftRight(shift_by) => {
                            assert_eq!(shift_by, shift_by & LOWER_6);
                            1 << 7 | (shift_by as u8)
                        }
                    },
                )
            })
            .collect::<Vec<(u8, u8)>>();

        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}