use crate::fmt_list;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::convert::TryFrom;
use std::fmt::{self, Write};
use std::ops::Range;
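
/// Accumulates the generated table source in `file` and tracks how many bytes
/// the encoded tables will occupy, so that competing encodings can be compared.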
#[derive(Clone)]
pub struct RawEmitter {
    pub file: String,
    pub desc: String,
    pub bytes_used: usize,
}

impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
        if self.file.is_empty() || self.file.ends_with("\n\n") {
            return;
        }
        writeln!(&mut self.file).unwrap();
    }
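
    /// Emits the codepoint set as a compressed bitset: the raw `u64` bitset
    /// words are deduplicated and canonicalized (most words are rotations,
    /// inversions, or shifts of another word), then addressed through a
    /// two-level chunk map. Fails if more than `u8::MAX` unique words remain.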
    fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
        let last_code_point = ranges.last().unwrap().end;
        // bitset for every bit in the codepoint range
        //
        // + 2 to ensure an all zero word to use for padding
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);

        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::MAX as usize {
            return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");

        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
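
        // Try every chunk length from 1 to 64 and keep whichever produces the
        // smallest output; each candidate is emitted into a throwaway clone of
        // `self`, and only the winner is re-emitted for real below.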
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }
79 "const BITSET_CANONICAL: &'static [u64; {}] = &[{}];",
80 canonicalized
.canonical_words
.len(),
81 fmt_list(canonicalized
.canonical_words
.iter().map(|v
| Bits(*v
))),
84 self.bytes_used
+= 8 * canonicalized
.canonical_words
.len();
87 "const BITSET_MAPPING: &'static [(u8, u8); {}] = &[{}];",
88 canonicalized
.canonicalized_words
.len(),
89 fmt_list(&canonicalized
.canonicalized_words
),
92 // 8 bit index into shifted words, 7 bits for shift + optional flip
93 // We only need it for the words that we removed by applying a shift and
95 self.bytes_used
+= 2 * canonicalized
.canonicalized_words
.len();

        writeln!(
            &mut self.file,
            r#"#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]"#
        )
        .unwrap();
        writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
        writeln!(&mut self.file, "        c as u32,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();

        Ok(())
    }
    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
            // pad out bitset index with zero words so we have all chunks of
            // uniform length
            compressed_words.push(zero_at);
        }

        let mut chunks = BTreeSet::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }

        let chunk_map =
            chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }
138 "const BITSET_CHUNKS_MAP: &'static [u8; {}] = &[{}];",
140 fmt_list(&chunk_indices
),
143 self.bytes_used
+= chunk_indices
.len();
146 "const BITSET_INDEX_CHUNKS: &'static [[u8; {}]; {}] = &[{}];",
149 fmt_list(chunks
.iter()),
152 self.bytes_used
+= chunk_length
* chunks
.len();
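
/// Emits the lookup tables for a codepoint set, keeping whichever of the
/// bitset and skiplist encodings is smaller (the bitset can fail outright
/// when it has too many unique words to index with a u8).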
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut bitset = emitter.clone();
    let bitset_ok = bitset.emit_bitset(&ranges).is_ok();

    let mut skiplist = emitter.clone();
    skiplist.emit_skiplist(&ranges);

    if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
        *emitter = bitset;
        emitter.desc = String::from("bitset");
    } else {
        *emitter = skiplist;
        emitter.desc = String::from("skiplist");
    }
}
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut cascading = emitter.clone();
    cascading.emit_cascading_map(&ranges);
    *emitter = cascading;
    emitter.desc = String::from("cascading");
}
struct Canonicalized {
    canonical_words: Vec<u64>,
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to the associated index (u8) which is into
    /// canonical_words or canonicalized_words (in order).
    unique_mapping: HashMap<u64, u8>,
}
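
// Canonicalization greedily selects target words (0 first, then whichever
// word has the most incoming mappings) and rewrites each source word as
// (index of its canonical word, transformation).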
impl Canonicalized {
    fn canonicalize(unique_words: &[u64]) -> Self {
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // key is the word being mapped to
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
        for &a in unique_words {
            'b: for &b in unique_words {
                // A word trivially maps to itself; skip those pairs.
                if a == b {
                    continue;
                }
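
                // Record, keyed by the target word `b`, every transformation
                // that rebuilds `b` from `a`.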

                // All possible distinct rotations
                for rotation in 1..64 {
                    if a.rotate_right(rotation) == b {
                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                if (!a) == b {
                    mappings.entry(b).or_default().push((a, Mapping::Invert));
                    // We're not interested in further mappings between a and b
                    continue 'b;
                }

                // All possible distinct rotations, inverted
                for rotation in 1..64 {
                    if (!a.rotate_right(rotation)) == b {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::RotateAndInvert(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                // All possible shifts
                for shift_by in 1..64 {
                    if a == (b >> shift_by) {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::ShiftRight(shift_by as u32)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }
            }
        }

        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are mapped words, which will be represented by an index into
        // the canonical_words and a Mapping; u16 when encoded.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but would be wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero word.
        //
        // FIXME: Experiment with choosing most common words in overall data set
        // for canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Get the mapping with the most entries. Currently, no mapping can
            // only exist transitively (i.e., there is no A, B, C such that A
            // does not map to C but A maps to B maps to C), so this is
            // guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();

            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one.
                // Noting that it should be associated with the Nth canonical word.
                //
                // We do not assert that this is present, because there may be
                // no mappings to the `from` word; that's fine.
                mappings.remove(from);

                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for mapped in mappings.values_mut() {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }

            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for mapped in mappings.values_mut() {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink, just stick into the canonical
        // words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.)
        // We'll probably always have some slack though so this loop will still
        // be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }

        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());
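
        // Flatten to u8 indices: canonical words occupy indices
        // 0..canonical_words.len(), and canonicalized words are numbered
        // directly after them.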
        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (
                    key,
                    match value {
                        UniqueMapping::Canonicalized(idx) => {
                            u8::try_from(canonical_words.len() + idx).unwrap()
                        }
                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                    },
                )
            })
            .collect::<HashMap<_, _>>();
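
        // Sanity check: the flattened mapping must be injective, i.e. every
        // unique word gets a distinct u8 index.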
        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }
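
        // Encode each mapping into one byte: bit 7 selects shift-right,
        // bit 6 selects inversion, and the low six bits hold the rotate or
        // shift amount (plain `Invert` is rotate-by-zero plus invert).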
        const LOWER_6: u32 = (1 << 6) - 1;

        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (
                    u8::try_from(v.0).unwrap(),
                    match v.1 {
                        Mapping::RotateAndInvert(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            1 << 6 | (amount as u8)
                        }
                        Mapping::Rotate(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            amount as u8
                        }
                        Mapping::Invert => 1 << 6,
                        Mapping::ShiftRight(shift_by) => {
                            assert_eq!(shift_by, shift_by & LOWER_6);
                            1 << 7 | (shift_by as u8)
                        }
                    },
                )
            })
            .collect::<Vec<(u8, u8)>>();

        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}