]>
Commit | Line | Data |
---|---|---|
2c00a5a8 XL |
1 | /// BoolTrie is a trie for representing a set of Unicode codepoints. It is |
2 | /// implemented with postfix compression (sharing of identical child nodes), | |
3 | /// which gives both compact size and fast lookup. | |
4 | /// | |
5 | /// The space of Unicode codepoints is divided into 3 subareas, each | |
6 | /// represented by a trie with different depth. In the first (0..0x800), there | |
7 | /// is no trie structure at all; each u64 entry corresponds to a bitvector | |
8 | /// effectively holding 64 bool values. | |
9 | /// | |
10 | /// In the second (0x800..0x10000), each child of the root node represents a | |
11 | /// 64-wide subrange, but instead of storing the full 64-bit value of the leaf, | |
12 | /// the trie stores an 8-bit index into a shared table of leaf values. This | |
13 | /// exploits the fact that in reasonable sets, many such leaves can be shared. | |
14 | /// | |
15 | /// In the third (0x10000..0x110000), each child of the root node represents a | |
16 | /// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice | |
17 | /// of a child tree. Each of these 64 bytes represents an index into the table | |
18 | /// of shared 64-bit leaf values. This exploits the sparse structure in the | |
19 | /// non-BMP range of most Unicode sets. | |
20 | pub struct BoolTrie { | |
21 | // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences) | |
60c5eb7d | 22 | pub r1: [u64; 32], // leaves |
2c00a5a8 XL |
23 | |
24 | // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences) | |
25 | pub r2: [u8; 992], // first level | |
60c5eb7d | 26 | pub r3: &'static [u64], // leaves |
2c00a5a8 XL |
27 | |
28 | // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences) | |
60c5eb7d XL |
29 | pub r4: [u8; 256], // first level |
30 | pub r5: &'static [u8], // second level | |
31 | pub r6: &'static [u64], // leaves | |
2c00a5a8 XL |
32 | } |
33 | impl BoolTrie { | |
34 | pub fn lookup(&self, c: char) -> bool { | |
83c7162d | 35 | let c = c as u32; |
2c00a5a8 | 36 | if c < 0x800 { |
83c7162d | 37 | trie_range_leaf(c, self.r1[(c >> 6) as usize]) |
2c00a5a8 | 38 | } else if c < 0x10000 { |
83c7162d | 39 | let child = self.r2[(c >> 6) as usize - 0x20]; |
2c00a5a8 XL |
40 | trie_range_leaf(c, self.r3[child as usize]) |
41 | } else { | |
83c7162d XL |
42 | let child = self.r4[(c >> 12) as usize - 0x10]; |
43 | let leaf = self.r5[((child as usize) << 6) + ((c >> 6) as usize & 0x3f)]; | |
2c00a5a8 XL |
44 | trie_range_leaf(c, self.r6[leaf as usize]) |
45 | } | |
46 | } | |
47 | } | |
48 | ||
49 | pub struct SmallBoolTrie { | |
50 | pub(crate) r1: &'static [u8], // first level | |
60c5eb7d | 51 | pub(crate) r2: &'static [u64], // leaves |
2c00a5a8 XL |
52 | } |
53 | ||
54 | impl SmallBoolTrie { | |
55 | pub fn lookup(&self, c: char) -> bool { | |
83c7162d XL |
56 | let c = c as u32; |
57 | match self.r1.get((c >> 6) as usize) { | |
2c00a5a8 XL |
58 | Some(&child) => trie_range_leaf(c, self.r2[child as usize]), |
59 | None => false, | |
60 | } | |
61 | } | |
62 | } | |
63 | ||
83c7162d | 64 | fn trie_range_leaf(c: u32, bitmap_chunk: u64) -> bool { |
2c00a5a8 XL |
65 | ((bitmap_chunk >> (c & 63)) & 1) != 0 |
66 | } |