]> git.proxmox.com Git - rustc.git/blob - vendor/regex-automata/src/nfa/thompson/map.rs
New upstream version 1.67.1+dfsg1
[rustc.git] / vendor / regex-automata / src / nfa / thompson / map.rs
1 // This module contains a couple simple and purpose built hash maps. The key
2 // trade off they make is that they serve as caches rather than true maps. That
3 // is, inserting a new entry may cause eviction of another entry. This gives
4 // us two things. First, there's less overhead associated with inserts and
5 // lookups. Secondly, it lets us control our memory usage.
6 //
7 // These maps are used in some fairly hot code when generating NFA states for
8 // large Unicode character classes.
9 //
10 // Instead of exposing a rich hashmap entry API, we just permit the caller to
11 // produce a hash of the key directly. The hash can then be reused for both
12 // lookups and insertions at the cost of leaking abstraction a bit. But these
13 // are for internal use only, so it's fine.
14 //
15 // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16 // (almost) minimal DFA for large Unicode character classes in linear time.
17 // (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18 // NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19 // since there's a bit more expense in the reverse direction.)
20 //
21 // The Utf8SuffixMap is used when compiling large Unicode character classes for
22 // reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23 // construction of UTF-8 automata by caching common suffixes. This doesn't
24 // get the same space savings as Daciuk's algorithm, but it's basically as
25 // fast as the naive approach and typically winds up using less memory (since
26 // it generates smaller NFAs) despite the presence of the cache.
27 //
28 // These maps effectively represent caching mechanisms for CState::Sparse and
29 // CState::Range, respectively. The former represents a single NFA state with
30 // many transitions of equivalent priority while the latter represents a single
31 // NFA state with a single transition. (Neither state ever has or is an
32 // epsilon transition.) Thus, they have different key types. It's likely we
33 // could make one generic map, but the machinery didn't seem worth it. They
34 // are simple enough.
35
36 use alloc::{vec, vec::Vec};
37
38 use crate::{nfa::thompson::Transition, util::id::StateID};
39
40 // Basic FNV-1a hash constants as described in:
41 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
42 const PRIME: u64 = 1099511628211;
43 const INIT: u64 = 14695981039346656037;
44
45 /// A bounded hash map where the key is a sequence of NFA transitions and the
46 /// value is a pre-existing NFA state ID.
47 ///
48 /// std's hashmap can be used for this, however, this map has two important
49 /// advantages. Firstly, it has lower overhead. Secondly, it permits us to
50 /// control our memory usage by limited the number of slots. In general, the
51 /// cost here is that this map acts as a cache. That is, inserting a new entry
52 /// may remove an old entry. We are okay with this, since it does not impact
53 /// correctness in the cases where it is used. The only effect that dropping
54 /// states from the cache has is that the resulting NFA generated may be bigger
55 /// than it otherwise would be.
56 ///
57 /// This improves benchmarks that compile large Unicode character classes,
58 /// since it makes the generation of (almost) minimal UTF-8 automaton faster.
59 /// Specifically, one could observe the difference with std's hashmap via
60 /// something like the following benchmark:
61 ///
62 /// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
63 ///
64 /// But to observe that difference, you'd have to modify the code to use
65 /// std's hashmap.
66 ///
67 /// It is quite possible that there is a better way to approach this problem.
68 /// For example, if there happens to be a very common state that collides with
69 /// a lot of less frequent states, then we could wind up with very poor caching
70 /// behavior. Alas, the effectiveness of this cache has not been measured.
71 /// Instead, ad hoc experiments suggest that it is "good enough." Additional
72 /// smarts (such as an LRU eviction policy) have to be weighed against the
73 /// amount of extra time they cost.
74 #[derive(Clone, Debug)]
75 pub struct Utf8BoundedMap {
76 /// The current version of this map. Only entries with matching versions
77 /// are considered during lookups. If an entry is found with a mismatched
78 /// version, then the map behaves as if the entry does not exist.
79 ///
80 /// This makes it possible to clear the map by simply incrementing the
81 /// version number instead of actually deallocating any storage.
82 version: u16,
83 /// The total number of entries this map can store.
84 capacity: usize,
85 /// The actual entries, keyed by hash. Collisions between different states
86 /// result in the old state being dropped.
87 map: Vec<Utf8BoundedEntry>,
88 }
89
90 /// An entry in this map.
91 #[derive(Clone, Debug, Default)]
92 struct Utf8BoundedEntry {
93 /// The version of the map used to produce this entry. If this entry's
94 /// version does not match the current version of the map, then the map
95 /// should behave as if this entry does not exist.
96 version: u16,
97 /// The key, which is a sorted sequence of non-overlapping NFA transitions.
98 key: Vec<Transition>,
99 /// The state ID corresponding to the state containing the transitions in
100 /// this entry.
101 val: StateID,
102 }
103
104 impl Utf8BoundedMap {
105 /// Create a new bounded map with the given capacity. The map will never
106 /// grow beyond the given size.
107 ///
108 /// Note that this does not allocate. Instead, callers must call `clear`
109 /// before using this map. `clear` will allocate space if necessary.
110 ///
111 /// This avoids the need to pay for the allocation of this map when
112 /// compiling regexes that lack large Unicode character classes.
113 pub fn new(capacity: usize) -> Utf8BoundedMap {
114 assert!(capacity > 0);
115 Utf8BoundedMap { version: 0, capacity, map: vec![] }
116 }
117
118 /// Clear this map of all entries, but permit the reuse of allocation
119 /// if possible.
120 ///
121 /// This must be called before the map can be used.
122 pub fn clear(&mut self) {
123 if self.map.is_empty() {
124 self.map = vec![Utf8BoundedEntry::default(); self.capacity];
125 } else {
126 self.version = self.version.wrapping_add(1);
127 // If we loop back to version 0, then we forcefully clear the
128 // entire map. Otherwise, it might be possible to incorrectly
129 // match entries used to generate other NFAs.
130 if self.version == 0 {
131 self.map = vec![Utf8BoundedEntry::default(); self.capacity];
132 }
133 }
134 }
135
136 /// Return a hash of the given transitions.
137 pub fn hash(&self, key: &[Transition]) -> usize {
138 let mut h = INIT;
139 for t in key {
140 h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
141 h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
142 h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME);
143 }
144 (h as usize) % self.map.len()
145 }
146
147 /// Retrieve the cached state ID corresponding to the given key. The hash
148 /// given must have been computed with `hash` using the same key value.
149 ///
150 /// If there is no cached state with the given transitions, then None is
151 /// returned.
152 pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
153 let entry = &self.map[hash];
154 if entry.version != self.version {
155 return None;
156 }
157 // There may be a hash collision, so we need to confirm real equality.
158 if entry.key != key {
159 return None;
160 }
161 Some(entry.val)
162 }
163
164 /// Add a cached state to this map with the given key. Callers should
165 /// ensure that `state_id` points to a state that contains precisely the
166 /// NFA transitions given.
167 ///
168 /// `hash` must have been computed using the `hash` method with the same
169 /// key.
170 pub fn set(
171 &mut self,
172 key: Vec<Transition>,
173 hash: usize,
174 state_id: StateID,
175 ) {
176 self.map[hash] =
177 Utf8BoundedEntry { version: self.version, key, val: state_id };
178 }
179 }
180
181 /// A cache of suffixes used to modestly compress UTF-8 automata for large
182 /// Unicode character classes.
183 #[derive(Clone, Debug)]
184 pub struct Utf8SuffixMap {
185 /// The current version of this map. Only entries with matching versions
186 /// are considered during lookups. If an entry is found with a mismatched
187 /// version, then the map behaves as if the entry does not exist.
188 version: u16,
189 /// The total number of entries this map can store.
190 capacity: usize,
191 /// The actual entries, keyed by hash. Collisions between different states
192 /// result in the old state being dropped.
193 map: Vec<Utf8SuffixEntry>,
194 }
195
196 /// A key that uniquely identifies an NFA state. It is a triple that represents
197 /// a transition from one state for a particular byte range.
198 #[derive(Clone, Debug, Default, Eq, PartialEq)]
199 pub struct Utf8SuffixKey {
200 pub from: StateID,
201 pub start: u8,
202 pub end: u8,
203 }
204
205 /// An entry in this map.
206 #[derive(Clone, Debug, Default)]
207 struct Utf8SuffixEntry {
208 /// The version of the map used to produce this entry. If this entry's
209 /// version does not match the current version of the map, then the map
210 /// should behave as if this entry does not exist.
211 version: u16,
212 /// The key, which consists of a transition in a particular state.
213 key: Utf8SuffixKey,
214 /// The identifier that the transition in the key maps to.
215 val: StateID,
216 }
217
218 impl Utf8SuffixMap {
219 /// Create a new bounded map with the given capacity. The map will never
220 /// grow beyond the given size.
221 ///
222 /// Note that this does not allocate. Instead, callers must call `clear`
223 /// before using this map. `clear` will allocate space if necessary.
224 ///
225 /// This avoids the need to pay for the allocation of this map when
226 /// compiling regexes that lack large Unicode character classes.
227 pub fn new(capacity: usize) -> Utf8SuffixMap {
228 assert!(capacity > 0);
229 Utf8SuffixMap { version: 0, capacity, map: vec![] }
230 }
231
232 /// Clear this map of all entries, but permit the reuse of allocation
233 /// if possible.
234 ///
235 /// This must be called before the map can be used.
236 pub fn clear(&mut self) {
237 if self.map.is_empty() {
238 self.map = vec![Utf8SuffixEntry::default(); self.capacity];
239 } else {
240 self.version = self.version.wrapping_add(1);
241 if self.version == 0 {
242 self.map = vec![Utf8SuffixEntry::default(); self.capacity];
243 }
244 }
245 }
246
247 /// Return a hash of the given transition.
248 pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
249 // Basic FNV-1a hash as described:
250 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
251 const PRIME: u64 = 1099511628211;
252 const INIT: u64 = 14695981039346656037;
253
254 let mut h = INIT;
255 h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME);
256 h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
257 h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
258 (h as usize) % self.map.len()
259 }
260
261 /// Retrieve the cached state ID corresponding to the given key. The hash
262 /// given must have been computed with `hash` using the same key value.
263 ///
264 /// If there is no cached state with the given key, then None is returned.
265 pub fn get(
266 &mut self,
267 key: &Utf8SuffixKey,
268 hash: usize,
269 ) -> Option<StateID> {
270 let entry = &self.map[hash];
271 if entry.version != self.version {
272 return None;
273 }
274 if key != &entry.key {
275 return None;
276 }
277 Some(entry.val)
278 }
279
280 /// Add a cached state to this map with the given key. Callers should
281 /// ensure that `state_id` points to a state that contains precisely the
282 /// NFA transition given.
283 ///
284 /// `hash` must have been computed using the `hash` method with the same
285 /// key.
286 pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
287 self.map[hash] =
288 Utf8SuffixEntry { version: self.version, key, val: state_id };
289 }
290 }