]> git.proxmox.com Git - rustc.git/blob - vendor/regex-automata/src/dfa/determinize.rs
New upstream version 1.67.1+dfsg1
[rustc.git] / vendor / regex-automata / src / dfa / determinize.rs
1 use alloc::{
2 collections::BTreeMap,
3 vec::{self, Vec},
4 };
5
6 use crate::{
7 dfa::{dense, Error, DEAD},
8 nfa::thompson,
9 util::{
10 self,
11 alphabet::{self, ByteSet},
12 determinize::{State, StateBuilderEmpty, StateBuilderNFA},
13 id::{PatternID, StateID},
14 matchtypes::MatchKind,
15 sparse_set::{SparseSet, SparseSets},
16 start::Start,
17 },
18 };
19
20 /// A builder for configuring and running a DFA determinizer.
21 #[derive(Clone, Debug)]
22 pub(crate) struct Config {
23 anchored: bool,
24 match_kind: MatchKind,
25 quit: ByteSet,
26 dfa_size_limit: Option<usize>,
27 determinize_size_limit: Option<usize>,
28 }
29
30 impl Config {
31 /// Create a new default config for a determinizer. The determinizer may be
32 /// configured before calling `run`.
33 pub fn new() -> Config {
34 Config {
35 anchored: false,
36 match_kind: MatchKind::LeftmostFirst,
37 quit: ByteSet::empty(),
38 dfa_size_limit: None,
39 determinize_size_limit: None,
40 }
41 }
42
43 /// Run determinization on the given NFA and write the resulting DFA into
44 /// the one given. The DFA given should be initialized but otherwise empty.
45 /// "Initialized" means that it is setup to handle the NFA's byte classes,
46 /// number of patterns and whether to build start states for each pattern.
47 pub fn run(
48 &self,
49 nfa: &thompson::NFA,
50 dfa: &mut dense::OwnedDFA,
51 ) -> Result<(), Error> {
52 let dead = State::dead();
53 let quit = State::dead();
54 let mut cache = StateMap::default();
55 // We only insert the dead state here since its representation is
56 // identical to the quit state. And we never want anything pointing
57 // to the quit state other than specific transitions derived from the
58 // determinizer's configured "quit" bytes.
59 //
60 // We do put the quit state into 'builder_states' below. This ensures
61 // that a proper DFA state ID is allocated for it, and that no other
62 // DFA state uses the "location after the DEAD state." That is, it
63 // is assumed that the quit state is always the state immediately
64 // following the DEAD state.
65 cache.insert(dead.clone(), DEAD);
66
67 let runner = Runner {
68 config: self.clone(),
69 nfa,
70 dfa,
71 builder_states: alloc::vec![dead, quit],
72 cache,
73 memory_usage_state: 0,
74 sparses: SparseSets::new(nfa.len()),
75 stack: alloc::vec![],
76 scratch_state_builder: StateBuilderEmpty::new(),
77 };
78 runner.run()
79 }
80
81 /// Whether to build an anchored DFA or not. When disabled (the default),
82 /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
83 /// the anchored start state of the NFA is used to start the DFA.
84 pub fn anchored(&mut self, yes: bool) -> &mut Config {
85 self.anchored = yes;
86 self
87 }
88
89 /// The match semantics to use for determinization.
90 ///
91 /// MatchKind::All corresponds to the standard textbook construction.
92 /// All possible match states are represented in the DFA.
93 /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
94 /// simulate the match semantics of backtracking regex engines. Namely,
95 /// only a subset of match states are built, and dead states are used to
96 /// stop searches with an unanchored prefix.
97 ///
98 /// The default is MatchKind::LeftmostFirst.
99 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
100 self.match_kind = kind;
101 self
102 }
103
104 /// The set of bytes to use that will cause the DFA to enter a quit state,
105 /// stop searching and return an error. By default, this is empty.
106 pub fn quit(&mut self, set: ByteSet) -> &mut Config {
107 self.quit = set;
108 self
109 }
110
111 /// The limit, in bytes of the heap, that the DFA is permitted to use. This
112 /// does not include the auxiliary heap storage used by determinization.
113 pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
114 self.dfa_size_limit = bytes;
115 self
116 }
117
118 /// The limit, in bytes of the heap, that determinization itself is allowed
119 /// to use. This does not include the size of the DFA being built.
120 pub fn determinize_size_limit(
121 &mut self,
122 bytes: Option<usize>,
123 ) -> &mut Config {
124 self.determinize_size_limit = bytes;
125 self
126 }
127 }
128
129 /// The actual implementation of determinization that converts an NFA to a DFA
130 /// through powerset construction.
131 ///
132 /// This determinizer roughly follows the typical powerset construction, where
133 /// each DFA state is comprised of one or more NFA states. In the worst case,
134 /// there is one DFA state for every possible combination of NFA states. In
135 /// practice, this only happens in certain conditions, typically when there are
136 /// bounded repetitions.
137 ///
138 /// The main differences between this implementation and typical deteminization
139 /// are that this implementation delays matches by one state and hackily makes
140 /// look-around work. Comments below attempt to explain this.
141 ///
142 /// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
143 /// whichever is shorter.
144 #[derive(Debug)]
145 struct Runner<'a> {
146 /// The configuration used to initialize determinization.
147 config: Config,
148 /// The NFA we're converting into a DFA.
149 nfa: &'a thompson::NFA,
150 /// The DFA we're building.
151 dfa: &'a mut dense::OwnedDFA,
152 /// Each DFA state being built is defined as an *ordered* set of NFA
153 /// states, along with some meta facts about the ordered set of NFA states.
154 ///
155 /// This is never empty. The first state is always a dummy state such that
156 /// a state id == 0 corresponds to a dead state. The second state is always
157 /// the quit state.
158 ///
159 /// Why do we have states in both a `Vec` and in a cache map below?
160 /// Well, they serve two different roles based on access patterns.
161 /// `builder_states` is the canonical home of each state, and provides
162 /// constant random access by a DFA state's ID. The cache map below, on
163 /// the other hand, provides a quick way of searching for identical DFA
164 /// states by using the DFA state as a key in the map. Of course, we use
165 /// reference counting to avoid actually duplicating the state's data
166 /// itself. (Although this has never been benchmarked.) Note that the cache
167 /// map does not give us full minimization; it just lets us avoid some very
168 /// obvious redundant states.
169 ///
170 /// Note that the index into this Vec isn't quite the DFA's state ID.
171 /// Rather, it's just an index. To get the state ID, you have to multiply
172 /// it by the DFA's stride. That's done by self.dfa.from_index. And the
173 /// inverse is self.dfa.to_index.
174 ///
175 /// Moreover, DFA states don't usually retain the IDs assigned to them
176 /// by their position in this Vec. After determinization completes,
177 /// states are shuffled around to support other optimizations. See the
178 /// sibling 'special' module for more details on that. (The reason for
179 /// mentioning this is that if you print out the DFA for debugging during
180 /// determinization, and then print out the final DFA after it is fully
181 /// built, then the state IDs likely won't match up.)
182 builder_states: Vec<State>,
183 /// A cache of DFA states that already exist and can be easily looked up
184 /// via ordered sets of NFA states.
185 ///
186 /// See `builder_states` docs for why we store states in two different
187 /// ways.
188 cache: StateMap,
189 /// The memory usage, in bytes, used by builder_states and cache. We track
190 /// this as new states are added since states use a variable amount of
191 /// heap. Tracking this as we add states makes it possible to compute the
192 /// total amount of memory used by the determinizer in constant time.
193 memory_usage_state: usize,
194 /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
195 /// These are reused throughout determinization. A bounded sparse set
196 /// gives us constant time insertion, membership testing and clearing.
197 sparses: SparseSets,
198 /// Scratch space for a stack of NFA states to visit, for depth first
199 /// visiting without recursion.
200 stack: Vec<StateID>,
201 /// Scratch space for storing an ordered sequence of NFA states, for
202 /// amortizing allocation. This is principally useful for when we avoid
203 /// adding a new DFA state since it already exists. In order to detect this
204 /// case though, we still need an ordered set of NFA state IDs. So we use
205 /// this space to stage that ordered set before we know whether we need to
206 /// create a new DFA state or not.
207 scratch_state_builder: StateBuilderEmpty,
208 }
209
210 /// A map from states to state identifiers. When using std, we use a standard
211 /// hashmap, since it's a bit faster for this use case. (Other maps, like
212 /// one's based on FNV, have not yet been benchmarked.)
213 ///
214 /// The main purpose of this map is to reuse states where possible. This won't
215 /// fully minimize the DFA, but it works well in a lot of cases.
216 #[cfg(feature = "std")]
217 type StateMap = std::collections::HashMap<State, StateID>;
218 #[cfg(not(feature = "std"))]
219 type StateMap = BTreeMap<State, StateID>;
220
221 impl<'a> Runner<'a> {
222 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
223 /// the chosen state identifier representation is too small), then an error
224 /// is returned.
225 fn run(mut self) -> Result<(), Error> {
226 if self.nfa.has_word_boundary_unicode()
227 && !self.config.quit.contains_range(0x80, 0xFF)
228 {
229 return Err(Error::unsupported_dfa_word_boundary_unicode());
230 }
231
232 // A sequence of "representative" bytes drawn from each equivalence
233 // class. These representative bytes are fed to the NFA to compute
234 // state transitions. This allows us to avoid re-computing state
235 // transitions for bytes that are guaranteed to produce identical
236 // results.
237 let representatives: Vec<alphabet::Unit> =
238 self.dfa.byte_classes().representatives().collect();
239 // The set of all DFA state IDs that still need to have their
240 // transitions set. We start by seeding this with all starting states.
241 let mut uncompiled = alloc::vec![];
242 self.add_all_starts(&mut uncompiled)?;
243 while let Some(dfa_id) = uncompiled.pop() {
244 for &unit in &representatives {
245 if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
246 {
247 continue;
248 }
249 // In many cases, the state we transition to has already been
250 // computed. 'cached_state' will do the minimal amount of work
251 // to check this, and if it exists, immediately return an
252 // already existing state ID.
253 let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
254 self.dfa.set_transition(dfa_id, unit, next_dfa_id);
255 // If the state ID we got back is newly created, then we need
256 // to compile it, so add it to our uncompiled frontier.
257 if is_new {
258 uncompiled.push(next_dfa_id);
259 }
260 }
261 }
262 trace!(
263 "determinization complete, memory usage: {}, dense DFA size: {}",
264 self.memory_usage(),
265 self.dfa.memory_usage(),
266 );
267
268 // A map from DFA state ID to one or more NFA match IDs. Each NFA match
269 // ID corresponds to a distinct regex pattern that matches in the state
270 // corresponding to the key.
271 let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
272 self.cache.clear();
273 #[allow(unused_variables)]
274 let mut total_pat_count = 0;
275 for (i, state) in self.builder_states.into_iter().enumerate() {
276 if let Some(pat_ids) = state.match_pattern_ids() {
277 let id = self.dfa.from_index(i);
278 total_pat_count += pat_ids.len();
279 matches.insert(id, pat_ids);
280 }
281 }
282 log! {
283 use core::mem::size_of;
284 let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
285 let pats = total_pat_count * size_of::<PatternID>();
286 let mem = (matches.len() * per_elem) + pats;
287 log::trace!("matches map built, memory usage: {}", mem);
288 }
289 // At this point, we shuffle the "special" states in the final DFA.
290 // This permits a DFA's match loop to detect a match condition (among
291 // other things) by merely inspecting the current state's identifier,
292 // and avoids the need for any additional auxiliary storage.
293 self.dfa.shuffle(matches)?;
294 Ok(())
295 }
296
297 /// Return the identifier for the next DFA state given an existing DFA
298 /// state and an input byte. If the next DFA state already exists, then
299 /// return its identifier from the cache. Otherwise, build the state, cache
300 /// it and return its identifier.
301 ///
302 /// This routine returns a boolean indicating whether a new state was
303 /// built. If a new state is built, then the caller needs to add it to its
304 /// frontier of uncompiled DFA states to compute transitions for.
305 fn cached_state(
306 &mut self,
307 dfa_id: StateID,
308 unit: alphabet::Unit,
309 ) -> Result<(StateID, bool), Error> {
310 // Compute the set of all reachable NFA states, including epsilons.
311 let empty_builder = self.get_state_builder();
312 let builder = util::determinize::next(
313 self.nfa,
314 self.config.match_kind,
315 &mut self.sparses,
316 &mut self.stack,
317 &self.builder_states[self.dfa.to_index(dfa_id)],
318 unit,
319 empty_builder,
320 );
321 self.maybe_add_state(builder)
322 }
323
324 /// Compute the set of DFA start states and add their identifiers in
325 /// 'dfa_state_ids' (no duplicates are added).
326 fn add_all_starts(
327 &mut self,
328 dfa_state_ids: &mut Vec<StateID>,
329 ) -> Result<(), Error> {
330 // Always add the (possibly unanchored) start states for matching any
331 // of the patterns in this DFA.
332 self.add_start_group(None, dfa_state_ids)?;
333 // We only need to compute anchored start states for each pattern if it
334 // was requested to do so.
335 if self.dfa.has_starts_for_each_pattern() {
336 for pid in PatternID::iter(self.dfa.pattern_count()) {
337 self.add_start_group(Some(pid), dfa_state_ids)?;
338 }
339 }
340 Ok(())
341 }
342
343 /// Add a group of start states for the given match pattern ID. Any new
344 /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
345 /// pushed.)
346 ///
347 /// When pattern_id is None, then this will compile a group of unanchored
348 /// start states (if the DFA is unanchored). When the pattern_id is
349 /// present, then this will compile a group of anchored start states that
350 /// only match the given pattern.
351 fn add_start_group(
352 &mut self,
353 pattern_id: Option<PatternID>,
354 dfa_state_ids: &mut Vec<StateID>,
355 ) -> Result<(), Error> {
356 let nfa_start = match pattern_id {
357 Some(pid) => self.nfa.start_pattern(pid),
358 None if self.config.anchored => self.nfa.start_anchored(),
359 None => self.nfa.start_unanchored(),
360 };
361
362 // When compiling start states, we're careful not to build additional
363 // states that aren't necessary. For example, if the NFA has no word
364 // boundary assertion, then there's no reason to have distinct start
365 // states for 'NonWordByte' and 'WordByte' starting configurations.
366 // Instead, the 'WordByte' starting configuration can just point
367 // directly to the start state for the 'NonWordByte' config.
368
369 let (id, is_new) =
370 self.add_one_start(nfa_start, Start::NonWordByte)?;
371 self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
372 if is_new {
373 dfa_state_ids.push(id);
374 }
375
376 if !self.nfa.has_word_boundary() {
377 self.dfa.set_start_state(Start::WordByte, pattern_id, id);
378 } else {
379 let (id, is_new) =
380 self.add_one_start(nfa_start, Start::WordByte)?;
381 self.dfa.set_start_state(Start::WordByte, pattern_id, id);
382 if is_new {
383 dfa_state_ids.push(id);
384 }
385 }
386 if !self.nfa.has_any_anchor() {
387 self.dfa.set_start_state(Start::Text, pattern_id, id);
388 self.dfa.set_start_state(Start::Line, pattern_id, id);
389 } else {
390 let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
391 self.dfa.set_start_state(Start::Text, pattern_id, id);
392 if is_new {
393 dfa_state_ids.push(id);
394 }
395
396 let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
397 self.dfa.set_start_state(Start::Line, pattern_id, id);
398 if is_new {
399 dfa_state_ids.push(id);
400 }
401 }
402
403 Ok(())
404 }
405
406 /// Add a new DFA start state corresponding to the given starting NFA
407 /// state, and the starting search configuration. (The starting search
408 /// configuration essentially tells us which look-behind assertions are
409 /// true for this particular state.)
410 ///
411 /// The boolean returned indicates whether the state ID returned is a newly
412 /// created state, or a previously cached state.
413 fn add_one_start(
414 &mut self,
415 nfa_start: StateID,
416 start: Start,
417 ) -> Result<(StateID, bool), Error> {
418 // Compute the look-behind assertions that are true in this starting
419 // configuration, and the determine the epsilon closure. While
420 // computing the epsilon closure, we only follow condiional epsilon
421 // transitions that satisfy the look-behind assertions in 'facts'.
422 let mut builder_matches = self.get_state_builder().into_matches();
423 util::determinize::set_lookbehind_from_start(
424 &start,
425 &mut builder_matches,
426 );
427 self.sparses.set1.clear();
428 util::determinize::epsilon_closure(
429 self.nfa,
430 nfa_start,
431 *builder_matches.look_have(),
432 &mut self.stack,
433 &mut self.sparses.set1,
434 );
435 let mut builder = builder_matches.into_nfa();
436 util::determinize::add_nfa_states(
437 &self.nfa,
438 &self.sparses.set1,
439 &mut builder,
440 );
441 self.maybe_add_state(builder)
442 }
443
444 /// Adds the given state to the DFA being built depending on whether it
445 /// already exists in this determinizer's cache.
446 ///
447 /// If it does exist, then the memory used by 'state' is put back into the
448 /// determinizer and the previously created state's ID is returned. (Along
449 /// with 'false', indicating that no new state was added.)
450 ///
451 /// If it does not exist, then the state is added to the DFA being built
452 /// and a fresh ID is allocated (if ID allocation fails, then an error is
453 /// returned) and returned. (Along with 'true', indicating that a new state
454 /// was added.)
455 fn maybe_add_state(
456 &mut self,
457 builder: StateBuilderNFA,
458 ) -> Result<(StateID, bool), Error> {
459 if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
460 // Since we have a cached state, put the constructed state's
461 // memory back into our scratch space, so that it can be reused.
462 self.put_state_builder(builder);
463 return Ok((cached_id, false));
464 }
465 self.add_state(builder).map(|sid| (sid, true))
466 }
467
468 /// Add the given state to the DFA and make it available in the cache.
469 ///
470 /// The state initially has no transitions. That is, it transitions to the
471 /// dead state for all possible inputs, and transitions to the quit state
472 /// for all quit bytes.
473 ///
474 /// If adding the state would exceed the maximum value for StateID, then an
475 /// error is returned.
476 fn add_state(
477 &mut self,
478 builder: StateBuilderNFA,
479 ) -> Result<StateID, Error> {
480 let id = self.dfa.add_empty_state()?;
481 if !self.config.quit.is_empty() {
482 for b in self.config.quit.iter() {
483 self.dfa.set_transition(
484 id,
485 alphabet::Unit::u8(b),
486 self.dfa.quit_id(),
487 );
488 }
489 }
490 let state = builder.to_state();
491 // States use reference counting internally, so we only need to count
492 // their memroy usage once.
493 self.memory_usage_state += state.memory_usage();
494 self.builder_states.push(state.clone());
495 self.cache.insert(state, id);
496 self.put_state_builder(builder);
497 if let Some(limit) = self.config.dfa_size_limit {
498 if self.dfa.memory_usage() > limit {
499 return Err(Error::dfa_exceeded_size_limit(limit));
500 }
501 }
502 if let Some(limit) = self.config.determinize_size_limit {
503 if self.memory_usage() > limit {
504 return Err(Error::determinize_exceeded_size_limit(limit));
505 }
506 }
507 Ok(id)
508 }
509
510 /// Returns a state builder from this determinizer that might have existing
511 /// capacity. This helps avoid allocs in cases where a state is built that
512 /// turns out to already be cached.
513 ///
514 /// Callers must put the state builder back with 'put_state_builder',
515 /// otherwise the allocation reuse won't work.
516 fn get_state_builder(&mut self) -> StateBuilderEmpty {
517 core::mem::replace(
518 &mut self.scratch_state_builder,
519 StateBuilderEmpty::new(),
520 )
521 }
522
523 /// Puts the given state builder back into this determinizer for reuse.
524 ///
525 /// Note that building a 'State' from a builder always creates a new
526 /// alloc, so callers should always put the builder back.
527 fn put_state_builder(&mut self, builder: StateBuilderNFA) {
528 let _ = core::mem::replace(
529 &mut self.scratch_state_builder,
530 builder.clear(),
531 );
532 }
533
534 /// Return the memory usage, in bytes, of this determinizer at the current
535 /// point in time. This does not include memory used by the NFA or the
536 /// dense DFA itself.
537 fn memory_usage(&self) -> usize {
538 use core::mem::size_of;
539
540 self.builder_states.len() * size_of::<State>()
541 // Maps likely use more memory than this, but it's probably close.
542 + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
543 + self.memory_usage_state
544 + self.stack.capacity() * size_of::<StateID>()
545 + self.scratch_state_builder.capacity()
546 }
547 }