vendor/regex-automata/src/dfa/determinize.rs

   1 use alloc::{
   2     collections::BTreeMap,
   3     vec::{self, Vec},
   4 };
   5
   6 use crate::{
   7     dfa::{dense, Error, DEAD},
   8     nfa::thompson,
   9     util::{
  10         self,
  11         alphabet::{self, ByteSet},
  12         determinize::{State, StateBuilderEmpty, StateBuilderNFA},
  13         id::{PatternID, StateID},
  14         matchtypes::MatchKind,
  15         sparse_set::{SparseSet, SparseSets},
  16         start::Start,
  17     },
  18 };
  19
  20 /// A builder for configuring and running a DFA determinizer.
  21 #[derive(Clone, Debug)]
  22 pub(crate) struct Config {
  23     anchored: bool,
  24     match_kind: MatchKind,
  25     quit: ByteSet,
  26     dfa_size_limit: Option<usize>,
  27     determinize_size_limit: Option<usize>,
  28 }
  29
  30 impl Config {
  31     /// Create a new default config for a determinizer. The determinizer may be
  32     /// configured before calling `run`.
  33     pub fn new() -> Config {
  34         Config {
  35             anchored: false,
  36             match_kind: MatchKind::LeftmostFirst,
  37             quit: ByteSet::empty(),
  38             dfa_size_limit: None,
  39             determinize_size_limit: None,
  40         }
  41     }
  42
  43     /// Run determinization on the given NFA and write the resulting DFA into
  44     /// the one given. The DFA given should be initialized but otherwise empty.
  45     /// "Initialized" means that it is setup to handle the NFA's byte classes,
  46     /// number of patterns and whether to build start states for each pattern.
  47     pub fn run(
  48         &self,
  49         nfa: &thompson::NFA,
  50         dfa: &mut dense::OwnedDFA,
  51     ) -> Result<(), Error> {
  52         let dead = State::dead();
  53         let quit = State::dead();
  54         let mut cache = StateMap::default();
  55         // We only insert the dead state here since its representation is
  56         // identical to the quit state. And we never want anything pointing
  57         // to the quit state other than specific transitions derived from the
  58         // determinizer's configured "quit" bytes.
  59         //
  60         // We do put the quit state into 'builder_states' below. This ensures
  61         // that a proper DFA state ID is allocated for it, and that no other
  62         // DFA state uses the "location after the DEAD state." That is, it
  63         // is assumed that the quit state is always the state immediately
  64         // following the DEAD state.
  65         cache.insert(dead.clone(), DEAD);
  66
  67         let runner = Runner {
  68             config: self.clone(),
  69             nfa,
  70             dfa,
  71             builder_states: alloc::vec![dead, quit],
  72             cache,
  73             memory_usage_state: 0,
  74             sparses: SparseSets::new(nfa.len()),
  75             stack: alloc::vec![],
  76             scratch_state_builder: StateBuilderEmpty::new(),
  77         };
  78         runner.run()
  79     }
  80
  81     /// Whether to build an anchored DFA or not. When disabled (the default),
  82     /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
  83     /// the anchored start state of the NFA is used to start the DFA.
  84     pub fn anchored(&mut self, yes: bool) -> &mut Config {
  85         self.anchored = yes;
  86         self
  87     }
  88
  89     /// The match semantics to use for determinization.
  90     ///
  91     /// MatchKind::All corresponds to the standard textbook construction.
  92     /// All possible match states are represented in the DFA.
  93     /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
  94     /// simulate the match semantics of backtracking regex engines. Namely,
  95     /// only a subset of match states are built, and dead states are used to
  96     /// stop searches with an unanchored prefix.
  97     ///
  98     /// The default is MatchKind::LeftmostFirst.
  99     pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
 100         self.match_kind = kind;
 101         self
 102     }
 103
 104     /// The set of bytes to use that will cause the DFA to enter a quit state,
 105     /// stop searching and return an error. By default, this is empty.
 106     pub fn quit(&mut self, set: ByteSet) -> &mut Config {
 107         self.quit = set;
 108         self
 109     }
 110
 111     /// The limit, in bytes of the heap, that the DFA is permitted to use. This
 112     /// does not include the auxiliary heap storage used by determinization.
 113     pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
 114         self.dfa_size_limit = bytes;
 115         self
 116     }
 117
 118     /// The limit, in bytes of the heap, that determinization itself is allowed
 119     /// to use. This does not include the size of the DFA being built.
 120     pub fn determinize_size_limit(
 121         &mut self,
 122         bytes: Option<usize>,
 123     ) -> &mut Config {
 124         self.determinize_size_limit = bytes;
 125         self
 126     }
 127 }
 128
 129 /// The actual implementation of determinization that converts an NFA to a DFA
 130 /// through powerset construction.
 131 ///
 132 /// This determinizer roughly follows the typical powerset construction, where
 133 /// each DFA state is comprised of one or more NFA states. In the worst case,
 134 /// there is one DFA state for every possible combination of NFA states. In
 135 /// practice, this only happens in certain conditions, typically when there are
 136 /// bounded repetitions.
 137 ///
 138 /// The main differences between this implementation and typical deteminization
 139 /// are that this implementation delays matches by one state and hackily makes
 140 /// look-around work. Comments below attempt to explain this.
 141 ///
 142 /// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
 143 /// whichever is shorter.
 144 #[derive(Debug)]
 145 struct Runner<'a> {
 146     /// The configuration used to initialize determinization.
 147     config: Config,
 148     /// The NFA we're converting into a DFA.
 149     nfa: &'a thompson::NFA,
 150     /// The DFA we're building.
 151     dfa: &'a mut dense::OwnedDFA,
 152     /// Each DFA state being built is defined as an *ordered* set of NFA
 153     /// states, along with some meta facts about the ordered set of NFA states.
 154     ///
 155     /// This is never empty. The first state is always a dummy state such that
 156     /// a state id == 0 corresponds to a dead state. The second state is always
 157     /// the quit state.
 158     ///
 159     /// Why do we have states in both a `Vec` and in a cache map below?
 160     /// Well, they serve two different roles based on access patterns.
 161     /// `builder_states` is the canonical home of each state, and provides
 162     /// constant random access by a DFA state's ID. The cache map below, on
 163     /// the other hand, provides a quick way of searching for identical DFA
 164     /// states by using the DFA state as a key in the map. Of course, we use
 165     /// reference counting to avoid actually duplicating the state's data
 166     /// itself. (Although this has never been benchmarked.) Note that the cache
 167     /// map does not give us full minimization; it just lets us avoid some very
 168     /// obvious redundant states.
 169     ///
 170     /// Note that the index into this Vec isn't quite the DFA's state ID.
 171     /// Rather, it's just an index. To get the state ID, you have to multiply
 172     /// it by the DFA's stride. That's done by self.dfa.from_index. And the
 173     /// inverse is self.dfa.to_index.
 174     ///
 175     /// Moreover, DFA states don't usually retain the IDs assigned to them
 176     /// by their position in this Vec. After determinization completes,
 177     /// states are shuffled around to support other optimizations. See the
 178     /// sibling 'special' module for more details on that. (The reason for
 179     /// mentioning this is that if you print out the DFA for debugging during
 180     /// determinization, and then print out the final DFA after it is fully
 181     /// built, then the state IDs likely won't match up.)
 182     builder_states: Vec<State>,
 183     /// A cache of DFA states that already exist and can be easily looked up
 184     /// via ordered sets of NFA states.
 185     ///
 186     /// See `builder_states` docs for why we store states in two different
 187     /// ways.
 188     cache: StateMap,
 189     /// The memory usage, in bytes, used by builder_states and cache. We track
 190     /// this as new states are added since states use a variable amount of
 191     /// heap. Tracking this as we add states makes it possible to compute the
 192     /// total amount of memory used by the determinizer in constant time.
 193     memory_usage_state: usize,
 194     /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
 195     /// These are reused throughout determinization. A bounded sparse set
 196     /// gives us constant time insertion, membership testing and clearing.
 197     sparses: SparseSets,
 198     /// Scratch space for a stack of NFA states to visit, for depth first
 199     /// visiting without recursion.
 200     stack: Vec<StateID>,
 201     /// Scratch space for storing an ordered sequence of NFA states, for
 202     /// amortizing allocation. This is principally useful for when we avoid
 203     /// adding a new DFA state since it already exists. In order to detect this
 204     /// case though, we still need an ordered set of NFA state IDs. So we use
 205     /// this space to stage that ordered set before we know whether we need to
 206     /// create a new DFA state or not.
 207     scratch_state_builder: StateBuilderEmpty,
 208 }
 209
 210 /// A map from states to state identifiers. When using std, we use a standard
 211 /// hashmap, since it's a bit faster for this use case. (Other maps, like
 212 /// one's based on FNV, have not yet been benchmarked.)
 213 ///
 214 /// The main purpose of this map is to reuse states where possible. This won't
 215 /// fully minimize the DFA, but it works well in a lot of cases.
 216 #[cfg(feature = "std")]
 217 type StateMap = std::collections::HashMap<State, StateID>;
 218 #[cfg(not(feature = "std"))]
 219 type StateMap = BTreeMap<State, StateID>;
 220
 221 impl<'a> Runner<'a> {
 222     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
 223     /// the chosen state identifier representation is too small), then an error
 224     /// is returned.
 225     fn run(mut self) -> Result<(), Error> {
 226         if self.nfa.has_word_boundary_unicode()
 227             && !self.config.quit.contains_range(0x80, 0xFF)
 228         {
 229             return Err(Error::unsupported_dfa_word_boundary_unicode());
 230         }
 231
 232         // A sequence of "representative" bytes drawn from each equivalence
 233         // class. These representative bytes are fed to the NFA to compute
 234         // state transitions. This allows us to avoid re-computing state
 235         // transitions for bytes that are guaranteed to produce identical
 236         // results.
 237         let representatives: Vec<alphabet::Unit> =
 238             self.dfa.byte_classes().representatives().collect();
 239         // The set of all DFA state IDs that still need to have their
 240         // transitions set. We start by seeding this with all starting states.
 241         let mut uncompiled = alloc::vec![];
 242         self.add_all_starts(&mut uncompiled)?;
 243         while let Some(dfa_id) = uncompiled.pop() {
 244             for &unit in &representatives {
 245                 if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
 246                 {
 247                     continue;
 248                 }
 249                 // In many cases, the state we transition to has already been
 250                 // computed. 'cached_state' will do the minimal amount of work
 251                 // to check this, and if it exists, immediately return an
 252                 // already existing state ID.
 253                 let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
 254                 self.dfa.set_transition(dfa_id, unit, next_dfa_id);
 255                 // If the state ID we got back is newly created, then we need
 256                 // to compile it, so add it to our uncompiled frontier.
 257                 if is_new {
 258                     uncompiled.push(next_dfa_id);
 259                 }
 260             }
 261         }
 262         trace!(
 263             "determinization complete, memory usage: {}, dense DFA size: {}",
 264             self.memory_usage(),
 265             self.dfa.memory_usage(),
 266         );
 267
 268         // A map from DFA state ID to one or more NFA match IDs. Each NFA match
 269         // ID corresponds to a distinct regex pattern that matches in the state
 270         // corresponding to the key.
 271         let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
 272         self.cache.clear();
 273         #[allow(unused_variables)]
 274         let mut total_pat_count = 0;
 275         for (i, state) in self.builder_states.into_iter().enumerate() {
 276             if let Some(pat_ids) = state.match_pattern_ids() {
 277                 let id = self.dfa.from_index(i);
 278                 total_pat_count += pat_ids.len();
 279                 matches.insert(id, pat_ids);
 280             }
 281         }
 282         log! {
 283             use core::mem::size_of;
 284             let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
 285             let pats = total_pat_count * size_of::<PatternID>();
 286             let mem = (matches.len() * per_elem) + pats;
 287             log::trace!("matches map built, memory usage: {}", mem);
 288         }
 289         // At this point, we shuffle the "special" states in the final DFA.
 290         // This permits a DFA's match loop to detect a match condition (among
 291         // other things) by merely inspecting the current state's identifier,
 292         // and avoids the need for any additional auxiliary storage.
 293         self.dfa.shuffle(matches)?;
 294         Ok(())
 295     }
 296
 297     /// Return the identifier for the next DFA state given an existing DFA
 298     /// state and an input byte. If the next DFA state already exists, then
 299     /// return its identifier from the cache. Otherwise, build the state, cache
 300     /// it and return its identifier.
 301     ///
 302     /// This routine returns a boolean indicating whether a new state was
 303     /// built. If a new state is built, then the caller needs to add it to its
 304     /// frontier of uncompiled DFA states to compute transitions for.
 305     fn cached_state(
 306         &mut self,
 307         dfa_id: StateID,
 308         unit: alphabet::Unit,
 309     ) -> Result<(StateID, bool), Error> {
 310         // Compute the set of all reachable NFA states, including epsilons.
 311         let empty_builder = self.get_state_builder();
 312         let builder = util::determinize::next(
 313             self.nfa,
 314             self.config.match_kind,
 315             &mut self.sparses,
 316             &mut self.stack,
 317             &self.builder_states[self.dfa.to_index(dfa_id)],
 318             unit,
 319             empty_builder,
 320         );
 321         self.maybe_add_state(builder)
 322     }
 323
 324     /// Compute the set of DFA start states and add their identifiers in
 325     /// 'dfa_state_ids' (no duplicates are added).
 326     fn add_all_starts(
 327         &mut self,
 328         dfa_state_ids: &mut Vec<StateID>,
 329     ) -> Result<(), Error> {
 330         // Always add the (possibly unanchored) start states for matching any
 331         // of the patterns in this DFA.
 332         self.add_start_group(None, dfa_state_ids)?;
 333         // We only need to compute anchored start states for each pattern if it
 334         // was requested to do so.
 335         if self.dfa.has_starts_for_each_pattern() {
 336             for pid in PatternID::iter(self.dfa.pattern_count()) {
 337                 self.add_start_group(Some(pid), dfa_state_ids)?;
 338             }
 339         }
 340         Ok(())
 341     }
 342
 343     /// Add a group of start states for the given match pattern ID. Any new
 344     /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
 345     /// pushed.)
 346     ///
 347     /// When pattern_id is None, then this will compile a group of unanchored
 348     /// start states (if the DFA is unanchored). When the pattern_id is
 349     /// present, then this will compile a group of anchored start states that
 350     /// only match the given pattern.
 351     fn add_start_group(
 352         &mut self,
 353         pattern_id: Option<PatternID>,
 354         dfa_state_ids: &mut Vec<StateID>,
 355     ) -> Result<(), Error> {
 356         let nfa_start = match pattern_id {
 357             Some(pid) => self.nfa.start_pattern(pid),
 358             None if self.config.anchored => self.nfa.start_anchored(),
 359             None => self.nfa.start_unanchored(),
 360         };
 361
 362         // When compiling start states, we're careful not to build additional
 363         // states that aren't necessary. For example, if the NFA has no word
 364         // boundary assertion, then there's no reason to have distinct start
 365         // states for 'NonWordByte' and 'WordByte' starting configurations.
 366         // Instead, the 'WordByte' starting configuration can just point
 367         // directly to the start state for the 'NonWordByte' config.
 368
 369         let (id, is_new) =
 370             self.add_one_start(nfa_start, Start::NonWordByte)?;
 371         self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
 372         if is_new {
 373             dfa_state_ids.push(id);
 374         }
 375
 376         if !self.nfa.has_word_boundary() {
 377             self.dfa.set_start_state(Start::WordByte, pattern_id, id);
 378         } else {
 379             let (id, is_new) =
 380                 self.add_one_start(nfa_start, Start::WordByte)?;
 381             self.dfa.set_start_state(Start::WordByte, pattern_id, id);
 382             if is_new {
 383                 dfa_state_ids.push(id);
 384             }
 385         }
 386         if !self.nfa.has_any_anchor() {
 387             self.dfa.set_start_state(Start::Text, pattern_id, id);
 388             self.dfa.set_start_state(Start::Line, pattern_id, id);
 389         } else {
 390             let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
 391             self.dfa.set_start_state(Start::Text, pattern_id, id);
 392             if is_new {
 393                 dfa_state_ids.push(id);
 394             }
 395
 396             let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
 397             self.dfa.set_start_state(Start::Line, pattern_id, id);
 398             if is_new {
 399                 dfa_state_ids.push(id);
 400             }
 401         }
 402
 403         Ok(())
 404     }
 405
 406     /// Add a new DFA start state corresponding to the given starting NFA
 407     /// state, and the starting search configuration. (The starting search
 408     /// configuration essentially tells us which look-behind assertions are
 409     /// true for this particular state.)
 410     ///
 411     /// The boolean returned indicates whether the state ID returned is a newly
 412     /// created state, or a previously cached state.
 413     fn add_one_start(
 414         &mut self,
 415         nfa_start: StateID,
 416         start: Start,
 417     ) -> Result<(StateID, bool), Error> {
 418         // Compute the look-behind assertions that are true in this starting
 419         // configuration, and the determine the epsilon closure. While
 420         // computing the epsilon closure, we only follow condiional epsilon
 421         // transitions that satisfy the look-behind assertions in 'facts'.
 422         let mut builder_matches = self.get_state_builder().into_matches();
 423         util::determinize::set_lookbehind_from_start(
 424             &start,
 425             &mut builder_matches,
 426         );
 427         self.sparses.set1.clear();
 428         util::determinize::epsilon_closure(
 429             self.nfa,
 430             nfa_start,
 431             *builder_matches.look_have(),
 432             &mut self.stack,
 433             &mut self.sparses.set1,
 434         );
 435         let mut builder = builder_matches.into_nfa();
 436         util::determinize::add_nfa_states(
 437             &self.nfa,
 438             &self.sparses.set1,
 439             &mut builder,
 440         );
 441         self.maybe_add_state(builder)
 442     }
 443
 444     /// Adds the given state to the DFA being built depending on whether it
 445     /// already exists in this determinizer's cache.
 446     ///
 447     /// If it does exist, then the memory used by 'state' is put back into the
 448     /// determinizer and the previously created state's ID is returned. (Along
 449     /// with 'false', indicating that no new state was added.)
 450     ///
 451     /// If it does not exist, then the state is added to the DFA being built
 452     /// and a fresh ID is allocated (if ID allocation fails, then an error is
 453     /// returned) and returned. (Along with 'true', indicating that a new state
 454     /// was added.)
 455     fn maybe_add_state(
 456         &mut self,
 457         builder: StateBuilderNFA,
 458     ) -> Result<(StateID, bool), Error> {
 459         if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
 460             // Since we have a cached state, put the constructed state's
 461             // memory back into our scratch space, so that it can be reused.
 462             self.put_state_builder(builder);
 463             return Ok((cached_id, false));
 464         }
 465         self.add_state(builder).map(|sid| (sid, true))
 466     }
 467
 468     /// Add the given state to the DFA and make it available in the cache.
 469     ///
 470     /// The state initially has no transitions. That is, it transitions to the
 471     /// dead state for all possible inputs, and transitions to the quit state
 472     /// for all quit bytes.
 473     ///
 474     /// If adding the state would exceed the maximum value for StateID, then an
 475     /// error is returned.
 476     fn add_state(
 477         &mut self,
 478         builder: StateBuilderNFA,
 479     ) -> Result<StateID, Error> {
 480         let id = self.dfa.add_empty_state()?;
 481         if !self.config.quit.is_empty() {
 482             for b in self.config.quit.iter() {
 483                 self.dfa.set_transition(
 484                     id,
 485                     alphabet::Unit::u8(b),
 486                     self.dfa.quit_id(),
 487                 );
 488             }
 489         }
 490         let state = builder.to_state();
 491         // States use reference counting internally, so we only need to count
 492         // their memroy usage once.
 493         self.memory_usage_state += state.memory_usage();
 494         self.builder_states.push(state.clone());
 495         self.cache.insert(state, id);
 496         self.put_state_builder(builder);
 497         if let Some(limit) = self.config.dfa_size_limit {
 498             if self.dfa.memory_usage() > limit {
 499                 return Err(Error::dfa_exceeded_size_limit(limit));
 500             }
 501         }
 502         if let Some(limit) = self.config.determinize_size_limit {
 503             if self.memory_usage() > limit {
 504                 return Err(Error::determinize_exceeded_size_limit(limit));
 505             }
 506         }
 507         Ok(id)
 508     }
 509
 510     /// Returns a state builder from this determinizer that might have existing
 511     /// capacity. This helps avoid allocs in cases where a state is built that
 512     /// turns out to already be cached.
 513     ///
 514     /// Callers must put the state builder back with 'put_state_builder',
 515     /// otherwise the allocation reuse won't work.
 516     fn get_state_builder(&mut self) -> StateBuilderEmpty {
 517         core::mem::replace(
 518             &mut self.scratch_state_builder,
 519             StateBuilderEmpty::new(),
 520         )
 521     }
 522
 523     /// Puts the given state builder back into this determinizer for reuse.
 524     ///
 525     /// Note that building a 'State' from a builder always creates a new
 526     /// alloc, so callers should always put the builder back.
 527     fn put_state_builder(&mut self, builder: StateBuilderNFA) {
 528         let _ = core::mem::replace(
 529             &mut self.scratch_state_builder,
 530             builder.clear(),
 531         );
 532     }
 533
 534     /// Return the memory usage, in bytes, of this determinizer at the current
 535     /// point in time. This does not include memory used by the NFA or the
 536     /// dense DFA itself.
 537     fn memory_usage(&self) -> usize {
 538         use core::mem::size_of;
 539
 540         self.builder_states.len() * size_of::<State>()
 541         // Maps likely use more memory than this, but it's probably close.
 542         + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
 543         + self.memory_usage_state
 544         + self.stack.capacity() * size_of::<StateID>()
 545         + self.scratch_state_builder.capacity()
 546     }
 547 }