vendor/regex-automata/src/determinize.rs

   1 use std::collections::HashMap;
   2 use std::mem;
   3 use std::rc::Rc;
   4
   5 use dense;
   6 use error::Result;
   7 use nfa::{self, NFA};
   8 use sparse_set::SparseSet;
   9 use state_id::{dead_id, StateID};
  10
  11 type DFARepr<S> = dense::Repr<Vec<S>, S>;
  12
  13 /// A determinizer converts an NFA to a DFA.
  14 ///
  15 /// This determinizer follows the typical powerset construction, where each
  16 /// DFA state is comprised of one or more NFA states. In the worst case, there
  17 /// is one DFA state for every possible combination of NFA states. In practice,
  18 /// this only happens in certain conditions, typically when there are bounded
  19 /// repetitions.
  20 ///
  21 /// The type variable `S` refers to the chosen state identifier representation
  22 /// used for the DFA.
  23 ///
  24 /// The lifetime variable `'a` refers to the lifetime of the NFA being
  25 /// converted to a DFA.
  26 #[derive(Debug)]
  27 pub(crate) struct Determinizer<'a, S: StateID> {
  28     /// The NFA we're converting into a DFA.
  29     nfa: &'a NFA,
  30     /// The DFA we're building.
  31     dfa: DFARepr<S>,
  32     /// Each DFA state being built is defined as an *ordered* set of NFA
  33     /// states, along with a flag indicating whether the state is a match
  34     /// state or not.
  35     ///
  36     /// This is never empty. The first state is always a dummy state such that
  37     /// a state id == 0 corresponds to a dead state.
  38     builder_states: Vec<Rc<State>>,
  39     /// A cache of DFA states that already exist and can be easily looked up
  40     /// via ordered sets of NFA states.
  41     cache: HashMap<Rc<State>, S>,
  42     /// Scratch space for a stack of NFA states to visit, for depth first
  43     /// visiting without recursion.
  44     stack: Vec<nfa::StateID>,
  45     /// Scratch space for storing an ordered sequence of NFA states, for
  46     /// amortizing allocation.
  47     scratch_nfa_states: Vec<nfa::StateID>,
  48     /// Whether to build a DFA that finds the longest possible match.
  49     longest_match: bool,
  50 }
  51
  52 /// An intermediate representation for a DFA state during determinization.
  53 #[derive(Debug, Eq, Hash, PartialEq)]
  54 struct State {
  55     /// Whether this state is a match state or not.
  56     is_match: bool,
  57     /// An ordered sequence of NFA states that make up this DFA state.
  58     nfa_states: Vec<nfa::StateID>,
  59 }
  60
  61 impl<'a, S: StateID> Determinizer<'a, S> {
  62     /// Create a new determinizer for converting the given NFA to a DFA.
  63     pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
  64         let dead = Rc::new(State::dead());
  65         let mut cache = HashMap::default();
  66         cache.insert(dead.clone(), dead_id());
  67
  68         Determinizer {
  69             nfa,
  70             dfa: DFARepr::empty().anchored(nfa.is_anchored()),
  71             builder_states: vec![dead],
  72             cache,
  73             stack: vec![],
  74             scratch_nfa_states: vec![],
  75             longest_match: false,
  76         }
  77     }
  78
  79     /// Instruct the determinizer to use equivalence classes as the transition
  80     /// alphabet instead of all possible byte values.
  81     pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
  82         let byte_classes = self.nfa.byte_classes().clone();
  83         self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
  84             .anchored(self.nfa.is_anchored());
  85         self
  86     }
  87
  88     /// Instruct the determinizer to build a DFA that recognizes the longest
  89     /// possible match instead of the leftmost first match. This is useful when
  90     /// constructing reverse DFAs for finding the start of a match.
  91     pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
  92         self.longest_match = yes;
  93         self
  94     }
  95
  96     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
  97     /// the chosen state identifier representation is too small), then an error
  98     /// is returned.
  99     pub fn build(mut self) -> Result<DFARepr<S>> {
 100         let representative_bytes: Vec<u8> =
 101             self.dfa.byte_classes().representatives().collect();
 102         let mut sparse = self.new_sparse_set();
 103         let mut uncompiled = vec![self.add_start(&mut sparse)?];
 104         while let Some(dfa_id) = uncompiled.pop() {
 105             for &b in &representative_bytes {
 106                 let (next_dfa_id, is_new) =
 107                     self.cached_state(dfa_id, b, &mut sparse)?;
 108                 self.dfa.add_transition(dfa_id, b, next_dfa_id);
 109                 if is_new {
 110                     uncompiled.push(next_dfa_id);
 111                 }
 112             }
 113         }
 114
 115         // At this point, we shuffle the matching states in the final DFA to
 116         // the beginning. This permits a DFA's match loop to detect a match
 117         // condition by merely inspecting the current state's identifier, and
 118         // avoids the need for any additional auxiliary storage.
 119         let is_match: Vec<bool> =
 120             self.builder_states.iter().map(|s| s.is_match).collect();
 121         self.dfa.shuffle_match_states(&is_match);
 122         Ok(self.dfa)
 123     }
 124
 125     /// Return the identifier for the next DFA state given an existing DFA
 126     /// state and an input byte. If the next DFA state already exists, then
 127     /// return its identifier from the cache. Otherwise, build the state, cache
 128     /// it and return its identifier.
 129     ///
 130     /// The given sparse set is used for scratch space. It must have a capacity
 131     /// equivalent to the total number of NFA states, but its contents are
 132     /// otherwise unspecified.
 133     ///
 134     /// This routine returns a boolean indicating whether a new state was
 135     /// built. If a new state is built, then the caller needs to add it to its
 136     /// frontier of uncompiled DFA states to compute transitions for.
 137     fn cached_state(
 138         &mut self,
 139         dfa_id: S,
 140         b: u8,
 141         sparse: &mut SparseSet,
 142     ) -> Result<(S, bool)> {
 143         sparse.clear();
 144         // Compute the set of all reachable NFA states, including epsilons.
 145         self.next(dfa_id, b, sparse);
 146         // Build a candidate state and check if it has already been built.
 147         let state = self.new_state(sparse);
 148         if let Some(&cached_id) = self.cache.get(&state) {
 149             // Since we have a cached state, put the constructed state's
 150             // memory back into our scratch space, so that it can be reused.
 151             let _ =
 152                 mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
 153             return Ok((cached_id, false));
 154         }
 155         // Nothing was in the cache, so add this state to the cache.
 156         self.add_state(state).map(|s| (s, true))
 157     }
 158
 159     /// Compute the set of all eachable NFA states, including the full epsilon
 160     /// closure, from a DFA state for a single byte of input.
 161     fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
 162         next_nfa_states.clear();
 163         for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
 164             let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
 165             match *self.nfa.state(nfa_id) {
 166                 nfa::State::Union { .. }
 167                 | nfa::State::Fail
 168                 | nfa::State::Match => {}
 169                 nfa::State::Range { range: ref r } => {
 170                     if r.start <= b && b <= r.end {
 171                         self.epsilon_closure(r.next, next_nfa_states);
 172                     }
 173                 }
 174                 nfa::State::Sparse { ref ranges } => {
 175                     for r in ranges.iter() {
 176                         if r.start > b {
 177                             break;
 178                         } else if r.start <= b && b <= r.end {
 179                             self.epsilon_closure(r.next, next_nfa_states);
 180                             break;
 181                         }
 182                     }
 183                 }
 184             }
 185         }
 186     }
 187
 188     /// Compute the epsilon closure for the given NFA state.
 189     fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
 190         if !self.nfa.state(start).is_epsilon() {
 191             set.insert(start);
 192             return;
 193         }
 194
 195         self.stack.push(start);
 196         while let Some(mut id) = self.stack.pop() {
 197             loop {
 198                 if set.contains(id) {
 199                     break;
 200                 }
 201                 set.insert(id);
 202                 match *self.nfa.state(id) {
 203                     nfa::State::Range { .. }
 204                     | nfa::State::Sparse { .. }
 205                     | nfa::State::Fail
 206                     | nfa::State::Match => break,
 207                     nfa::State::Union { ref alternates } => {
 208                         id = match alternates.get(0) {
 209                             None => break,
 210                             Some(&id) => id,
 211                         };
 212                         self.stack.extend(alternates[1..].iter().rev());
 213                     }
 214                 }
 215             }
 216         }
 217     }
 218
 219     /// Compute the initial DFA state and return its identifier.
 220     ///
 221     /// The sparse set given is used for scratch space, and must have capacity
 222     /// equal to the total number of NFA states. Its contents are unspecified.
 223     fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
 224         sparse.clear();
 225         self.epsilon_closure(self.nfa.start(), sparse);
 226         let state = self.new_state(&sparse);
 227         let id = self.add_state(state)?;
 228         self.dfa.set_start_state(id);
 229         Ok(id)
 230     }
 231
 232     /// Add the given state to the DFA and make it available in the cache.
 233     ///
 234     /// The state initially has no transitions. That is, it transitions to the
 235     /// dead state for all possible inputs.
 236     fn add_state(&mut self, state: State) -> Result<S> {
 237         let id = self.dfa.add_empty_state()?;
 238         let rstate = Rc::new(state);
 239         self.builder_states.push(rstate.clone());
 240         self.cache.insert(rstate, id);
 241         Ok(id)
 242     }
 243
 244     /// Convert the given set of ordered NFA states to a DFA state.
 245     fn new_state(&mut self, set: &SparseSet) -> State {
 246         let mut state = State {
 247             is_match: false,
 248             nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
 249         };
 250         state.nfa_states.clear();
 251
 252         for &id in set {
 253             match *self.nfa.state(id) {
 254                 nfa::State::Range { .. } => {
 255                     state.nfa_states.push(id);
 256                 }
 257                 nfa::State::Sparse { .. } => {
 258                     state.nfa_states.push(id);
 259                 }
 260                 nfa::State::Fail => {
 261                     break;
 262                 }
 263                 nfa::State::Match => {
 264                     state.is_match = true;
 265                     if !self.longest_match {
 266                         break;
 267                     }
 268                 }
 269                 nfa::State::Union { .. } => {}
 270             }
 271         }
 272         state
 273     }
 274
 275     /// Create a new sparse set with enough capacity to hold all NFA states.
 276     fn new_sparse_set(&self) -> SparseSet {
 277         SparseSet::new(self.nfa.len())
 278     }
 279 }
 280
 281 impl State {
 282     /// Create a new empty dead state.
 283     fn dead() -> State {
 284         State { nfa_states: vec![], is_match: false }
 285     }
 286 }