vendor/regex-automata-0.1.10/src/minimize.rs

   1 use std::cell::RefCell;
   2 use std::fmt;
   3 use std::mem;
   4 use std::rc::Rc;
   5
   6 use dense;
   7 use state_id::{dead_id, StateID};
   8
   9 type DFARepr<S> = dense::Repr<Vec<S>, S>;
  10
  11 /// An implementation of Hopcroft's algorithm for minimizing DFAs.
  12 ///
  13 /// The algorithm implemented here is mostly taken from Wikipedia:
  14 /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
  15 ///
  16 /// This code has had some light optimization attention paid to it,
  17 /// particularly in the form of reducing allocation as much as possible.
  18 /// However, it is still generally slow. Future optimization work should
  19 /// probably focus on the bigger picture rather than micro-optimizations. For
  20 /// example:
  21 ///
  22 /// 1. Figure out how to more intelligently create initial partitions. That is,
  23 ///    Hopcroft's algorithm starts by creating two partitions of DFA states
  24 ///    that are known to NOT be equivalent: match states and non-match states.
  25 ///    The algorithm proceeds by progressively refining these partitions into
  26 ///    smaller partitions. If we could start with more partitions, then we
  27 ///    could reduce the amount of work that Hopcroft's algorithm needs to do.
  28 /// 2. For every partition that we visit, we find all incoming transitions to
  29 ///    every state in the partition for *every* element in the alphabet. (This
  30 ///    is why using byte classes can significantly decrease minimization times,
  31 ///    since byte classes shrink the alphabet.) This is quite costly and there
  32 ///    is perhaps some redundant work being performed depending on the specific
  33 ///    states in the set. For example, we might be able to only visit some
  34 ///    elements of the alphabet based on the transitions.
  35 /// 3. Move parts of minimization into determinization. If minimization has
  36 ///    fewer states to deal with, then it should run faster. A prime example
  37 ///    of this might be large Unicode classes, which are generated in way that
  38 ///    can create a lot of redundant states. (Some work has been done on this
  39 ///    point during NFA compilation via the algorithm described in the
  40 ///    "Incremental Construction of MinimalAcyclic Finite-State Automata"
  41 ///    paper.)
  42 pub(crate) struct Minimizer<'a, S: 'a> {
  43     dfa: &'a mut DFARepr<S>,
  44     in_transitions: Vec<Vec<Vec<S>>>,
  45     partitions: Vec<StateSet<S>>,
  46     waiting: Vec<StateSet<S>>,
  47 }
  48
  49 impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
  50     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  51         f.debug_struct("Minimizer")
  52             .field("dfa", &self.dfa)
  53             .field("in_transitions", &self.in_transitions)
  54             .field("partitions", &self.partitions)
  55             .field("waiting", &self.waiting)
  56             .finish()
  57     }
  58 }
  59
  60 /// A set of states. A state set makes up a single partition in Hopcroft's
  61 /// algorithm.
  62 ///
  63 /// It is represented by an ordered set of state identifiers. We use shared
  64 /// ownership so that a single state set can be in both the set of partitions
  65 /// and in the set of waiting sets simultaneously without an additional
  66 /// allocation. Generally, once a state set is built, it becomes immutable.
  67 ///
  68 /// We use this representation because it avoids the overhead of more
  69 /// traditional set data structures (HashSet/BTreeSet), and also because
  70 /// computing intersection/subtraction on this representation is especially
  71 /// fast.
  72 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
  73 struct StateSet<S>(Rc<RefCell<Vec<S>>>);
  74
  75 impl<'a, S: StateID> Minimizer<'a, S> {
  76     pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
  77         let in_transitions = Minimizer::incoming_transitions(dfa);
  78         let partitions = Minimizer::initial_partitions(dfa);
  79         let waiting = vec![partitions[0].clone()];
  80
  81         Minimizer { dfa, in_transitions, partitions, waiting }
  82     }
  83
  84     pub fn run(mut self) {
  85         let mut incoming = StateSet::empty();
  86         let mut scratch1 = StateSet::empty();
  87         let mut scratch2 = StateSet::empty();
  88         let mut newparts = vec![];
  89
  90         while let Some(set) = self.waiting.pop() {
  91             for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
  92                 self.find_incoming_to(b, &set, &mut incoming);
  93
  94                 for p in 0..self.partitions.len() {
  95                     self.partitions[p].intersection(&incoming, &mut scratch1);
  96                     if scratch1.is_empty() {
  97                         newparts.push(self.partitions[p].clone());
  98                         continue;
  99                     }
 100
 101                     self.partitions[p].subtract(&incoming, &mut scratch2);
 102                     if scratch2.is_empty() {
 103                         newparts.push(self.partitions[p].clone());
 104                         continue;
 105                     }
 106
 107                     let (x, y) =
 108                         (scratch1.deep_clone(), scratch2.deep_clone());
 109                     newparts.push(x.clone());
 110                     newparts.push(y.clone());
 111                     match self.find_waiting(&self.partitions[p]) {
 112                         Some(i) => {
 113                             self.waiting[i] = x;
 114                             self.waiting.push(y);
 115                         }
 116                         None => {
 117                             if x.len() <= y.len() {
 118                                 self.waiting.push(x);
 119                             } else {
 120                                 self.waiting.push(y);
 121                             }
 122                         }
 123                     }
 124                 }
 125                 newparts = mem::replace(&mut self.partitions, newparts);
 126                 newparts.clear();
 127             }
 128         }
 129
 130         // At this point, we now have a minimal partitioning of states, where
 131         // each partition is an equivalence class of DFA states. Now we need to
 132         // use this partioning to update the DFA to only contain one state for
 133         // each partition.
 134
 135         // Create a map from DFA state ID to the representative ID of the
 136         // equivalence class to which it belongs. The representative ID of an
 137         // equivalence class of states is the minimum ID in that class.
 138         let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
 139         for p in &self.partitions {
 140             p.iter(|id| state_to_part[id.to_usize()] = p.min());
 141         }
 142
 143         // Generate a new contiguous sequence of IDs for minimal states, and
 144         // create a map from equivalence IDs to the new IDs. Thus, the new
 145         // minimal ID of *any* state in the unminimized DFA can be obtained
 146         // with minimals_ids[state_to_part[old_id]].
 147         let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
 148         let mut new_id = S::from_usize(0);
 149         for (id, _) in self.dfa.states() {
 150             if state_to_part[id.to_usize()] == id {
 151                 minimal_ids[id.to_usize()] = new_id;
 152                 new_id = S::from_usize(new_id.to_usize() + 1);
 153             }
 154         }
 155         // The total number of states in the minimal DFA.
 156         let minimal_count = new_id.to_usize();
 157
 158         // Re-map this DFA in place such that the only states remaining
 159         // correspond to the representative states of every equivalence class.
 160         for id in (0..self.dfa.state_count()).map(S::from_usize) {
 161             // If this state isn't a representative for an equivalence class,
 162             // then we skip it since it won't appear in the minimal DFA.
 163             if state_to_part[id.to_usize()] != id {
 164                 continue;
 165             }
 166             for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
 167                 *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
 168             }
 169             self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
 170         }
 171         // Trim off all unused states from the pre-minimized DFA. This
 172         // represents all states that were merged into a non-singleton
 173         // equivalence class of states, and appeared after the first state
 174         // in each such class. (Because the state with the smallest ID in each
 175         // equivalence class is its representative ID.)
 176         self.dfa.truncate_states(minimal_count);
 177
 178         // Update the new start state, which is now just the minimal ID of
 179         // whatever state the old start state was collapsed into.
 180         let old_start = self.dfa.start_state();
 181         self.dfa.set_start_state(
 182             minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
 183         );
 184
 185         // In order to update the ID of the maximum match state, we need to
 186         // find the maximum ID among all of the match states in the minimized
 187         // DFA. This is not necessarily the new ID of the unminimized maximum
 188         // match state, since that could have been collapsed with a much
 189         // earlier match state. Therefore, to find the new max match state,
 190         // we iterate over all previous match states, find their corresponding
 191         // new minimal ID, and take the maximum of those.
 192         let old_max = self.dfa.max_match_state();
 193         self.dfa.set_max_match_state(dead_id());
 194         for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
 195             let part = state_to_part[id.to_usize()];
 196             let new_id = minimal_ids[part.to_usize()];
 197             if new_id > self.dfa.max_match_state() {
 198                 self.dfa.set_max_match_state(new_id);
 199             }
 200         }
 201     }
 202
 203     fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
 204         self.waiting.iter().position(|s| s == set)
 205     }
 206
 207     fn find_incoming_to(
 208         &self,
 209         b: u8,
 210         set: &StateSet<S>,
 211         incoming: &mut StateSet<S>,
 212     ) {
 213         incoming.clear();
 214         set.iter(|id| {
 215             for &inid in &self.in_transitions[id.to_usize()][b as usize] {
 216                 incoming.add(inid);
 217             }
 218         });
 219         incoming.canonicalize();
 220     }
 221
 222     fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
 223         let mut is_match = StateSet::empty();
 224         let mut no_match = StateSet::empty();
 225         for (id, _) in dfa.states() {
 226             if dfa.is_match_state(id) {
 227                 is_match.add(id);
 228             } else {
 229                 no_match.add(id);
 230             }
 231         }
 232
 233         let mut sets = vec![is_match];
 234         if !no_match.is_empty() {
 235             sets.push(no_match);
 236         }
 237         sets.sort_by_key(|s| s.len());
 238         sets
 239     }
 240
 241     fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
 242         let mut incoming = vec![];
 243         for _ in dfa.states() {
 244             incoming.push(vec![vec![]; dfa.alphabet_len()]);
 245         }
 246         for (id, state) in dfa.states() {
 247             for (b, next) in state.transitions() {
 248                 incoming[next.to_usize()][b as usize].push(id);
 249             }
 250         }
 251         incoming
 252     }
 253 }
 254
 255 impl<S: StateID> StateSet<S> {
 256     fn empty() -> StateSet<S> {
 257         StateSet(Rc::new(RefCell::new(vec![])))
 258     }
 259
 260     fn add(&mut self, id: S) {
 261         self.0.borrow_mut().push(id);
 262     }
 263
 264     fn min(&self) -> S {
 265         self.0.borrow()[0]
 266     }
 267
 268     fn canonicalize(&mut self) {
 269         self.0.borrow_mut().sort();
 270         self.0.borrow_mut().dedup();
 271     }
 272
 273     fn clear(&mut self) {
 274         self.0.borrow_mut().clear();
 275     }
 276
 277     fn len(&self) -> usize {
 278         self.0.borrow().len()
 279     }
 280
 281     fn is_empty(&self) -> bool {
 282         self.len() == 0
 283     }
 284
 285     fn deep_clone(&self) -> StateSet<S> {
 286         let ids = self.0.borrow().iter().cloned().collect();
 287         StateSet(Rc::new(RefCell::new(ids)))
 288     }
 289
 290     fn iter<F: FnMut(S)>(&self, mut f: F) {
 291         for &id in self.0.borrow().iter() {
 292             f(id);
 293         }
 294     }
 295
 296     fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
 297         dest.clear();
 298         if self.is_empty() || other.is_empty() {
 299             return;
 300         }
 301
 302         let (seta, setb) = (self.0.borrow(), other.0.borrow());
 303         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
 304         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
 305         loop {
 306             if a == b {
 307                 dest.add(a);
 308                 a = match ita.next() {
 309                     None => break,
 310                     Some(a) => a,
 311                 };
 312                 b = match itb.next() {
 313                     None => break,
 314                     Some(b) => b,
 315                 };
 316             } else if a < b {
 317                 a = match ita.next() {
 318                     None => break,
 319                     Some(a) => a,
 320                 };
 321             } else {
 322                 b = match itb.next() {
 323                     None => break,
 324                     Some(b) => b,
 325                 };
 326             }
 327         }
 328     }
 329
 330     fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
 331         dest.clear();
 332         if self.is_empty() || other.is_empty() {
 333             self.iter(|s| dest.add(s));
 334             return;
 335         }
 336
 337         let (seta, setb) = (self.0.borrow(), other.0.borrow());
 338         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
 339         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
 340         loop {
 341             if a == b {
 342                 a = match ita.next() {
 343                     None => break,
 344                     Some(a) => a,
 345                 };
 346                 b = match itb.next() {
 347                     None => {
 348                         dest.add(a);
 349                         break;
 350                     }
 351                     Some(b) => b,
 352                 };
 353             } else if a < b {
 354                 dest.add(a);
 355                 a = match ita.next() {
 356                     None => break,
 357                     Some(a) => a,
 358                 };
 359             } else {
 360                 b = match itb.next() {
 361                     None => {
 362                         dest.add(a);
 363                         break;
 364                     }
 365                     Some(b) => b,
 366                 };
 367             }
 368         }
 369         for a in ita {
 370             dest.add(a);
 371         }
 372     }
 373 }