/*!
This module contains types and routines for implementing determinization.

In this crate, there are at least two places where we implement
determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
lazily compiled DFAs in the `hybrid` module. The stuff in this module
corresponds to the things that are in common between these implementations.

There are three broad things that our implementations of determinization have
in common, as defined by this module:

* The classification of start states. That is, whether we're dealing with
word boundaries, line boundaries, etc., is all the same. This also includes
the look-behind assertions that are satisfied by each starting state
classification.

* The representation of DFA states as sets of NFA states, including
convenience types for building these DFA states that are amenable to reusing
allocations.

* Routines for the "classical" parts of determinization: computing the
epsilon closure, tracking match states (with corresponding pattern IDs, since
we support multi-pattern finite automata) and, of course, computing the
transition function between states for units of input.

I did consider a couple of alternatives to this particular form of code reuse:

1. Don't do any code reuse. The problem here is that we *really* want both
forms of determinization to do exactly identical things when it comes to
their handling of NFA states. While our tests generally ensure this, the code
is tricky and large enough that not reusing code is a pretty big bummer.

2. Implement all of determinization once and make it generic over fully
compiled DFAs and lazily compiled DFAs. While I didn't actually try this
approach, my instinct is that it would be more complex than is needed here.
And the interface required would be pretty hairy. Instead, I think splitting
it into logical sub-components works better.
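
The "classical" part above can be pictured with a deliberately tiny sketch.
To be clear, this is *not* this crate's representation or API (the real code
below uses `SparseSet`s, an explicit stack instead of recursion, look-around
conditions and multi-pattern match tracking); the toy `ToyState` type and the
two functions are hypothetical and only show the shape of subset construction:

```ignore
// A toy NFA: states are indices into a slice.
enum ToyState {
    // Consume exactly `byte`, then go to `next`.
    Byte { byte: u8, next: usize },
    // Epsilon-split to every state in `alternates`.
    Union { alternates: Vec<usize> },
    // A match state.
    Match,
}

// Every state reachable from `id` without consuming input (including `id`).
fn epsilon_closure(nfa: &[ToyState], id: usize, set: &mut Vec<usize>) {
    if set.contains(&id) {
        return;
    }
    set.push(id);
    if let ToyState::Union { alternates } = &nfa[id] {
        for &alt in alternates {
            epsilon_closure(nfa, alt, set);
        }
    }
}

// One determinization step: from a DFA state (an ordered set of NFA state
// IDs), compute the DFA state reached by consuming the byte `unit`.
fn next(nfa: &[ToyState], dfa_state: &[usize], unit: u8) -> Vec<usize> {
    let mut next_state = vec![];
    for &id in dfa_state {
        if let ToyState::Byte { byte, next } = &nfa[id] {
            if *byte == unit {
                epsilon_closure(nfa, *next, &mut next_state);
            }
        }
    }
    next_state
}
```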
*/

use alloc::vec::Vec;

pub(crate) use self::state::{
    State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
};

use crate::{
    nfa::thompson::{self, Look, LookSet},
    util::{
        alphabet,
        id::StateID,
        matchtypes::MatchKind,
        sparse_set::{SparseSet, SparseSets},
        start::Start,
    },
};

mod state;

/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
/// to whether it is a match state. For multi-pattern DFAs, the builder will
/// also include the pattern IDs that match (in the order seen).
///
/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
/// must have capacity equivalent to `nfa.len()`.
///
/// `match_kind` should correspond to the match semantics implemented by the
/// DFA being built. Generally speaking, for leftmost-first match semantics,
/// states that appear after the first NFA match state will not be included in
/// the `StateBuilderNFA` returned since they are impossible to visit.
///
/// `sparses` is used as scratch space for NFA traversal. Other than their
/// capacity requirements (detailed above), there are no requirements on what's
/// contained within them (if anything). Similarly, what's inside of them once
/// this routine returns is unspecified.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// `state` corresponds to the current DFA state on which one wants to compute
/// the transition for the input `unit`.
///
/// `empty_builder` corresponds to the builder allocation to use to produce a
/// complete `StateBuilderNFA` state. If the state is not needed (or is already
/// cached), then it can be cleared and reused without needing to create a new
/// `State`. The `StateBuilderNFA` state returned is final and ready to be
/// turned into a `State` if necessary.
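///
/// As a rough sketch (a summary of the code below, not a separate API), the
/// computation proceeds as follows:
///
/// ```text
/// seed sparses.set1 with the NFA state IDs in `state`
/// if `unit` newly satisfies look-around assertions that `state` depends on:
///     recompute the epsilon closure of set1 under those assertions
/// for each NFA state ID in set1:
///     Match        => record its pattern ID (matches are delayed by one byte)
///     Range/Sparse => if it matches `unit`, add the epsilon closure of its
///                     successor to set2
///     other states => skip (epsilon states were already handled)
/// build and return the new state from set2
/// ```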
pub(crate) fn next(
    nfa: &thompson::NFA,
    match_kind: MatchKind,
    sparses: &mut SparseSets,
    stack: &mut Vec<StateID>,
    state: &State,
    unit: alphabet::Unit,
    empty_builder: StateBuilderEmpty,
) -> StateBuilderNFA {
    sparses.clear();

    // Put the NFA state IDs into a sparse set in case we need to
    // re-compute their epsilon closure.
    //
    // Doing this state shuffling is technically not necessary unless some
    // kind of look-around is used in the DFA. Some ad hoc experiments
    // suggested that avoiding this didn't lead to much of an improvement,
    // but perhaps more rigorous experimentation should be done. And in
    // particular, avoiding this check requires some light refactoring of
    // the code below.
    state.iter_nfa_state_ids(|nfa_id| {
        sparses.set1.insert(nfa_id);
    });

    // Compute look-ahead assertions originating from the current state.
    // Based on the input unit we're transitioning over, some additional
    // set of assertions may be true. Thus, we re-compute this state's
    // epsilon closure (but only if necessary).
    if !state.look_need().is_empty() {
        // Add look-ahead assertions that are now true based on the current
        // input unit.
        let mut look_have = state.look_have().clone();
        match unit.as_u8() {
            Some(b'\n') => {
                look_have.insert(Look::EndLine);
            }
            Some(_) => {}
            None => {
                look_have.insert(Look::EndText);
                look_have.insert(Look::EndLine);
            }
        }
        if state.is_from_word() == unit.is_word_byte() {
            look_have.insert(Look::WordBoundaryUnicodeNegate);
            look_have.insert(Look::WordBoundaryAsciiNegate);
        } else {
            look_have.insert(Look::WordBoundaryUnicode);
            look_have.insert(Look::WordBoundaryAscii);
        }
        // If we have new assertions satisfied that are among the set of
        // assertions that exist in this state (that is, just because
        // we added an EndLine assertion above doesn't mean there is an
        // EndLine conditional epsilon transition in this state), then we
        // re-compute this state's epsilon closure using the updated set of
        // assertions.
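        //
        // For example: if this state has a conditional epsilon transition on
        // EndLine (so EndLine is in look_need), EndLine was not previously in
        // look_have and `unit` is b'\n', then EndLine is both newly satisfied
        // and relevant, so the closure must be re-computed.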
        if !look_have
            .subtract(state.look_have())
            .intersect(state.look_need())
            .is_empty()
        {
            for nfa_id in &sparses.set1 {
                epsilon_closure(
                    nfa,
                    nfa_id,
                    look_have,
                    stack,
                    &mut sparses.set2,
                );
            }
            sparses.swap();
            sparses.set2.clear();
        }
    }

    // Convert our empty builder into one that can record assertions and match
    // pattern IDs.
    let mut builder = empty_builder.into_matches();
    // Set whether the StartLine look-behind assertion is true for this
    // transition or not. The look-behind assertion for ASCII word boundaries
    // is handled below.
    if nfa.has_any_anchor() {
        if unit.as_u8().map_or(false, |b| b == b'\n') {
            // Why only handle StartLine here and not StartText? That's
            // because StartText can only impact the starting state, which
            // is special cased in start state handling.
            builder.look_have().insert(Look::StartLine);
        }
    }
    for nfa_id in &sparses.set1 {
        match *nfa.state(nfa_id) {
            thompson::State::Union { .. }
            | thompson::State::Fail
            | thompson::State::Look { .. }
            | thompson::State::Capture { .. } => {}
            thompson::State::Match { id } => {
                // Notice here that we are calling the NEW state a match
                // state if the OLD state we are transitioning from
                // contains an NFA match state. This is precisely how we
                // delay all matches by one byte and also what therefore
                // guarantees that starting states cannot be match states.
                //
                // If we didn't delay matches by one byte, then whether
                // a DFA is a matching state or not would be determined
                // by whether one of its own constituent NFA states
                // was a match state. (And that would be done in
                // 'add_nfa_states'.)
                //
                // Also, 'add_match_pattern_id' requires that callers never
                // pass duplicative pattern IDs. We do in fact uphold that
                // guarantee here, but it's subtle. In particular, a Thompson
                // NFA guarantees that each pattern has exactly one match
                // state. Moreover, since we're iterating over the NFA state
                // IDs in a set, we are guaranteed not to have any duplicative
                // match states. Thus, it is impossible to add the same pattern
                // ID more than once.
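                //
                // As an illustration of the delay: for the pattern 'a'
                // searching the haystack 'ab', the DFA state entered after
                // consuming 'a' contains the NFA match state, so it is the
                // state built *here* (the one entered after 'b' or EOI) that
                // gets flagged as matching; the search routines account for
                // this one byte delay when reporting the match offset.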
                builder.add_match_pattern_id(id);
                if !match_kind.continue_past_first_match() {
                    break;
                }
            }
            thompson::State::Range { range: ref r } => {
                if r.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        r.next,
                        *builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Sparse(ref sparse) => {
                if let Some(next) = sparse.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        *builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
        }
    }
    // We only set the "from word" flag if there's a word boundary look-around
    // anywhere in this regex. Otherwise, there's no point in bloating the
    // number of states if we don't have one.
    //
    // We also only set it when the state has a non-zero number of NFA states.
    // Otherwise, we could wind up with states that *should* be DEAD states
    // but are otherwise distinct from DEAD states because of this look-behind
    // assertion being set. While this can't technically impact correctness *in
    // theory*, it can create pathological DFAs that consume input until EOI or
    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
    // but a (serious) perf problem. Hitting a quit byte, however, could be a
    // correctness problem since it could cause search routines to report an
    // error instead of a detected match once the quit state is entered. (The
    // search routine could be made to be a bit smarter by reporting a match
    // if one was detected once it enters a quit state (and indeed, the search
    // routines in this crate do just that), but it seems better to prevent
    // these things by construction if possible.)
    if nfa.has_word_boundary()
        && unit.is_word_byte()
        && !sparses.set2.is_empty()
    {
        builder.set_is_from_word();
    }
    let mut builder_nfa = builder.into_nfa();
    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
    builder_nfa
}

/// Compute the epsilon closure for the given NFA state. The epsilon closure
/// consists of all NFA state IDs, including `start_nfa_id`, that can be
/// reached from `start_nfa_id` without consuming any input. These state IDs
/// are written to `set` in the order they are visited, but only if they are
/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
/// given.
///
/// `look_have` consists of the satisfied assertions at the current
/// position. For conditional look-around epsilon transitions, these are
/// only followed if they are satisfied by `look_have`.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
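///
/// As a rough illustration (with made-up state IDs, not the result of
/// compiling any particular pattern in this crate): if state `0` is a
/// `Union` with alternates `[1, 2]` and states `1` and `2` are `Range`
/// states, then:
///
/// ```text
/// epsilon_closure(0) = {0, 1, 2}   // the union itself and both alternates
/// epsilon_closure(1) = {1}         // a Range state is not an epsilon state
/// ```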
pub(crate) fn epsilon_closure(
    nfa: &thompson::NFA,
    start_nfa_id: StateID,
    look_have: LookSet,
    stack: &mut Vec<StateID>,
    set: &mut SparseSet,
) {
    assert!(stack.is_empty());
    // If this isn't an epsilon state, then the epsilon closure is always just
    // itself, so there's no need to spin up the machinery below to handle it.
    if !nfa.state(start_nfa_id).is_epsilon() {
        set.insert(start_nfa_id);
        return;
    }

    stack.push(start_nfa_id);
    while let Some(mut id) = stack.pop() {
        // In many cases, we can avoid stack operations when an NFA state only
        // adds one new state to visit. In that case, we just set our ID to
        // that state and mush on. We only use the stack when an NFA state
        // introduces multiple new states to visit.
        loop {
            // Insert this NFA state, and if it's already in the set and thus
            // already visited, then we can move on to the next one.
            if !set.insert(id) {
                break;
            }
            match *nfa.state(id) {
                thompson::State::Range { .. }
                | thompson::State::Sparse { .. }
                | thompson::State::Fail
                | thompson::State::Match { .. } => break,
                thompson::State::Look { look, next } => {
                    if !look_have.contains(look) {
                        break;
                    }
                    id = next;
                }
                thompson::State::Union { ref alternates } => {
                    id = match alternates.get(0) {
                        None => break,
                        Some(&id) => id,
                    };
                    // We need to process our alternates in order to preserve
                    // match preferences, so put the earliest alternates closer
                    // to the top of the stack.
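                    // (For example, with alternates [s1, s2, s3], we continue
                    // the inner loop with s1 and push [s3, s2], so that s2 is
                    // popped before s3.)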
                    stack.extend(alternates[1..].iter().rev());
                }
                thompson::State::Capture { next, .. } => {
                    id = next;
                }
            }
        }
    }
}

/// Add the NFA state IDs in the given `set` to the given DFA builder state.
/// The order in which states are added corresponds to the order in which they
/// were added to `set`.
///
/// The DFA builder state given should already have its complete set of match
/// pattern IDs added (if any) and any look-behind assertions (StartLine,
/// StartText and whether this state is being generated for a transition over a
/// word byte when applicable) that are true immediately prior to transitioning
/// into this state (via `builder.look_have()`). The match pattern IDs should
/// correspond to matches that occurred on the previous transition, since all
/// matches are delayed by one byte. The things that should _not_ be set are
/// look-ahead assertions (EndLine, EndText and whether the next byte is a
/// word byte or not). The builder state should also not have anything in
/// `look_need` set, as this routine will compute that for you.
///
/// The given NFA should be able to resolve all identifiers in `set` to a
/// particular NFA state. Additionally, `set` must have capacity equivalent
/// to `nfa.len()`.
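///
/// For reference, this is how each NFA state kind is treated below:
///
/// ```text
/// Range, Sparse  => added to the builder
/// Look           => added, and its assertion is recorded in look_need
/// Match          => added (needed to implement the one byte match delay)
/// Union, Capture => skipped (unconditional epsilon states carry no
///                   distinguishing information)
/// Fail           => stop adding any further states
/// ```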
pub(crate) fn add_nfa_states(
    nfa: &thompson::NFA,
    set: &SparseSet,
    builder: &mut StateBuilderNFA,
) {
    for nfa_id in set {
        match *nfa.state(nfa_id) {
            thompson::State::Range { .. } => {
                builder.add_nfa_state_id(nfa_id);
            }
            thompson::State::Sparse { .. } => {
                builder.add_nfa_state_id(nfa_id);
            }
            thompson::State::Look { look, .. } => {
                builder.add_nfa_state_id(nfa_id);
                builder.look_need().insert(look);
            }
            thompson::State::Union { .. }
            | thompson::State::Capture { .. } => {
                // Pure epsilon transitions don't need to be tracked
                // as part of the DFA state. Tracking them is actually
                // superfluous; they won't cause any harm other than making
                // determinization slower.
                //
                // Why aren't these needed? Well, in an NFA, epsilon
                // transitions are really just jumping points to other
                // states. So once you hit an epsilon transition, the same
                // set of resulting states always appears. Therefore,
                // putting them in a DFA's set of ordered NFA states is
                // strictly redundant.
                //
                // Look-around states are also epsilon transitions, but
                // they are *conditional*. So their presence could be
                // discriminatory, and thus, they are tracked above.
                //
                // But wait... why are epsilon states in our `set` in the
                // first place? Why not just leave them out? They're in
                // our `set` because it was generated by computing an
                // epsilon closure, and we want to keep track of all states
                // we visited to avoid re-visiting them. In exchange, we
                // have to do this second iteration over our collected
                // states to finalize our DFA state.
                //
                // Note that this optimization requires that we re-compute
                // the epsilon closure to account for look-ahead in 'next'
                // *only when necessary*. Namely, only when the set of
                // look-around assertions changes and only when those
                // changes are within the set of assertions that are
                // needed in order to step through the closure correctly.
                // Otherwise, if we re-do the epsilon closure needlessly,
                // it could change based on the fact that we are omitting
                // epsilon states here.
            }
            thompson::State::Fail => {
                break;
            }
            thompson::State::Match { .. } => {
                // Normally, the NFA match state doesn't actually need to
                // be inside the DFA state. But since we delay matches by
                // one byte, the matching DFA state corresponds to states
                // that transition from the one we're building here. And
                // the way we detect those cases is by looking for an NFA
                // match state. See 'next' for how this is handled.
                builder.add_nfa_state_id(nfa_id);
            }
        }
    }
    // If we know this state contains no look-around assertions, then
    // there's no reason to track which look-around assertions were
    // satisfied when this state was created.
    if builder.look_need().is_empty() {
        builder.look_have().clear();
    }
}

/// Sets the appropriate look-behind assertions on the given state based on
/// this starting configuration.
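///
/// Concretely, this is the mapping applied below:
///
/// ```text
/// Start::NonWordByte => (nothing)
/// Start::WordByte    => the "from word" flag
/// Start::Text        => StartText and StartLine
/// Start::Line        => StartLine
/// ```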
pub(crate) fn set_lookbehind_from_start(
    start: &Start,
    builder: &mut StateBuilderMatches,
) {
    match *start {
        Start::NonWordByte => {}
        Start::WordByte => {
            builder.set_is_from_word();
        }
        Start::Text => {
            builder.look_have().insert(Look::StartText);
            builder.look_have().insert(Look::StartLine);
        }
        Start::Line => {
            builder.look_have().insert(Look::StartLine);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::Start;

    #[test]
    #[should_panic]
    fn start_fwd_bad_range() {
        Start::from_position_fwd(&[], 0, 1);
    }

    #[test]
    #[should_panic]
    fn start_rev_bad_range() {
        Start::from_position_rev(&[], 0, 1);
    }

    #[test]
    fn start_fwd() {
        let f = Start::from_position_fwd;

        assert_eq!(Start::Text, f(&[], 0, 0));
        assert_eq!(Start::Text, f(b"abc", 0, 3));
        assert_eq!(Start::Text, f(b"\nabc", 0, 3));

        assert_eq!(Start::Line, f(b"\nabc", 1, 3));

        assert_eq!(Start::WordByte, f(b"abc", 1, 3));

        assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
    }

    #[test]
    fn start_rev() {
        let f = Start::from_position_rev;

        assert_eq!(Start::Text, f(&[], 0, 0));
        assert_eq!(Start::Text, f(b"abc", 0, 3));
        assert_eq!(Start::Text, f(b"abc\n", 0, 4));

        assert_eq!(Start::Line, f(b"abc\nz", 0, 3));

        assert_eq!(Start::WordByte, f(b"abc", 0, 2));

        assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
    }
}