1 /*
2 This module provides an NFA compiler using Thompson's construction
3 algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
4 graph as output. The NFA graph is structured in a way that permits it to be
5 executed by a virtual machine and also used to efficiently build a DFA.
6
7 The compiler deals with a slightly expanded set of NFA states that notably
8 includes an empty node that has exactly one epsilon transition to the next
9 state. In other words, it's a "goto" instruction if one views Thompson's NFA
10 as a set of bytecode instructions. These goto instructions are removed in
11 a subsequent phase before returning the NFA to the caller. The purpose of
12 these empty nodes is that they make the construction algorithm substantially
13 simpler to implement. We remove them before returning to the caller because
14 they can represent substantial overhead when traversing the NFA graph
15 (either while searching using the NFA directly or while building a DFA).
16
17 In the future, it would be nice to provide a Glushkov compiler as well,
18 as it would work well as a bit-parallel NFA for smaller regexes. But
19 the Thompson construction is one I'm more familiar with and seems more
20 straight-forward to deal with when it comes to large Unicode character
21 classes.
22
23 Internally, the compiler uses interior mutability to improve composition
24 in the face of the borrow checker. In particular, we'd really like to be
25 able to write things like this:
26
27 self.c_concat(exprs.iter().map(|e| self.c(e)))
28
29 Which elegantly uses iterators to build up a sequence of compiled regex
30 sub-expressions and then hands it off to the concatenating compiler
31 routine. Without interior mutability, the borrow checker won't let us
32 borrow `self` mutably both inside and outside the closure at the same
33 time.
34 */
35
36 use core::{
37 borrow::Borrow,
38 cell::{Cell, RefCell},
39 mem,
40 };
41
42 use alloc::{sync::Arc, vec, vec::Vec};
43
44 use regex_syntax::{
45 hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
46 utf8::{Utf8Range, Utf8Sequences},
47 ParserBuilder,
48 };
49
50 use crate::{
51 nfa::thompson::{
52 error::Error,
53 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
54 range_trie::RangeTrie,
55 Look, SparseTransitions, State, Transition, NFA,
56 },
57 util::{
58 alphabet::ByteClassSet,
59 id::{IteratorIDExt, PatternID, StateID},
60 },
61 };
62
63 /// The configuration used for compiling a Thompson NFA from a regex pattern.
64 #[derive(Clone, Copy, Debug, Default)]
65 pub struct Config {
66 reverse: Option<bool>,
67 utf8: Option<bool>,
68 nfa_size_limit: Option<Option<usize>>,
69 shrink: Option<bool>,
70 captures: Option<bool>,
71 #[cfg(test)]
72 unanchored_prefix: Option<bool>,
73 }
74
75 impl Config {
76 /// Return a new default Thompson NFA compiler configuration.
77 pub fn new() -> Config {
78 Config::default()
79 }
80
81 /// Reverse the NFA.
82 ///
83 /// An NFA reversal is performed by reversing all of the concatenated
84 /// sub-expressions in the original pattern, recursively. The resulting
85 /// NFA can be used to match the pattern starting from the end of a string
86 /// instead of the beginning of a string.
87 ///
88 /// Reversing the NFA is useful for building a reverse DFA, which is most
89 /// useful for finding the start of a match after its ending position has
90 /// been found.
91 ///
92 /// This is disabled by default.
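///
/// # Example
///
/// A minimal sketch of building a reverse NFA. Since the reversed NFA is
/// typically only consumed when building a reverse DFA, this example only
/// checks that compilation succeeds:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().reverse(true))
///     .build(r"foo[0-9]+")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```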
93 pub fn reverse(mut self, yes: bool) -> Config {
94 self.reverse = Some(yes);
95 self
96 }
97
98 /// Whether to enable UTF-8 mode or not.
99 ///
100 /// When UTF-8 mode is enabled (which is the default), unanchored searches
101 /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
102 /// an unanchored search will stop at that point. This is equivalent to
103 /// putting a `(?s:.)*?` at the start of the regex.
104 ///
105 /// When UTF-8 mode is disabled, then unanchored searches will match
106 /// through any arbitrary byte. This is equivalent to putting a
107 /// `(?s-u:.)*?` at the start of the regex.
108 ///
109 /// Generally speaking, UTF-8 mode should only be used when you know you
110 /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
111 /// on input that is not valid UTF-8, then the regex is not likely to work
112 /// as expected.
113 ///
114 /// This is enabled by default.
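///
/// # Example
///
/// A minimal sketch of disabling UTF-8 mode, so that the implicit
/// `(?s-u:.)*?` prefix is used for unanchored searches:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().utf8(false))
///     .build(r"foo")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```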
115 pub fn utf8(mut self, yes: bool) -> Config {
116 self.utf8 = Some(yes);
117 self
118 }
119
120 /// Sets an approximate size limit on the total heap used by the NFA being
121 /// compiled.
122 ///
123 /// This permits imposing constraints on the size of a compiled NFA. This
124 /// may be useful in contexts where the regex pattern is untrusted and one
125 /// wants to avoid using too much memory.
126 ///
127 /// This size limit does not apply to auxiliary heap used during
128 /// compilation that is not part of the built NFA.
129 ///
130 /// Note that this size limit is applied during compilation in order for
131 /// the limit to prevent too much heap from being used. However, the
132 /// implementation may use an intermediate NFA representation that is
133 /// otherwise slightly bigger than the final public form. Since the size
134 /// limit may be applied to an intermediate representation, there is not
135 /// necessarily a precise correspondence between the configured size limit
136 /// and the heap usage of the final NFA.
137 ///
138 /// There is no size limit by default.
139 ///
140 /// # Example
141 ///
142 /// This example demonstrates how Unicode mode can greatly increase the
143 /// size of the NFA.
144 ///
145 /// ```
146 /// use regex_automata::nfa::thompson::NFA;
147 ///
148 /// // 300KB isn't enough!
149 /// NFA::builder()
150 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
151 /// .build(r"\w{20}")
152 /// .unwrap_err();
153 ///
154 /// // ... but 400KB probably is.
155 /// let nfa = NFA::builder()
156 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
157 /// .build(r"\w{20}")?;
158 ///
159 /// assert_eq!(nfa.pattern_len(), 1);
160 ///
161 /// # Ok::<(), Box<dyn std::error::Error>>(())
162 /// ```
163 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
164 self.nfa_size_limit = Some(bytes);
165 self
166 }
167
168 /// Apply best effort heuristics to shrink the NFA at the expense of more
169 /// time/memory.
170 ///
171 /// Generally speaking, if one is using an NFA to compile a DFA, then the
172 /// extra time used to shrink the NFA will be more than made up for during
173 /// DFA construction (potentially by a lot). In other words, enabling this
174 /// can substantially decrease the overall amount of time it takes to build
175 /// a DFA.
176 ///
177 /// The only reason to disable this is if you want to compile an NFA and
178 /// start using it as quickly as possible without needing to build a full
179 /// DFA, e.g., for an NFA simulation or for a lazy DFA.
180 ///
181 /// This is enabled by default.
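///
/// # Example
///
/// A minimal sketch of disabling shrinking, e.g., when the NFA will be
/// simulated directly rather than used to build a full DFA:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().shrink(false))
///     .build(r"\w")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```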
182 pub fn shrink(mut self, yes: bool) -> Config {
183 self.shrink = Some(yes);
184 self
185 }
186
187 /// Whether to include 'Capture' states in the NFA.
188 ///
189 /// This can only be enabled when compiling a forward NFA. This is
190 /// always disabled---with no way to override it---when the `reverse`
191 /// configuration is enabled.
192 ///
193 /// This is enabled by default.
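///
/// # Example
///
/// A minimal sketch of disabling capture states, which is useful when only
/// overall match spans are needed:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().captures(false))
///     .build(r"(a)(b)")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```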
194 pub fn captures(mut self, yes: bool) -> Config {
195 self.captures = Some(yes);
196 self
197 }
198
199 /// Whether to compile an unanchored prefix into this NFA.
200 ///
201 /// This is enabled by default. It is made available for tests only to make
202 /// it easier to unit test the output of the compiler.
203 #[cfg(test)]
204 fn unanchored_prefix(mut self, yes: bool) -> Config {
205 self.unanchored_prefix = Some(yes);
206 self
207 }
208
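/// Return whether reverse mode is enabled for this configuration.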
209 pub fn get_reverse(&self) -> bool {
210 self.reverse.unwrap_or(false)
211 }
212
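/// Return whether UTF-8 mode is enabled for this configuration.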
213 pub fn get_utf8(&self) -> bool {
214 self.utf8.unwrap_or(true)
215 }
216
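/// Return the NFA size limit, if one was set.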
217 pub fn get_nfa_size_limit(&self) -> Option<usize> {
218 self.nfa_size_limit.unwrap_or(None)
219 }
220
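/// Return whether NFA shrinking is enabled for this configuration.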
221 pub fn get_shrink(&self) -> bool {
222 self.shrink.unwrap_or(true)
223 }
224
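/// Return whether 'Capture' states will be compiled. This always returns
/// false when reverse mode is enabled, regardless of the `captures` setting.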
225 pub fn get_captures(&self) -> bool {
226 !self.get_reverse() && self.captures.unwrap_or(true)
227 }
228
229 fn get_unanchored_prefix(&self) -> bool {
230 #[cfg(test)]
231 {
232 self.unanchored_prefix.unwrap_or(true)
233 }
234 #[cfg(not(test))]
235 {
236 true
237 }
238 }
239
240 pub(crate) fn overwrite(self, o: Config) -> Config {
241 Config {
242 reverse: o.reverse.or(self.reverse),
243 utf8: o.utf8.or(self.utf8),
244 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
245 shrink: o.shrink.or(self.shrink),
246 captures: o.captures.or(self.captures),
247 #[cfg(test)]
248 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
249 }
250 }
251 }
252
253 /// A builder for compiling an NFA.
254 #[derive(Clone, Debug)]
255 pub struct Builder {
256 config: Config,
257 parser: ParserBuilder,
258 }
259
260 impl Builder {
261 /// Create a new NFA builder with its default configuration.
262 pub fn new() -> Builder {
263 Builder { config: Config::default(), parser: ParserBuilder::new() }
264 }
265
266 /// Compile the given regular expression into an NFA.
267 ///
268 /// If there was a problem parsing the regex, then that error is returned.
269 ///
270 /// Otherwise, if there was a problem building the NFA, then an error is
271 /// returned. The only error that can occur is if the compiled regex would
272 /// exceed the size limits configured on this builder.
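///
/// # Example
///
/// A minimal sketch of compiling a single pattern with the default
/// configuration:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder().build(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```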
273 pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
274 self.build_many(&[pattern])
275 }
276
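/// Compile the given regular expressions into a single NFA that matches any
/// of the patterns.
///
/// Pattern IDs are assigned in the order in which the patterns are given.
///
/// # Example
///
/// A minimal sketch of compiling two patterns into one NFA:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder().build_many(&["[a-z]+", "[0-9]+"])?;
/// assert_eq!(nfa.pattern_len(), 2);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```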
277 pub fn build_many<P: AsRef<str>>(
278 &self,
279 patterns: &[P],
280 ) -> Result<NFA, Error> {
281 let mut hirs = vec![];
282 for p in patterns {
283 hirs.push(
284 self.parser
285 .build()
286 .parse(p.as_ref())
287 .map_err(Error::syntax)?,
288 );
289 log!(log::trace!("parsed: {:?}", p.as_ref()));
290 }
291 self.build_many_from_hir(&hirs)
292 }
293
294 /// Compile the given high level intermediate representation of a regular
295 /// expression into an NFA.
296 ///
297 /// If there was a problem building the NFA, then an error is returned. The
298 /// only error that can occur is if the compiled regex would exceed the
299 /// size limits configured on this builder.
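///
/// # Example
///
/// A minimal sketch, assuming `regex-syntax` is used to produce the HIR
/// (the same crate this builder uses internally for parsing):
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
/// use regex_syntax::ParserBuilder;
///
/// let hir = ParserBuilder::new().build().parse(r"[a-z]+")?;
/// let nfa = NFA::builder().build_from_hir(&hir)?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```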
300 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
301 self.build_from_hir_with(&mut Compiler::new(), expr)
302 }
303
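/// Compile the given high level intermediate representations of regular
/// expressions into a single NFA that matches any of the corresponding
/// patterns.
///
/// Pattern IDs are assigned in the order in which the expressions are given.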
304 pub fn build_many_from_hir<H: Borrow<Hir>>(
305 &self,
306 exprs: &[H],
307 ) -> Result<NFA, Error> {
308 self.build_many_from_hir_with(&mut Compiler::new(), exprs)
309 }
310
311 /// Compile the given high level intermediate representation of a regular
312 /// expression into an NFA using the given compiler. Callers may prefer
313 /// this over `build_from_hir` if they would like to reuse the compiler's
314 /// allocations while compiling many regular expressions.
315 ///
316 /// On success, the compiled NFA is returned. The compiler itself is
317 /// reconfigured and reset before compilation begins.
318 ///
319 /// If there was a problem building the NFA, then an error is returned.
320 /// The only error that can occur is if the compiled regex would exceed
321 /// the size limits configured on this builder. When an error is returned,
322 /// the state of the given compiler is unspecified, but it can still be
323 /// reused in subsequent calls to this method.
324 fn build_from_hir_with(
325 &self,
326 compiler: &mut Compiler,
327 expr: &Hir,
328 ) -> Result<NFA, Error> {
329 self.build_many_from_hir_with(compiler, &[expr])
330 }
331
332 fn build_many_from_hir_with<H: Borrow<Hir>>(
333 &self,
334 compiler: &mut Compiler,
335 exprs: &[H],
336 ) -> Result<NFA, Error> {
337 compiler.configure(self.config);
338 compiler.compile(exprs)
339 }
340
341 /// Apply the given NFA configuration options to this builder.
342 pub fn configure(&mut self, config: Config) -> &mut Builder {
343 self.config = self.config.overwrite(config);
344 self
345 }
346
347 /// Set the syntax configuration for this builder using
348 /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
349 ///
350 /// This permits setting things like case insensitivity, Unicode and multi
351 /// line mode.
352 ///
353 /// This syntax configuration generally only applies when an NFA is built
354 /// directly from a pattern string. If an NFA is built from an HIR, then
355 /// all syntax settings are ignored.
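///
/// # Example
///
/// A minimal sketch of relaxing the syntax-level UTF-8 requirement so that
/// patterns may match arbitrary bytes (the NFA-level `utf8` option is
/// disabled as well, mirroring this crate's own tests):
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, SyntaxConfig};
///
/// let nfa = NFA::builder()
///     .syntax(SyntaxConfig::new().utf8(false))
///     .configure(NFA::config().utf8(false))
///     .build(r"(?-u)\xFF")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```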
356 pub fn syntax(
357 &mut self,
358 config: crate::util::syntax::SyntaxConfig,
359 ) -> &mut Builder {
360 config.apply(&mut self.parser);
361 self
362 }
363 }
364
365 /// A compiler that converts a regex abstract syntax to an NFA via Thompson's
366 /// construction. Namely, this compiler permits epsilon transitions between
367 /// states.
368 #[derive(Clone, Debug)]
369 pub struct Compiler {
370 /// The configuration from the builder.
371 config: Config,
372 /// The final NFA that is built.
373 ///
374 /// Parts of this NFA are constructed during compilation, but the actual
375 /// states aren't added until a final "finish" step. This is because the
376 /// states constructed during compilation have unconditional epsilon
377 /// transitions, which makes the logic of compilation much simpler. The
378 /// "finish" step removes these unconditional epsilon transitions and must
379 /// therefore remap all of the transition state IDs.
380 nfa: RefCell<NFA>,
381 /// The set of compiled NFA states. Once a state is compiled, it is
382 /// assigned a state ID equivalent to its index in this list. Subsequent
383 /// compilation can modify previous states by adding new transitions.
384 states: RefCell<Vec<CState>>,
385 /// State used for compiling character classes to UTF-8 byte automata.
386 /// State is not retained between character class compilations. This just
387 /// serves to amortize allocation to the extent possible.
388 utf8_state: RefCell<Utf8State>,
389 /// State used for arranging character classes in reverse into a trie.
390 trie_state: RefCell<RangeTrie>,
391 /// State used for caching common suffixes when compiling reverse UTF-8
392 /// automata (for Unicode character classes).
393 utf8_suffix: RefCell<Utf8SuffixMap>,
394 /// A map used to re-map state IDs when translating the compiler's internal
395 /// NFA state representation to the external NFA representation.
396 remap: RefCell<Vec<StateID>>,
397 /// A set of compiler internal state IDs that correspond to states that are
398 /// exclusively epsilon transitions, i.e., goto instructions, combined with
399 /// the state that they point to. This is used to record said states while
400 /// transforming the compiler's internal NFA representation to the external
401 /// form.
402 empties: RefCell<Vec<(StateID, StateID)>>,
403 /// The total memory used by each of the 'CState's in 'states'. This only
404 /// includes heap usage by each state, and not the size of the state
405 /// itself.
406 memory_cstates: Cell<usize>,
407 }
408
409 /// A compiler intermediate state representation for an NFA that is only used
410 /// during compilation. Once compilation is done, `CState`s are converted
411 /// to `State`s (defined in the parent module), which have a much simpler
412 /// representation.
413 #[derive(Clone, Debug, Eq, PartialEq)]
414 enum CState {
415 /// An empty state whose only purpose is to forward the automaton to
416 /// another state via an epsilon transition. These are useful during
417 /// compilation but are otherwise removed at the end.
418 Empty {
419 next: StateID,
420 },
421 /// An empty state that records a capture location.
422 ///
423 /// From the perspective of finite automata, this is precisely equivalent
424 /// to 'Empty', but serves the purpose of instructing NFA simulations to
425 /// record additional state when the finite state machine passes through
426 /// this epsilon transition.
427 ///
428 /// These transitions are treated as epsilon transitions with no additional
429 /// effects in DFAs.
430 ///
431 /// 'slot' in this context refers to the specific capture group offset that
432 /// is being recorded. Each capturing group has two slots corresponding to
433 /// the start and end of the matching portion of that group.
434 CaptureStart {
435 next: StateID,
436 capture_index: u32,
437 name: Option<Arc<str>>,
438 },
439 CaptureEnd {
440 next: StateID,
441 capture_index: u32,
442 },
443 /// A state that only transitions to `next` if the current input byte is
444 /// in the range `[start, end]` (inclusive on both ends).
445 Range {
446 range: Transition,
447 },
448 /// A state with possibly many transitions, represented in a sparse
449 /// fashion. Transitions are ordered lexicographically by input range.
450 /// As such, this may only be used when every transition has equal
451 /// priority. (In practice, this is only used for encoding large UTF-8
452 /// automata.) In contrast, a `Union` state has each alternate in order
453 /// of priority. Priority is used to implement greedy matching and also
454 /// alternations themselves, e.g., `abc|a` where `abc` has priority over
455 /// `a`.
456 ///
457 /// To clarify, it is possible to remove `Sparse` and represent all things
458 /// that `Sparse` is used for via `Union`. But this creates a more bloated
459 /// NFA with more epsilon transitions than is necessary in the special case
460 /// of character classes.
461 Sparse {
462 ranges: Vec<Transition>,
463 },
464 /// A conditional epsilon transition satisfied via some sort of
465 /// look-around.
466 Look {
467 look: Look,
468 next: StateID,
469 },
470 /// An alternation such that there exists an epsilon transition to all
471 /// states in `alternates`, where matches found via earlier transitions
472 /// are preferred over later transitions.
473 Union {
474 alternates: Vec<StateID>,
475 },
476 /// An alternation such that there exists an epsilon transition to all
477 /// states in `alternates`, where matches found via later transitions are
478 /// preferred over earlier transitions.
479 ///
480 /// This "reverse" state exists as a convenience during compilation that
481 /// permits easy construction of non-greedy combinations of NFA states. At
482 /// the end of compilation, Union and UnionReverse states are merged into
483 /// one Union type of state, where the latter has its epsilon transitions
484 /// reversed to reflect the priority inversion.
485 ///
486 /// The "convenience" here arises from the fact that as new states are
487 /// added to the list of `alternates`, we would like that add operation
488 /// to be amortized constant time. But if we used a `Union`, we'd need to
489 /// prepend the state, which takes O(n) time. There are other approaches we
490 /// could use to solve this, but this seems simple enough.
491 UnionReverse {
492 alternates: Vec<StateID>,
493 },
494 /// A match state. There is at most one such occurrence of this state in
495 /// an NFA for each pattern compiled into the NFA. At time of writing, a
496 /// match state is always produced for every pattern given, but in theory,
497 /// if a pattern can never lead to a match, then the match state could be
498 /// omitted.
499 ///
500 /// `pattern_id` refers to the ID of the pattern itself, which corresponds
501 /// to the pattern's index (starting at 0). `start_id` refers to the
502 /// anchored NFA starting state corresponding to this pattern.
503 Match {
504 pattern_id: PatternID,
505 start_id: StateID,
506 },
507 }
508
509 /// A value that represents the result of compiling a sub-expression of a
510 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
511 /// has an initial state at `start` and a final state at `end`.
512 #[derive(Clone, Copy, Debug)]
513 pub struct ThompsonRef {
514 start: StateID,
515 end: StateID,
516 }
517
518 impl Compiler {
519 /// Create a new compiler.
520 pub fn new() -> Compiler {
521 Compiler {
522 config: Config::default(),
523 nfa: RefCell::new(NFA::empty()),
524 states: RefCell::new(vec![]),
525 utf8_state: RefCell::new(Utf8State::new()),
526 trie_state: RefCell::new(RangeTrie::new()),
527 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
528 remap: RefCell::new(vec![]),
529 empties: RefCell::new(vec![]),
530 memory_cstates: Cell::new(0),
531 }
532 }
533
534 /// Configure and prepare this compiler from the builder's knobs.
535 ///
536 /// The compiler must always be reconfigured by the builder before using it
537 /// to build an NFA. Namely, this will also clear any latent state in the
538 /// compiler used during previous compilations.
539 fn configure(&mut self, config: Config) {
540 self.config = config;
541 self.nfa.borrow_mut().clear();
542 self.states.borrow_mut().clear();
543 self.memory_cstates.set(0);
544 // We don't need to clear anything else since they are cleared on
545 // their own and only when they are used.
546 }
547
548 /// Compile the given expressions and convert the resulting intermediate NFA to its final form.
549 fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
550 if exprs.is_empty() {
551 return Ok(NFA::never_match());
552 }
553 if exprs.len() > PatternID::LIMIT {
554 return Err(Error::too_many_patterns(exprs.len()));
555 }
556
557 // We always add an unanchored prefix unless we were specifically told
558 // not to (for tests only), or if we know that the regex is anchored
559 // for all matches. When an unanchored prefix is not added, then the
560 // NFA's anchored and unanchored start states are equivalent.
561 let all_anchored =
562 exprs.iter().all(|e| e.borrow().is_anchored_start());
563 let anchored = !self.config.get_unanchored_prefix() || all_anchored;
564 let unanchored_prefix = if anchored {
565 self.c_empty()?
566 } else {
567 if self.config.get_utf8() {
568 self.c_unanchored_prefix_valid_utf8()?
569 } else {
570 self.c_unanchored_prefix_invalid_utf8()?
571 }
572 };
573
574 let compiled = self.c_alternation(
575 exprs.iter().with_pattern_ids().map(|(pid, e)| {
576 let group_kind = hir::GroupKind::CaptureIndex(0);
577 let one = self.c_group(&group_kind, e.borrow())?;
578 let match_state_id = self.add_match(pid, one.start)?;
579 self.patch(one.end, match_state_id)?;
580 Ok(ThompsonRef { start: one.start, end: match_state_id })
581 }),
582 )?;
583 self.patch(unanchored_prefix.end, compiled.start)?;
584 self.finish(compiled.start, unanchored_prefix.start)?;
585 Ok(self.nfa.replace(NFA::empty()))
586 }
587
588 /// Finishes the compilation process and populates the NFA attached to this
589 /// compiler with the final graph.
590 fn finish(
591 &self,
592 start_anchored: StateID,
593 start_unanchored: StateID,
594 ) -> Result<(), Error> {
595 trace!(
596 "intermediate NFA compilation complete, \
597 intermediate NFA size: {} states, {} bytes on heap",
598 self.states.borrow().len(),
599 self.nfa_memory_usage(),
600 );
601 let mut nfa = self.nfa.borrow_mut();
602 let mut bstates = self.states.borrow_mut();
603 let mut remap = self.remap.borrow_mut();
604 let mut empties = self.empties.borrow_mut();
605 remap.resize(bstates.len(), StateID::ZERO);
606 empties.clear();
607
608 // The idea here is to convert our intermediate states to their final
609 // form. The only real complexity here is the process of converting
610 // transitions, which are expressed in terms of state IDs. The new
611 // set of states will be smaller because of partial epsilon removal,
612 // so the state IDs will not be the same.
613 for (sid, bstate) in bstates.iter_mut().with_state_ids() {
614 match *bstate {
615 CState::Empty { next } => {
616 // Since we're removing empty states, we need to handle
617 // them later since we don't yet know which new state this
618 // empty state will be mapped to.
619 empties.push((sid, next));
620 }
621 CState::CaptureStart { next, capture_index, ref name } => {
622 // We can't remove this empty state because of the side
623 // effect of capturing an offset for this capture slot.
624 remap[sid] = nfa.add_capture_start(
625 next,
626 capture_index,
627 name.clone(),
628 )?;
629 }
630 CState::CaptureEnd { next, capture_index } => {
631 // We can't remove this empty state because of the side
632 // effect of capturing an offset for this capture slot.
633 remap[sid] = nfa.add_capture_end(next, capture_index)?;
634 }
635 CState::Range { range } => {
636 remap[sid] = nfa.add_range(range)?;
637 }
638 CState::Sparse { ref mut ranges } => {
639 let ranges =
640 mem::replace(ranges, vec![]).into_boxed_slice();
641 remap[sid] =
642 nfa.add_sparse(SparseTransitions { ranges })?;
643 }
644 CState::Look { look, next } => {
645 remap[sid] = nfa.add_look(next, look)?;
646 }
647 CState::Union { ref mut alternates } => {
648 let alternates =
649 mem::replace(alternates, vec![]).into_boxed_slice();
650 remap[sid] = nfa.add_union(alternates)?;
651 }
652 CState::UnionReverse { ref mut alternates } => {
653 let mut alternates =
654 mem::replace(alternates, vec![]).into_boxed_slice();
655 alternates.reverse();
656 remap[sid] = nfa.add_union(alternates)?;
657 }
658 CState::Match { start_id, .. } => {
659 remap[sid] = nfa.add_match()?;
660 nfa.finish_pattern(start_id)?;
661 }
662 }
663 }
664 for &(empty_id, mut empty_next) in empties.iter() {
665 // empty states can point to other empty states, forming a chain.
666 // So we must follow the chain until the end, which must end at
667 // a non-empty state, and therefore, a state that is correctly
668 // remapped. We are guaranteed to terminate because our compiler
669 // never builds a loop among only empty states.
670 while let CState::Empty { next } = bstates[empty_next] {
671 empty_next = next;
672 }
673 remap[empty_id] = remap[empty_next];
674 }
675 nfa.set_start_anchored(start_anchored);
676 nfa.set_start_unanchored(start_unanchored);
677 nfa.remap(&remap);
678 trace!(
679 "final NFA (reverse? {:?}) compilation complete, \
680 final NFA size: {} states, {} bytes on heap",
681 self.config.get_reverse(),
682 nfa.states().len(),
683 nfa.memory_usage(),
684 );
685 Ok(())
686 }
687
688 fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
689 match *expr.kind() {
690 HirKind::Empty => self.c_empty(),
691 HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
692 HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
693 HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
694 HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
695 HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
696 HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
697 HirKind::Repetition(ref rep) => self.c_repetition(rep),
698 HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
699 HirKind::Concat(ref es) => {
700 self.c_concat(es.iter().map(|e| self.c(e)))
701 }
702 HirKind::Alternation(ref es) => {
703 self.c_alternation(es.iter().map(|e| self.c(e)))
704 }
705 }
706 }
707
708 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
709 where
710 I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
711 {
712 let first = if self.is_reverse() { it.next_back() } else { it.next() };
713 let ThompsonRef { start, mut end } = match first {
714 Some(result) => result?,
715 None => return self.c_empty(),
716 };
717 loop {
718 let next =
719 if self.is_reverse() { it.next_back() } else { it.next() };
720 let compiled = match next {
721 Some(result) => result?,
722 None => break,
723 };
724 self.patch(end, compiled.start)?;
725 end = compiled.end;
726 }
727 Ok(ThompsonRef { start, end })
728 }
729
730 fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
731 where
732 I: Iterator<Item = Result<ThompsonRef, Error>>,
733 {
734 let first = it.next().expect("alternations must be non-empty")?;
735 let second = match it.next() {
736 None => return Ok(first),
737 Some(result) => result?,
738 };
739
740 let union = self.add_union()?;
741 let end = self.add_empty()?;
742 self.patch(union, first.start)?;
743 self.patch(first.end, end)?;
744 self.patch(union, second.start)?;
745 self.patch(second.end, end)?;
746 for result in it {
747 let compiled = result?;
748 self.patch(union, compiled.start)?;
749 self.patch(compiled.end, end)?;
750 }
751 Ok(ThompsonRef { start: union, end })
752 }
753
754 fn c_group(
755 &self,
756 kind: &hir::GroupKind,
757 expr: &Hir,
758 ) -> Result<ThompsonRef, Error> {
759 if !self.config.get_captures() {
760 return self.c(expr);
761 }
762 let (capi, name) = match *kind {
763 hir::GroupKind::NonCapturing => return self.c(expr),
764 hir::GroupKind::CaptureIndex(index) => (index, None),
765 hir::GroupKind::CaptureName { ref name, index } => {
766 (index, Some(Arc::from(&**name)))
767 }
768 };
769
770 let start = self.add_capture_start(capi, name)?;
771 let inner = self.c(expr)?;
772 let end = self.add_capture_end(capi)?;
773
774 self.patch(start, inner.start)?;
775 self.patch(inner.end, end)?;
776 Ok(ThompsonRef { start, end })
777 }
778
779 fn c_repetition(
780 &self,
781 rep: &hir::Repetition,
782 ) -> Result<ThompsonRef, Error> {
783 match rep.kind {
784 hir::RepetitionKind::ZeroOrOne => {
785 self.c_zero_or_one(&rep.hir, rep.greedy)
786 }
787 hir::RepetitionKind::ZeroOrMore => {
788 self.c_at_least(&rep.hir, rep.greedy, 0)
789 }
790 hir::RepetitionKind::OneOrMore => {
791 self.c_at_least(&rep.hir, rep.greedy, 1)
792 }
793 hir::RepetitionKind::Range(ref rng) => match *rng {
794 hir::RepetitionRange::Exactly(count) => {
795 self.c_exactly(&rep.hir, count)
796 }
797 hir::RepetitionRange::AtLeast(m) => {
798 self.c_at_least(&rep.hir, rep.greedy, m)
799 }
800 hir::RepetitionRange::Bounded(min, max) => {
801 self.c_bounded(&rep.hir, rep.greedy, min, max)
802 }
803 },
804 }
805 }
806
807 fn c_bounded(
808 &self,
809 expr: &Hir,
810 greedy: bool,
811 min: u32,
812 max: u32,
813 ) -> Result<ThompsonRef, Error> {
814 let prefix = self.c_exactly(expr, min)?;
815 if min == max {
816 return Ok(prefix);
817 }
818
819 // It is tempting to compile the rest here as a concatenation
820 // of zero-or-one matches, i.e., for `a{2,5}`, compile it as if it
821 // were `aaa?a?a?`. The problem here is that it leads to this program:
822 //
823 // >000000: 61 => 01
824 // 000001: 61 => 02
825 // 000002: union(03, 04)
826 // 000003: 61 => 04
827 // 000004: union(05, 06)
828 // 000005: 61 => 06
829 // 000006: union(07, 08)
830 // 000007: 61 => 08
831 // 000008: MATCH
832 //
833 // And effectively, once you hit state 2, the epsilon closure will
834 // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
835 // to instead compile it like so:
836 //
837 // >000000: 61 => 01
838 // 000001: 61 => 02
839 // 000002: union(03, 08)
840 // 000003: 61 => 04
841 // 000004: union(05, 08)
842 // 000005: 61 => 06
843 // 000006: union(07, 08)
844 // 000007: 61 => 08
845 // 000008: MATCH
846 //
847 // So that the epsilon closure of state 2 is now just 3 and 8.
848 let empty = self.add_empty()?;
849 let mut prev_end = prefix.end;
850 for _ in min..max {
851 let union = if greedy {
852 self.add_union()
853 } else {
854 self.add_reverse_union()
855 }?;
856 let compiled = self.c(expr)?;
857 self.patch(prev_end, union)?;
858 self.patch(union, compiled.start)?;
859 self.patch(union, empty)?;
860 prev_end = compiled.end;
861 }
862 self.patch(prev_end, empty)?;
863 Ok(ThompsonRef { start: prefix.start, end: empty })
864 }
865
866 fn c_at_least(
867 &self,
868 expr: &Hir,
869 greedy: bool,
870 n: u32,
871 ) -> Result<ThompsonRef, Error> {
872 if n == 0 {
873 // When the expression cannot match the empty string, then we
874 // can get away with something much simpler: just one 'alt'
875 // instruction that optionally repeats itself. But if the expr
876 // can match the empty string... see below.
877 if !expr.is_match_empty() {
878 let union = if greedy {
879 self.add_union()
880 } else {
881 self.add_reverse_union()
882 }?;
883 let compiled = self.c(expr)?;
884 self.patch(union, compiled.start)?;
885 self.patch(compiled.end, union)?;
886 return Ok(ThompsonRef { start: union, end: union });
887 }
888
889 // What's going on here? Shouldn't x* be simpler than this? It
890 // turns out that when implementing leftmost-first (Perl-like)
891 // match semantics, x* results in an incorrect preference order
892 // when computing the transitive closure of states if and only if
893 // 'x' can match the empty string. So instead, we compile x* as
894 // (x+)?, which preserves the correct preference order.
895 //
896 // See: https://github.com/rust-lang/regex/issues/779
897 let compiled = self.c(expr)?;
898 let plus = if greedy {
899 self.add_union()
900 } else {
901 self.add_reverse_union()
902 }?;
903 self.patch(compiled.end, plus)?;
904 self.patch(plus, compiled.start)?;
905
906 let question = if greedy {
907 self.add_union()
908 } else {
909 self.add_reverse_union()
910 }?;
911 let empty = self.add_empty()?;
912 self.patch(question, compiled.start)?;
913 self.patch(question, empty)?;
914 self.patch(plus, empty)?;
915 Ok(ThompsonRef { start: question, end: empty })
916 } else if n == 1 {
917 let compiled = self.c(expr)?;
918 let union = if greedy {
919 self.add_union()
920 } else {
921 self.add_reverse_union()
922 }?;
923 self.patch(compiled.end, union)?;
924 self.patch(union, compiled.start)?;
925 Ok(ThompsonRef { start: compiled.start, end: union })
926 } else {
927 let prefix = self.c_exactly(expr, n - 1)?;
928 let last = self.c(expr)?;
929 let union = if greedy {
930 self.add_union()
931 } else {
932 self.add_reverse_union()
933 }?;
934 self.patch(prefix.end, last.start)?;
935 self.patch(last.end, union)?;
936 self.patch(union, last.start)?;
937 Ok(ThompsonRef { start: prefix.start, end: union })
938 }
939 }
940
941 fn c_zero_or_one(
942 &self,
943 expr: &Hir,
944 greedy: bool,
945 ) -> Result<ThompsonRef, Error> {
946 let union =
947 if greedy { self.add_union() } else { self.add_reverse_union() }?;
948 let compiled = self.c(expr)?;
949 let empty = self.add_empty()?;
950 self.patch(union, compiled.start)?;
951 self.patch(union, empty)?;
952 self.patch(compiled.end, empty)?;
953 Ok(ThompsonRef { start: union, end: empty })
954 }
955
956 fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
957 let it = (0..n).map(|_| self.c(expr));
958 self.c_concat(it)
959 }
960
961 fn c_byte_class(
962 &self,
963 cls: &hir::ClassBytes,
964 ) -> Result<ThompsonRef, Error> {
965 let end = self.add_empty()?;
966 let mut trans = Vec::with_capacity(cls.ranges().len());
967 for r in cls.iter() {
968 trans.push(Transition {
969 start: r.start(),
970 end: r.end(),
971 next: end,
972 });
973 }
974 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
975 }
976
977 fn c_unicode_class(
978 &self,
979 cls: &hir::ClassUnicode,
980 ) -> Result<ThompsonRef, Error> {
981 // If all we have are ASCII ranges wrapped in a Unicode package, then
982 // there is zero reason to bring out the big guns. We can fit all ASCII
983 // ranges within a single sparse state.
984 if cls.is_all_ascii() {
985 let end = self.add_empty()?;
986 let mut trans = Vec::with_capacity(cls.ranges().len());
987 for r in cls.iter() {
988 assert!(r.start() <= '\x7F');
989 assert!(r.end() <= '\x7F');
990 trans.push(Transition {
991 start: r.start() as u8,
992 end: r.end() as u8,
993 next: end,
994 });
995 }
996 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
997 } else if self.is_reverse() {
998 if !self.config.get_shrink() {
999 // When we don't want to spend the extra time shrinking, we
1000 // compile the UTF-8 automaton in reverse using something like
1001 // the "naive" approach, but will attempt to re-use common
1002 // suffixes.
1003 self.c_unicode_class_reverse_with_suffix(cls)
1004 } else {
1005 // When we want to shrink our NFA for reverse UTF-8 automata,
1006 // we cannot feed UTF-8 sequences directly to the UTF-8
1007 // compiler, since the UTF-8 compiler requires all sequences
1008 // to be lexicographically sorted. Instead, we organize our
1009 // sequences into a range trie, which can then output our
1010 // sequences in the correct order. Unfortunately, building the
1011 // range trie is fairly expensive (but not nearly as expensive
1012 // as building a DFA). Hence the reason why the 'shrink' option
1013 // exists, so that this path can be toggled off. For example,
1014 // we might want to turn this off if we know we won't be
1015 // compiling a DFA.
1016 let mut trie = self.trie_state.borrow_mut();
1017 trie.clear();
1018
1019 for rng in cls.iter() {
1020 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1021 seq.reverse();
1022 trie.insert(seq.as_slice());
1023 }
1024 }
1025 let mut utf8_state = self.utf8_state.borrow_mut();
1026 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
1027 trie.iter(|seq| {
1028 utf8c.add(&seq)?;
1029 Ok(())
1030 })?;
1031 utf8c.finish()
1032 }
1033 } else {
1034 // In the forward direction, we always shrink our UTF-8 automata
1035 // because we can stream it right into the UTF-8 compiler. There
1036 // is almost no downside (in either memory or time) to using this
1037 // approach.
1038 let mut utf8_state = self.utf8_state.borrow_mut();
1039 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
1040 for rng in cls.iter() {
1041 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1042 utf8c.add(seq.as_slice())?;
1043 }
1044 }
1045 utf8c.finish()
1046 }
1047
1048 // For reference, the code below is the "naive" version of compiling a
1049 // UTF-8 automaton. It is deliciously simple (and works for both the
1050 // forward and reverse cases), but will unfortunately produce very
1051 // large NFAs. When compiling a forward automaton, the size difference
1052 // can sometimes be an order of magnitude. For example, the '\w' regex
1053 // will generate about ~3000 NFA states using the naive approach below,
1054 // but only 283 states when using the approach above. This is because
1055 // the approach above actually compiles a *minimal* (or near minimal,
1056 // because of the bounded hashmap for reusing equivalent states) UTF-8
1057 // automaton.
1058 //
1059 // The code below is kept as a reference point in order to make it
1060 // easier to understand the higher level goal here. Although, it will
1061 // almost certainly bit-rot, so keep that in mind.
1062 /*
1063 let it = cls
1064 .iter()
1065 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1066 .map(|seq| {
1067 let it = seq
1068 .as_slice()
1069 .iter()
1070 .map(|rng| self.c_range(rng.start, rng.end));
1071 self.c_concat(it)
1072 });
1073 self.c_alternation(it)
1074 */
1075 }
1076
1077 fn c_unicode_class_reverse_with_suffix(
1078 &self,
1079 cls: &hir::ClassUnicode,
1080 ) -> Result<ThompsonRef, Error> {
1081 // N.B. It would likely be better to cache common *prefixes* in the
1082 // reverse direction, but it's not quite clear how to do that. The
1083 // advantage of caching suffixes is that it does give us a win, and
1084 // has a very small additional overhead.
1085 let mut cache = self.utf8_suffix.borrow_mut();
1086 cache.clear();
1087
1088 let union = self.add_union()?;
1089 let alt_end = self.add_empty()?;
1090 for urng in cls.iter() {
1091 for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1092 let mut end = alt_end;
1093 for brng in seq.as_slice() {
1094 let key = Utf8SuffixKey {
1095 from: end,
1096 start: brng.start,
1097 end: brng.end,
1098 };
1099 let hash = cache.hash(&key);
1100 if let Some(id) = cache.get(&key, hash) {
1101 end = id;
1102 continue;
1103 }
1104
1105 let compiled = self.c_range(brng.start, brng.end)?;
1106 self.patch(compiled.end, end)?;
1107 end = compiled.start;
1108 cache.set(key, hash, end);
1109 }
1110 self.patch(union, end)?;
1111 }
1112 }
1113 Ok(ThompsonRef { start: union, end: alt_end })
1114 }
1115
1116 fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
1117 let look = match *anchor {
1118 Anchor::StartLine => Look::StartLine,
1119 Anchor::EndLine => Look::EndLine,
1120 Anchor::StartText => Look::StartText,
1121 Anchor::EndText => Look::EndText,
1122 };
1123 let id = self.add_look(look)?;
1124 Ok(ThompsonRef { start: id, end: id })
1125 }
1126
1127 fn c_word_boundary(
1128 &self,
1129 wb: &WordBoundary,
1130 ) -> Result<ThompsonRef, Error> {
1131 let look = match *wb {
1132 WordBoundary::Unicode => Look::WordBoundaryUnicode,
1133 WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
1134 WordBoundary::Ascii => Look::WordBoundaryAscii,
1135 WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
1136 };
1137 let id = self.add_look(look)?;
1138 Ok(ThompsonRef { start: id, end: id })
1139 }
1140
1141 fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
1142 let mut buf = [0; 4];
1143 let it = ch
1144 .encode_utf8(&mut buf)
1145 .as_bytes()
1146 .iter()
1147 .map(|&b| self.c_range(b, b));
1148 self.c_concat(it)
1149 }
1150
1151 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
1152 let id = self.add_range(start, end)?;
1153 Ok(ThompsonRef { start: id, end: id })
1154 }
1155
1156 fn c_empty(&self) -> Result<ThompsonRef, Error> {
1157 let id = self.add_empty()?;
1158 Ok(ThompsonRef { start: id, end: id })
1159 }
1160
1161 fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
1162 self.c_at_least(&Hir::any(false), false, 0)
1163 }
1164
1165 fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
1166 self.c_at_least(&Hir::any(true), false, 0)
1167 }
1168
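/// Add a transition from the state `from` to the state `to`.
///
/// For `Union` and `UnionReverse` states, this pushes `to` as a new
/// alternate (and accounts for the additional heap usage). For `Match`
/// states this is a no-op, and for `Sparse` states it panics, since sparse
/// states cannot be patched. For every other state, the `next` transition
/// is overwritten with `to`.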
1169 fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
1170 let old_memory_cstates = self.memory_cstates.get();
1171 match self.states.borrow_mut()[from] {
1172 CState::Empty { ref mut next } => {
1173 *next = to;
1174 }
1175 CState::Range { ref mut range } => {
1176 range.next = to;
1177 }
1178 CState::Sparse { .. } => {
1179 panic!("cannot patch from a sparse NFA state")
1180 }
1181 CState::Look { ref mut next, .. } => {
1182 *next = to;
1183 }
1184 CState::Union { ref mut alternates } => {
1185 alternates.push(to);
1186 self.memory_cstates
1187 .set(old_memory_cstates + mem::size_of::<StateID>());
1188 }
1189 CState::UnionReverse { ref mut alternates } => {
1190 alternates.push(to);
1191 self.memory_cstates
1192 .set(old_memory_cstates + mem::size_of::<StateID>());
1193 }
1194 CState::CaptureStart { ref mut next, .. } => {
1195 *next = to;
1196 }
1197 CState::CaptureEnd { ref mut next, .. } => {
1198 *next = to;
1199 }
1200 CState::Match { .. } => {}
1201 }
1202 if old_memory_cstates != self.memory_cstates.get() {
1203 self.check_nfa_size_limit()?;
1204 }
1205 Ok(())
1206 }
1207
1208 fn add_empty(&self) -> Result<StateID, Error> {
1209 self.add_state(CState::Empty { next: StateID::ZERO })
1210 }
1211
1212 fn add_capture_start(
1213 &self,
1214 capture_index: u32,
1215 name: Option<Arc<str>>,
1216 ) -> Result<StateID, Error> {
1217 self.add_state(CState::CaptureStart {
1218 next: StateID::ZERO,
1219 capture_index,
1220 name,
1221 })
1222 }
1223
1224 fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
1225 self.add_state(CState::CaptureEnd {
1226 next: StateID::ZERO,
1227 capture_index,
1228 })
1229 }
1230
1231 fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
1232 let trans = Transition { start, end, next: StateID::ZERO };
1233 self.add_state(CState::Range { range: trans })
1234 }
1235
1236 fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
1237 if ranges.len() == 1 {
1238 self.add_state(CState::Range { range: ranges[0] })
1239 } else {
1240 self.add_state(CState::Sparse { ranges })
1241 }
1242 }
1243
1244 fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
1245 if self.is_reverse() {
1246 look = look.reversed();
1247 }
1248 self.add_state(CState::Look { look, next: StateID::ZERO })
1249 }
1250
1251 fn add_union(&self) -> Result<StateID, Error> {
1252 self.add_state(CState::Union { alternates: vec![] })
1253 }
1254
1255 fn add_reverse_union(&self) -> Result<StateID, Error> {
1256 self.add_state(CState::UnionReverse { alternates: vec![] })
1257 }
1258
1259 fn add_match(
1260 &self,
1261 pattern_id: PatternID,
1262 start_id: StateID,
1263 ) -> Result<StateID, Error> {
1264 self.add_state(CState::Match { pattern_id, start_id })
1265 }
1266
1267 fn add_state(&self, state: CState) -> Result<StateID, Error> {
1268 let mut states = self.states.borrow_mut();
1269 let id = StateID::new(states.len())
1270 .map_err(|_| Error::too_many_states(states.len()))?;
1271 self.memory_cstates
1272 .set(self.memory_cstates.get() + state.memory_usage());
1273 states.push(state);
1274 // If we don't explicitly drop this, then 'nfa_memory_usage' will also
1275 // try to borrow it when we check the size limit and hit an error.
1276 drop(states);
1277 self.check_nfa_size_limit()?;
1278 Ok(id)
1279 }
1280
1281 fn is_reverse(&self) -> bool {
1282 self.config.get_reverse()
1283 }
1284
1285 /// If an NFA size limit was set, this checks that the NFA compiled so far
1286 /// fits within that limit. If so, then nothing is returned. Otherwise, an
1287 /// error is returned.
1288 ///
1289 /// This should be called after increasing the heap usage of the
1290 /// intermediate NFA.
1291 ///
1292 /// Note that this borrows 'self.states', so callers should ensure there is
1293 /// no mutable borrow of it outstanding.
1294 fn check_nfa_size_limit(&self) -> Result<(), Error> {
1295 if let Some(limit) = self.config.get_nfa_size_limit() {
1296 if self.nfa_memory_usage() > limit {
1297 return Err(Error::exceeded_size_limit(limit));
1298 }
1299 }
1300 Ok(())
1301 }
1302
1303 /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
1304 ///
1305 /// Note that this is an approximation of how big the final NFA will be.
1306 /// In practice, the final NFA will likely be a bit smaller since it uses
1307 /// things like `Box<[T]>` instead of `Vec<T>`.
1308 fn nfa_memory_usage(&self) -> usize {
1309 self.states.borrow().len() * mem::size_of::<CState>()
1310 + self.memory_cstates.get()
1311 }
1312 }
1313
1314 impl CState {
1315 fn memory_usage(&self) -> usize {
1316 match *self {
1317 CState::Empty { .. }
1318 | CState::Range { .. }
1319 | CState::Look { .. }
1320 | CState::CaptureStart { .. }
1321 | CState::CaptureEnd { .. }
1322 | CState::Match { .. } => 0,
1323 CState::Sparse { ref ranges } => {
1324 ranges.len() * mem::size_of::<Transition>()
1325 }
1326 CState::Union { ref alternates } => {
1327 alternates.len() * mem::size_of::<StateID>()
1328 }
1329 CState::UnionReverse { ref alternates } => {
1330 alternates.len() * mem::size_of::<StateID>()
1331 }
1332 }
1333 }
1334 }
1335
1336 #[derive(Debug)]
1337 struct Utf8Compiler<'a> {
1338 nfac: &'a Compiler,
1339 state: &'a mut Utf8State,
1340 target: StateID,
1341 }
1342
1343 #[derive(Clone, Debug)]
1344 struct Utf8State {
1345 compiled: Utf8BoundedMap,
1346 uncompiled: Vec<Utf8Node>,
1347 }
1348
1349 #[derive(Clone, Debug)]
1350 struct Utf8Node {
1351 trans: Vec<Transition>,
1352 last: Option<Utf8LastTransition>,
1353 }
1354
1355 #[derive(Clone, Debug)]
1356 struct Utf8LastTransition {
1357 start: u8,
1358 end: u8,
1359 }
1360
1361 impl Utf8State {
1362 fn new() -> Utf8State {
1363 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1364 }
1365
1366 fn clear(&mut self) {
1367 self.compiled.clear();
1368 self.uncompiled.clear();
1369 }
1370 }
1371
1372 impl<'a> Utf8Compiler<'a> {
1373 fn new(
1374 nfac: &'a Compiler,
1375 state: &'a mut Utf8State,
1376 ) -> Result<Utf8Compiler<'a>, Error> {
1377 let target = nfac.add_empty()?;
1378 state.clear();
1379 let mut utf8c = Utf8Compiler { nfac, state, target };
1380 utf8c.add_empty();
1381 Ok(utf8c)
1382 }
1383
1384 fn finish(&mut self) -> Result<ThompsonRef, Error> {
1385 self.compile_from(0)?;
1386 let node = self.pop_root();
1387 let start = self.compile(node)?;
1388 Ok(ThompsonRef { start, end: self.target })
1389 }
1390
1391 fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
1392 let prefix_len = ranges
1393 .iter()
1394 .zip(&self.state.uncompiled)
1395 .take_while(|&(range, node)| {
1396 node.last.as_ref().map_or(false, |t| {
1397 (t.start, t.end) == (range.start, range.end)
1398 })
1399 })
1400 .count();
1401 assert!(prefix_len < ranges.len());
1402 self.compile_from(prefix_len)?;
1403 self.add_suffix(&ranges[prefix_len..]);
1404 Ok(())
1405 }
1406
1407 fn compile_from(&mut self, from: usize) -> Result<(), Error> {
1408 let mut next = self.target;
1409 while from + 1 < self.state.uncompiled.len() {
1410 let node = self.pop_freeze(next);
1411 next = self.compile(node)?;
1412 }
1413 self.top_last_freeze(next);
1414 Ok(())
1415 }
1416
1417 fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
1418 let hash = self.state.compiled.hash(&node);
1419 if let Some(id) = self.state.compiled.get(&node, hash) {
1420 return Ok(id);
1421 }
1422 let id = self.nfac.add_sparse(node.clone())?;
1423 self.state.compiled.set(node, hash, id);
1424 Ok(id)
1425 }
1426
1427 fn add_suffix(&mut self, ranges: &[Utf8Range]) {
1428 assert!(!ranges.is_empty());
1429 let last = self
1430 .state
1431 .uncompiled
1432 .len()
1433 .checked_sub(1)
1434 .expect("non-empty nodes");
1435 assert!(self.state.uncompiled[last].last.is_none());
1436 self.state.uncompiled[last].last = Some(Utf8LastTransition {
1437 start: ranges[0].start,
1438 end: ranges[0].end,
1439 });
1440 for r in &ranges[1..] {
1441 self.state.uncompiled.push(Utf8Node {
1442 trans: vec![],
1443 last: Some(Utf8LastTransition { start: r.start, end: r.end }),
1444 });
1445 }
1446 }
1447
1448 fn add_empty(&mut self) {
1449 self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
1450 }
1451
1452 fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
1453 let mut uncompiled = self.state.uncompiled.pop().unwrap();
1454 uncompiled.set_last_transition(next);
1455 uncompiled.trans
1456 }
1457
1458 fn pop_root(&mut self) -> Vec<Transition> {
1459 assert_eq!(self.state.uncompiled.len(), 1);
1460 assert!(self.state.uncompiled[0].last.is_none());
1461 self.state.uncompiled.pop().expect("non-empty nodes").trans
1462 }
1463
1464 fn top_last_freeze(&mut self, next: StateID) {
1465 let last = self
1466 .state
1467 .uncompiled
1468 .len()
1469 .checked_sub(1)
1470 .expect("non-empty nodes");
1471 self.state.uncompiled[last].set_last_transition(next);
1472 }
1473 }
1474
1475 impl Utf8Node {
1476 fn set_last_transition(&mut self, next: StateID) {
1477 if let Some(last) = self.last.take() {
1478 self.trans.push(Transition {
1479 start: last.start,
1480 end: last.end,
1481 next,
1482 });
1483 }
1484 }
1485 }
1486
1487 #[cfg(test)]
1488 mod tests {
1489 use alloc::vec::Vec;
1490
1491 use super::{
1492 Builder, Config, PatternID, SparseTransitions, State, StateID,
1493 Transition, NFA,
1494 };
1495
1496 fn build(pattern: &str) -> NFA {
1497 Builder::new()
1498 .configure(Config::new().captures(false).unanchored_prefix(false))
1499 .build(pattern)
1500 .unwrap()
1501 }
1502
1503 fn pid(id: usize) -> PatternID {
1504 PatternID::new(id).unwrap()
1505 }
1506
1507 fn sid(id: usize) -> StateID {
1508 StateID::new(id).unwrap()
1509 }
1510
1511 fn s_byte(byte: u8, next: usize) -> State {
1512 let next = sid(next);
1513 let trans = Transition { start: byte, end: byte, next };
1514 State::Range { range: trans }
1515 }
1516
1517 fn s_range(start: u8, end: u8, next: usize) -> State {
1518 let next = sid(next);
1519 let trans = Transition { start, end, next };
1520 State::Range { range: trans }
1521 }
1522
1523 fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
1524 let ranges = ranges
1525 .iter()
1526 .map(|&(start, end, next)| Transition {
1527 start,
1528 end,
1529 next: sid(next),
1530 })
1531 .collect();
1532 State::Sparse(SparseTransitions { ranges })
1533 }
1534
1535 fn s_union(alts: &[usize]) -> State {
1536 State::Union {
1537 alternates: alts
1538 .iter()
1539 .map(|&id| sid(id))
1540 .collect::<Vec<StateID>>()
1541 .into_boxed_slice(),
1542 }
1543 }
1544
1545 fn s_match(id: usize) -> State {
1546 State::Match { id: pid(id) }
1547 }
1548
1549 // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
1550 // prefix.
1551 #[test]
1552 fn compile_unanchored_prefix() {
1553 // When the machine can only match valid UTF-8.
1554 let nfa = Builder::new()
1555 .configure(Config::new().captures(false))
1556 .build(r"a")
1557 .unwrap();
1558 // There should be many states since the `.` in `(?s:.)*?` matches any
1559 // Unicode scalar value.
1560 assert_eq!(11, nfa.len());
1561 assert_eq!(nfa.states[10], s_match(0));
1562 assert_eq!(nfa.states[9], s_byte(b'a', 10));
1563
1564 // When the machine can match through invalid UTF-8.
1565 let nfa = Builder::new()
1566 .configure(Config::new().captures(false).utf8(false))
1567 .build(r"a")
1568 .unwrap();
1569 assert_eq!(
1570 nfa.states,
1571 &[
1572 s_union(&[2, 1]),
1573 s_range(0, 255, 0),
1574 s_byte(b'a', 3),
1575 s_match(0),
1576 ]
1577 );
1578 }
1579
1580 #[test]
1581 fn compile_empty() {
1582 assert_eq!(build("").states, &[s_match(0),]);
1583 }
1584
1585 #[test]
1586 fn compile_literal() {
1587 assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
1588 assert_eq!(
1589 build("ab").states,
1590 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
1591 );
1592 assert_eq!(
1593 build("ā˜ƒ").states,
1594 &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
1595 );
1596
1597 // Check that non-UTF-8 literals work.
1598 let nfa = Builder::new()
1599 .configure(
1600 Config::new()
1601 .captures(false)
1602 .utf8(false)
1603 .unanchored_prefix(false),
1604 )
1605 .syntax(crate::SyntaxConfig::new().utf8(false))
1606 .build(r"(?-u)\xFF")
1607 .unwrap();
1608 assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
1609 }
1610
1611 #[test]
1612 fn compile_class() {
1613 assert_eq!(
1614 build(r"[a-z]").states,
1615 &[s_range(b'a', b'z', 1), s_match(0),]
1616 );
1617 assert_eq!(
1618 build(r"[x-za-c]").states,
1619 &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
1620 );
1621 assert_eq!(
1622 build(r"[\u03B1-\u03B4]").states,
1623 &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
1624 );
1625 assert_eq!(
1626 build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1627 &[
1628 s_range(0xB1, 0xB4, 5),
1629 s_range(0x99, 0x9E, 5),
1630 s_byte(0xA4, 1),
1631 s_byte(0x9F, 2),
1632 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1633 s_match(0),
1634 ]
1635 );
1636 assert_eq!(
1637 build(r"[a-zā˜ƒ]").states,
1638 &[
1639 s_byte(0x83, 3),
1640 s_byte(0x98, 0),
1641 s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1642 s_match(0),
1643 ]
1644 );
1645 }
1646
1647 #[test]
1648 fn compile_repetition() {
1649 assert_eq!(
1650 build(r"a?").states,
1651 &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
1652 );
1653 assert_eq!(
1654 build(r"a??").states,
1655 &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
1656 );
1657 }
1658
1659 #[test]
1660 fn compile_group() {
1661 assert_eq!(
1662 build(r"ab+").states,
1663 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
1664 );
1665 assert_eq!(
1666 build(r"(ab)").states,
1667 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
1668 );
1669 assert_eq!(
1670 build(r"(ab)+").states,
1671 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
1672 );
1673 }
1674
1675 #[test]
1676 fn compile_alternation() {
1677 assert_eq!(
1678 build(r"a|b").states,
1679 &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
1680 );
1681 assert_eq!(
1682 build(r"|b").states,
1683 &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
1684 );
1685 assert_eq!(
1686 build(r"a|").states,
1687 &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
1688 );
1689 }
1690
1691 #[test]
1692 fn many_start_pattern() {
1693 let nfa = Builder::new()
1694 .configure(Config::new().captures(false).unanchored_prefix(false))
1695 .build_many(&["a", "b"])
1696 .unwrap();
1697 assert_eq!(
1698 nfa.states,
1699 &[
1700 s_byte(b'a', 1),
1701 s_match(0),
1702 s_byte(b'b', 3),
1703 s_match(1),
1704 s_union(&[0, 2]),
1705 ]
1706 );
1707 assert_eq!(nfa.start_anchored().as_usize(), 4);
1708 assert_eq!(nfa.start_unanchored().as_usize(), 4);
1709 // Test that the start states for each individual pattern are correct.
1710 assert_eq!(nfa.start_pattern(pid(0)), sid(0));
1711 assert_eq!(nfa.start_pattern(pid(1)), sid(2));
1712 }
1713 }