src/vendor/regex/src/exec.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::cell::RefCell;
  12 use std::collections::HashMap;
  13 use std::cmp;
  14 use std::sync::Arc;
  15
  16 use thread_local::CachedThreadLocal;
  17 use syntax::{Expr, ExprBuilder, Literals};
  18
  19 use backtrack;
  20 use compile::Compiler;
  21 use dfa;
  22 use error::Error;
  23 use input::{ByteInput, CharInput};
  24 use literals::LiteralSearcher;
  25 use pikevm;
  26 use prog::Program;
  27 use re_builder::RegexOptions;
  28 use re_bytes;
  29 use re_set;
  30 use re_trait::{RegularExpression, Slot, Locations, as_slots};
  31 use re_unicode;
  32 use utf8::next_utf8;
  33
  34 /// `Exec` manages the execution of a regular expression.
  35 ///
  36 /// In particular, this manages the various compiled forms of a single regular
  37 /// expression and the choice of which matching engine to use to execute a
  38 /// regular expression.
  39 pub struct Exec {
  40     /// All read only state.
  41     ro: Arc<ExecReadOnly>,
  42     /// Caches for the various matching engines.
  43     cache: CachedThreadLocal<ProgramCache>,
  44 }
  45
  46 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
  47 /// means it is no longer Sync, but we can now avoid the overhead of
  48 /// synchronization to fetch the cache.
  49 #[derive(Debug)]
  50 pub struct ExecNoSync<'c> {
  51     /// All read only state.
  52     ro: &'c Arc<ExecReadOnly>,
  53     /// Caches for the various matching engines.
  54     cache: &'c ProgramCache,
  55 }
  56
  57 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
  58 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
  59
  60 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
  61 /// state is determined at compile time and never changes during search.
  62 #[derive(Debug)]
  63 struct ExecReadOnly {
  64     /// The original regular expressions given by the caller to compile.
  65     res: Vec<String>,
  66     /// A compiled program that is used in the NFA simulation and backtracking.
  67     /// It can be byte-based or Unicode codepoint based.
  68     ///
  69     /// N.B. It is not possibly to make this byte-based from the public API.
  70     /// It is only used for testing byte based programs in the NFA simulations.
  71     nfa: Program,
  72     /// A compiled byte based program for DFA execution. This is only used
  73     /// if a DFA can be executed. (Currently, only word boundary assertions are
  74     /// not supported.) Note that this program contains an embedded `.*?`
  75     /// preceding the first capture group, unless the regex is anchored at the
  76     /// beginning.
  77     dfa: Program,
  78     /// The same as above, except the program is reversed (and there is no
  79     /// preceding `.*?`). This is used by the DFA to find the starting location
  80     /// of matches.
  81     dfa_reverse: Program,
  82     /// A set of suffix literals extracted from the regex.
  83     ///
  84     /// Prefix literals are stored on the `Program`, since they are used inside
  85     /// the matching engines.
  86     suffixes: LiteralSearcher,
  87     /// match_type encodes as much upfront knowledge about how we're going to
  88     /// execute a search as possible.
  89     match_type: MatchType,
  90 }
  91
  92 /// Facilitates the construction of an executor by exposing various knobs
  93 /// to control how a regex is executed and what kinds of resources it's
  94 /// permitted to use.
  95 pub struct ExecBuilder {
  96     options: RegexOptions,
  97     match_type: Option<MatchType>,
  98     bytes: bool,
  99     only_utf8: bool,
 100 }
 101
 102 /// Parsed represents a set of parsed regular expressions and their detected
 103 /// literals.
 104 struct Parsed {
 105     exprs: Vec<Expr>,
 106     prefixes: Literals,
 107     suffixes: Literals,
 108     bytes: bool,
 109 }
 110
 111 impl ExecBuilder {
 112     /// Create a regex execution builder.
 113     ///
 114     /// This uses default settings for everything except the regex itself,
 115     /// which must be provided. Further knobs can be set by calling methods,
 116     /// and then finally, `build` to actually create the executor.
 117     pub fn new(re: &str) -> Self {
 118         Self::new_many(&[re])
 119     }
 120
 121     /// Like new, but compiles the union of the given regular expressions.
 122     ///
 123     /// Note that when compiling 2 or more regular expressions, capture groups
 124     /// are completely unsupported. (This means both `find` and `captures`
 125     /// wont work.)
 126     pub fn new_many<I, S>(res: I) -> Self
 127             where S: AsRef<str>, I: IntoIterator<Item=S> {
 128         let mut opts = RegexOptions::default();
 129         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
 130         Self::new_options(opts)
 131     }
 132
 133     /// Create a regex execution builder.
 134     pub fn new_options(opts: RegexOptions) -> Self {
 135         ExecBuilder {
 136             options: opts,
 137             match_type: None,
 138             bytes: false,
 139             only_utf8: true,
 140         }
 141     }
 142
 143     /// Set the matching engine to be automatically determined.
 144     ///
 145     /// This is the default state and will apply whatever optimizations are
 146     /// possible, such as running a DFA.
 147     ///
 148     /// This overrides whatever was previously set via the `nfa` or
 149     /// `bounded_backtracking` methods.
 150     pub fn automatic(mut self) -> Self {
 151         self.match_type = None;
 152         self
 153     }
 154
 155     /// Sets the matching engine to use the NFA algorithm no matter what
 156     /// optimizations are possible.
 157     ///
 158     /// This overrides whatever was previously set via the `automatic` or
 159     /// `bounded_backtracking` methods.
 160     pub fn nfa(mut self) -> Self {
 161         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
 162         self
 163     }
 164
 165     /// Sets the matching engine to use a bounded backtracking engine no
 166     /// matter what optimizations are possible.
 167     ///
 168     /// One must use this with care, since the bounded backtracking engine
 169     /// uses memory proportion to `len(regex) * len(text)`.
 170     ///
 171     /// This overrides whatever was previously set via the `automatic` or
 172     /// `nfa` methods.
 173     pub fn bounded_backtracking(mut self) -> Self {
 174         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
 175         self
 176     }
 177
 178     /// Compiles byte based programs for use with the NFA matching engines.
 179     ///
 180     /// By default, the NFA engines match on Unicode scalar values. They can
 181     /// be made to use byte based programs instead. In general, the byte based
 182     /// programs are slower because of a less efficient encoding of character
 183     /// classes.
 184     ///
 185     /// Note that this does not impact DFA matching engines, which always
 186     /// execute on bytes.
 187     pub fn bytes(mut self, yes: bool) -> Self {
 188         self.bytes = yes;
 189         self
 190     }
 191
 192     /// When disabled, the program compiled may match arbitrary bytes.
 193     ///
 194     /// When enabled (the default), all compiled programs exclusively match
 195     /// valid UTF-8 bytes.
 196     pub fn only_utf8(mut self, yes: bool) -> Self {
 197         self.only_utf8 = yes;
 198         self
 199     }
 200
 201     /// Set the Unicode flag.
 202     pub fn unicode(mut self, yes: bool) -> Self {
 203         self.options.unicode = yes;
 204         self
 205     }
 206
 207     /// Parse the current set of patterns into their AST and extract literals.
 208     fn parse(&self) -> Result<Parsed, Error> {
 209         let mut exprs = Vec::with_capacity(self.options.pats.len());
 210         let mut prefixes = Some(Literals::empty());
 211         let mut suffixes = Some(Literals::empty());
 212         let mut bytes = false;
 213         let is_set = self.options.pats.len() > 1;
 214         // If we're compiling a regex set and that set has any anchored
 215         // expressions, then disable all literal optimizations.
 216         for pat in &self.options.pats {
 217             let parser =
 218                 ExprBuilder::new()
 219                     .case_insensitive(self.options.case_insensitive)
 220                     .multi_line(self.options.multi_line)
 221                     .dot_matches_new_line(self.options.dot_matches_new_line)
 222                     .swap_greed(self.options.swap_greed)
 223                     .ignore_whitespace(self.options.ignore_whitespace)
 224                     .unicode(self.options.unicode)
 225                     .allow_bytes(!self.only_utf8);
 226             let expr = try!(parser.parse(pat));
 227             bytes = bytes || expr.has_bytes();
 228
 229             if !expr.is_anchored_start() && expr.has_anchored_start() {
 230                 // Partial anchors unfortunately make it hard to use prefixes,
 231                 // so disable them.
 232                 prefixes = None;
 233             } else if is_set && expr.is_anchored_start() {
 234                 // Regex sets with anchors do not go well with literal
 235                 // optimizations.
 236                 prefixes = None;
 237             }
 238             prefixes = prefixes.and_then(|mut prefixes| {
 239                 if !prefixes.union_prefixes(&expr) {
 240                     None
 241                 } else {
 242                     Some(prefixes)
 243                 }
 244             });
 245
 246             if !expr.is_anchored_end() && expr.has_anchored_end() {
 247                 // Partial anchors unfortunately make it hard to use suffixes,
 248                 // so disable them.
 249                 suffixes = None;
 250             } else if is_set && expr.is_anchored_end() {
 251                 // Regex sets with anchors do not go well with literal
 252                 // optimizations.
 253                 prefixes = None;
 254             }
 255             suffixes = suffixes.and_then(|mut suffixes| {
 256                 if !suffixes.union_suffixes(&expr) {
 257                     None
 258                 } else {
 259                     Some(suffixes)
 260                 }
 261             });
 262             exprs.push(expr);
 263         }
 264         Ok(Parsed {
 265             exprs: exprs,
 266             prefixes: prefixes.unwrap_or_else(Literals::empty),
 267             suffixes: suffixes.unwrap_or_else(Literals::empty),
 268             bytes: bytes,
 269         })
 270     }
 271
 272     /// Build an executor that can run a regular expression.
 273     pub fn build(self) -> Result<Exec, Error> {
 274         // Special case when we have no patterns to compile.
 275         // This can happen when compiling a regex set.
 276         if self.options.pats.is_empty() {
 277             let ro = Arc::new(ExecReadOnly {
 278                 res: vec![],
 279                 nfa: Program::new(),
 280                 dfa: Program::new(),
 281                 dfa_reverse: Program::new(),
 282                 suffixes: LiteralSearcher::empty(),
 283                 match_type: MatchType::Nothing,
 284             });
 285             return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
 286         }
 287         let parsed = try!(self.parse());
 288         let mut nfa = try!(
 289             Compiler::new()
 290                      .size_limit(self.options.size_limit)
 291                      .bytes(self.bytes || parsed.bytes)
 292                      .only_utf8(self.only_utf8)
 293                      .compile(&parsed.exprs));
 294         let mut dfa = try!(
 295             Compiler::new()
 296                      .size_limit(self.options.size_limit)
 297                      .dfa(true)
 298                      .only_utf8(self.only_utf8)
 299                      .compile(&parsed.exprs));
 300         let mut dfa_reverse = try!(
 301             Compiler::new()
 302                      .size_limit(self.options.size_limit)
 303                      .dfa(true)
 304                      .only_utf8(self.only_utf8)
 305                      .reverse(true)
 306                      .compile(&parsed.exprs));
 307
 308         let prefixes = parsed.prefixes.unambiguous_prefixes();
 309         let suffixes = parsed.suffixes.unambiguous_suffixes();
 310         nfa.prefixes = LiteralSearcher::prefixes(prefixes);
 311         dfa.prefixes = nfa.prefixes.clone();
 312         dfa.dfa_size_limit = self.options.dfa_size_limit;
 313         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
 314
 315         let mut ro = ExecReadOnly {
 316             res: self.options.pats,
 317             nfa: nfa,
 318             dfa: dfa,
 319             dfa_reverse: dfa_reverse,
 320             suffixes: LiteralSearcher::suffixes(suffixes),
 321             match_type: MatchType::Nothing,
 322         };
 323         ro.match_type = ro.choose_match_type(self.match_type);
 324
 325         let ro = Arc::new(ro);
 326         Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
 327     }
 328 }
 329
 330 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
 331     type Text = str;
 332
 333     fn slots_len(&self) -> usize { self.0.slots_len() }
 334
 335     fn next_after_empty(&self, text: &str, i: usize) -> usize {
 336         next_utf8(text.as_bytes(), i)
 337     }
 338
 339     #[inline(always)] // reduces constant overhead
 340     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
 341         self.0.shortest_match_at(text.as_bytes(), start)
 342     }
 343
 344     #[inline(always)] // reduces constant overhead
 345     fn is_match_at(&self, text: &str, start: usize) -> bool {
 346         self.0.is_match_at(text.as_bytes(), start)
 347     }
 348
 349     #[inline(always)] // reduces constant overhead
 350     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
 351         self.0.find_at(text.as_bytes(), start)
 352     }
 353
 354     #[inline(always)] // reduces constant overhead
 355     fn read_captures_at(
 356         &self,
 357         locs: &mut Locations,
 358         text: &str,
 359         start: usize,
 360     ) -> Option<(usize, usize)> {
 361         self.0.read_captures_at(locs, text.as_bytes(), start)
 362     }
 363 }
 364
 365 impl<'c> RegularExpression for ExecNoSync<'c> {
 366     type Text = [u8];
 367
 368     /// Returns the number of capture slots in the regular expression. (There
 369     /// are two slots for every capture group, corresponding to possibly empty
 370     /// start and end locations of the capture.)
 371     fn slots_len(&self) -> usize {
 372         self.ro.nfa.captures.len() * 2
 373     }
 374
 375     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
 376         i + 1
 377     }
 378
 379     /// Returns the end of a match location, possibly occurring before the
 380     /// end location of the correct leftmost-first match.
 381     #[inline(always)] // reduces constant overhead
 382     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
 383         if !self.is_anchor_end_match(text) {
 384             return None;
 385         }
 386         match self.ro.match_type {
 387             MatchType::Literal(ty) => {
 388                 self.find_literals(ty, text, start).map(|(_, e)| e)
 389             }
 390             MatchType::Dfa | MatchType::DfaMany => {
 391                 match self.shortest_dfa(text, start) {
 392                     dfa::Result::Match(end) => Some(end),
 393                     dfa::Result::NoMatch(_) => None,
 394                     dfa::Result::Quit => self.shortest_nfa(text, start),
 395                 }
 396             }
 397             MatchType::DfaAnchoredReverse => {
 398                 match dfa::Fsm::reverse(
 399                     &self.ro.dfa_reverse,
 400                     self.cache,
 401                     true,
 402                     &text[start..],
 403                     text.len(),
 404                 ) {
 405                     dfa::Result::Match(_) => Some(text.len()),
 406                     dfa::Result::NoMatch(_) => None,
 407                     dfa::Result::Quit => self.shortest_nfa(text, start),
 408                 }
 409             }
 410             MatchType::DfaSuffix => {
 411                 match self.shortest_dfa_reverse_suffix(text, start) {
 412                     dfa::Result::Match(e) => Some(e),
 413                     dfa::Result::NoMatch(_) => None,
 414                     dfa::Result::Quit => self.shortest_nfa(text, start),
 415                 }
 416             }
 417             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
 418             MatchType::Nothing => None,
 419         }
 420     }
 421
 422     /// Returns true if and only if the regex matches text.
 423     ///
 424     /// For single regular expressions, this is equivalent to calling
 425     /// shortest_match(...).is_some().
 426     #[inline(always)] // reduces constant overhead
 427     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
 428         if !self.is_anchor_end_match(text) {
 429             return false;
 430         }
 431         // We need to do this dance because shortest_match relies on the NFA
 432         // filling in captures[1], but a RegexSet has no captures. In other
 433         // words, a RegexSet can't (currently) use shortest_match. ---AG
 434         match self.ro.match_type {
 435             MatchType::Literal(ty) => {
 436                 self.find_literals(ty, text, start).is_some()
 437             }
 438             MatchType::Dfa | MatchType::DfaMany => {
 439                 match self.shortest_dfa(text, start) {
 440                     dfa::Result::Match(_) => true,
 441                     dfa::Result::NoMatch(_) => false,
 442                     dfa::Result::Quit => self.match_nfa(text, start),
 443                 }
 444             }
 445             MatchType::DfaAnchoredReverse => {
 446                 match dfa::Fsm::reverse(
 447                     &self.ro.dfa_reverse,
 448                     self.cache,
 449                     true,
 450                     &text[start..],
 451                     text.len(),
 452                 ) {
 453                     dfa::Result::Match(_) => true,
 454                     dfa::Result::NoMatch(_) => false,
 455                     dfa::Result::Quit => self.match_nfa(text, start),
 456                 }
 457             }
 458             MatchType::DfaSuffix => {
 459                 match self.shortest_dfa_reverse_suffix(text, start) {
 460                     dfa::Result::Match(_) => true,
 461                     dfa::Result::NoMatch(_) => false,
 462                     dfa::Result::Quit => self.match_nfa(text, start),
 463                 }
 464             }
 465             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
 466             MatchType::Nothing => false,
 467         }
 468     }
 469
 470     /// Finds the start and end location of the leftmost-first match, starting
 471     /// at the given location.
 472     #[inline(always)] // reduces constant overhead
 473     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
 474         if !self.is_anchor_end_match(text) {
 475             return None;
 476         }
 477         match self.ro.match_type {
 478             MatchType::Literal(ty) => {
 479                 self.find_literals(ty, text, start)
 480             }
 481             MatchType::Dfa => {
 482                 match self.find_dfa_forward(text, start) {
 483                     dfa::Result::Match((s, e)) => Some((s, e)),
 484                     dfa::Result::NoMatch(_) => None,
 485                     dfa::Result::Quit => {
 486                         self.find_nfa(MatchNfaType::Auto, text, start)
 487                     }
 488                 }
 489             }
 490             MatchType::DfaAnchoredReverse => {
 491                 match self.find_dfa_anchored_reverse(text, start) {
 492                     dfa::Result::Match((s, e)) => Some((s, e)),
 493                     dfa::Result::NoMatch(_) => None,
 494                     dfa::Result::Quit => {
 495                         self.find_nfa(MatchNfaType::Auto, text, start)
 496                     }
 497                 }
 498             }
 499             MatchType::DfaSuffix => {
 500                 match self.find_dfa_reverse_suffix(text, start) {
 501                     dfa::Result::Match((s, e)) => Some((s, e)),
 502                     dfa::Result::NoMatch(_) => None,
 503                     dfa::Result::Quit => {
 504                         self.find_nfa(MatchNfaType::Auto, text, start)
 505                     }
 506                 }
 507             }
 508             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
 509             MatchType::Nothing => None,
 510             MatchType::DfaMany => {
 511                 unreachable!("BUG: RegexSet cannot be used with find")
 512             }
 513         }
 514     }
 515
 516     /// Finds the start and end location of the leftmost-first match and also
 517     /// fills in all matching capture groups.
 518     ///
 519     /// The number of capture slots given should be equal to the total number
 520     /// of capture slots in the compiled program.
 521     ///
 522     /// Note that the first two slots always correspond to the start and end
 523     /// locations of the overall match.
 524     fn read_captures_at(
 525         &self,
 526         locs: &mut Locations,
 527         text: &[u8],
 528         start: usize,
 529     ) -> Option<(usize, usize)> {
 530         let slots = as_slots(locs);
 531         for slot in slots.iter_mut() {
 532             *slot = None;
 533         }
 534         // If the caller unnecessarily uses this, then we try to save them
 535         // from themselves.
 536         match slots.len() {
 537             0 => return self.find_at(text, start),
 538             2 => {
 539                 return self.find_at(text, start).map(|(s, e)| {
 540                     slots[0] = Some(s);
 541                     slots[1] = Some(e);
 542                     (s, e)
 543                 });
 544             }
 545             _ => {} // fallthrough
 546         }
 547         if !self.is_anchor_end_match(text) {
 548             return None;
 549         }
 550         match self.ro.match_type {
 551             MatchType::Literal(ty) => {
 552                 self.find_literals(ty, text, start).and_then(|(s, e)| {
 553                     self.captures_nfa_with_match(slots, text, s, e)
 554                 })
 555             }
 556             MatchType::Dfa => {
 557                 if self.ro.nfa.is_anchored_start {
 558                     self.captures_nfa(slots, text, start)
 559                 } else {
 560                     match self.find_dfa_forward(text, start) {
 561                         dfa::Result::Match((s, e)) => {
 562                             self.captures_nfa_with_match(slots, text, s, e)
 563                         }
 564                         dfa::Result::NoMatch(_) => None,
 565                         dfa::Result::Quit => self.captures_nfa(slots, text, start),
 566                     }
 567                 }
 568             }
 569             MatchType::DfaAnchoredReverse => {
 570                 match self.find_dfa_anchored_reverse(text, start) {
 571                     dfa::Result::Match((s, e)) => {
 572                         self.captures_nfa_with_match(slots, text, s, e)
 573                     }
 574                     dfa::Result::NoMatch(_) => None,
 575                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
 576                 }
 577             }
 578             MatchType::DfaSuffix => {
 579                 match self.find_dfa_reverse_suffix(text, start) {
 580                     dfa::Result::Match((s, e)) => {
 581                         self.captures_nfa_with_match(slots, text, s, e)
 582                     }
 583                     dfa::Result::NoMatch(_) => None,
 584                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
 585                 }
 586             }
 587             MatchType::Nfa(ty) => {
 588                 self.captures_nfa_type(ty, slots, text, start)
 589             }
 590             MatchType::Nothing => None,
 591             MatchType::DfaMany => {
 592                 unreachable!("BUG: RegexSet cannot be used with captures")
 593             }
 594         }
 595     }
 596 }
 597
 598 impl<'c> ExecNoSync<'c> {
 599     /// Finds the leftmost-first match using only literal search.
 600     #[inline(always)] // reduces constant overhead
 601     fn find_literals(
 602         &self,
 603         ty: MatchLiteralType,
 604         text: &[u8],
 605         start: usize,
 606     ) -> Option<(usize, usize)> {
 607         use self::MatchLiteralType::*;
 608         match ty {
 609             Unanchored => {
 610                 let lits = &self.ro.nfa.prefixes;
 611                 lits.find(&text[start..])
 612                     .map(|(s, e)| (start + s, start + e))
 613             }
 614             AnchoredStart => {
 615                 let lits = &self.ro.nfa.prefixes;
 616                 lits.find_start(&text[start..])
 617                     .map(|(s, e)| (start + s, start + e))
 618             }
 619             AnchoredEnd => {
 620                 let lits = &self.ro.suffixes;
 621                 lits.find_end(&text[start..])
 622                     .map(|(s, e)| (start + s, start + e))
 623             }
 624         }
 625     }
 626
 627     /// Finds the leftmost-first match (start and end) using only the DFA.
 628     ///
 629     /// If the result returned indicates that the DFA quit, then another
 630     /// matching engine should be used.
 631     #[inline(always)] // reduces constant overhead
 632     fn find_dfa_forward(
 633         &self,
 634         text: &[u8],
 635         start: usize,
 636     ) -> dfa::Result<(usize, usize)> {
 637         use dfa::Result::*;
 638         let end = match dfa::Fsm::forward(
 639             &self.ro.dfa,
 640             self.cache,
 641             false,
 642             text,
 643             start,
 644         ) {
 645             NoMatch(i) => return NoMatch(i),
 646             Quit => return Quit,
 647             Match(end) if start == end => return Match((start, start)),
 648             Match(end) => end,
 649         };
 650         // Now run the DFA in reverse to find the start of the match.
 651         match dfa::Fsm::reverse(
 652             &self.ro.dfa_reverse,
 653             self.cache,
 654             false,
 655             &text[start..],
 656             end - start,
 657         ) {
 658             Match(s) => Match((start + s, end)),
 659             NoMatch(i) => NoMatch(i),
 660             Quit => Quit,
 661         }
 662     }
 663
 664     /// Finds the leftmost-first match (start and end) using only the DFA,
 665     /// but assumes the regex is anchored at the end and therefore starts at
 666     /// the end of the regex and matches in reverse.
 667     ///
 668     /// If the result returned indicates that the DFA quit, then another
 669     /// matching engine should be used.
 670     #[inline(always)] // reduces constant overhead
 671     fn find_dfa_anchored_reverse(
 672         &self,
 673         text: &[u8],
 674         start: usize,
 675     ) -> dfa::Result<(usize, usize)> {
 676         use dfa::Result::*;
 677         match dfa::Fsm::reverse(
 678             &self.ro.dfa_reverse,
 679             self.cache,
 680             false,
 681             &text[start..],
 682             text.len() - start,
 683         ) {
 684             Match(s) => Match((start + s, text.len())),
 685             NoMatch(i) => NoMatch(i),
 686             Quit => Quit,
 687         }
 688     }
 689
 690     /// Finds the end of the shortest match using only the DFA.
 691     #[inline(always)] // reduces constant overhead
 692     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
 693         dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
 694     }
 695
 696     /// Finds the end of the shortest match using only the DFA by scanning for
 697     /// suffix literals.
 698     ///
 699     #[inline(always)] // reduces constant overhead
 700     fn shortest_dfa_reverse_suffix(
 701         &self,
 702         text: &[u8],
 703         start: usize,
 704     ) -> dfa::Result<usize> {
 705         match self.exec_dfa_reverse_suffix(text, start) {
 706             None => self.shortest_dfa(text, start),
 707             Some(r) => r.map(|(_, end)| end),
 708         }
 709     }
 710
 711     /// Finds the end of the shortest match using only the DFA by scanning for
 712     /// suffix literals. It also reports the start of the match.
 713     ///
 714     /// Note that if None is returned, then the optimization gave up to avoid
 715     /// worst case quadratic behavior. A forward scanning DFA should be tried
 716     /// next.
 717     ///
 718     /// If a match is returned and the full leftmost-first match is desired,
 719     /// then a forward scan starting from the beginning of the match must be
 720     /// done.
 721     ///
 722     /// If the result returned indicates that the DFA quit, then another
 723     /// matching engine should be used.
 724     #[inline(always)] // reduces constant overhead
 725     fn exec_dfa_reverse_suffix(
 726         &self,
 727         text: &[u8],
 728         original_start: usize,
 729     ) -> Option<dfa::Result<(usize, usize)>> {
 730         use dfa::Result::*;
 731
 732         let lcs = self.ro.suffixes.lcs();
 733         debug_assert!(lcs.len() >= 1);
 734         let mut start = original_start;
 735         let mut end = start;
 736         while end <= text.len() {
 737             start = end;
 738             end += match lcs.find(&text[end..]) {
 739                 None => return Some(NoMatch(text.len())),
 740                 Some(start) => start + lcs.len(),
 741             };
 742             match dfa::Fsm::reverse(
 743                 &self.ro.dfa_reverse,
 744                 self.cache,
 745                 false,
 746                 &text[start..end],
 747                 end - start,
 748             ) {
 749                 Match(0) | NoMatch(0) => return None,
 750                 Match(s) => return Some(Match((s + start, end))),
 751                 NoMatch(_) => continue,
 752                 Quit => return Some(Quit),
 753             };
 754         }
 755         Some(NoMatch(text.len()))
 756     }
 757
 758     /// Finds the leftmost-first match (start and end) using only the DFA
 759     /// by scanning for suffix literals.
 760     ///
 761     /// If the result returned indicates that the DFA quit, then another
 762     /// matching engine should be used.
 763     #[inline(always)] // reduces constant overhead
 764     fn find_dfa_reverse_suffix(
 765         &self,
 766         text: &[u8],
 767         start: usize,
 768     ) -> dfa::Result<(usize, usize)> {
 769         use dfa::Result::*;
 770
 771         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
 772             None => return self.find_dfa_forward(text, start),
 773             Some(Match((start, _))) => start,
 774             Some(r) => return r,
 775         };
 776         // At this point, we've found a match. The only way to quit now
 777         // without a match is if the DFA gives up (seems unlikely).
 778         //
 779         // Now run the DFA forwards to find the proper end of the match.
 780         // (The suffix literal match can only indicate the earliest
 781         // possible end location, which may appear before the end of the
 782         // leftmost-first match.)
 783         match dfa::Fsm::forward(
 784             &self.ro.dfa,
 785             self.cache,
 786             false,
 787             text,
 788             match_start,
 789         ) {
 790             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
 791             Quit => Quit,
 792             Match(e) => Match((match_start, e)),
 793         }
 794     }
 795
 796     /// Executes the NFA engine to return whether there is a match or not.
 797     ///
 798     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
 799     /// performance characteristics, but regex sets don't have captures, which
 800     /// shortest_nfa depends on.
 801     fn match_nfa(
 802         &self,
 803         text: &[u8],
 804         start: usize,
 805     ) -> bool {
 806         self.match_nfa_type(MatchNfaType::Auto, text, start)
 807     }
 808
 809     /// Like match_nfa, but allows specification of the type of NFA engine.
 810     fn match_nfa_type(
 811         &self,
 812         ty: MatchNfaType,
 813         text: &[u8],
 814         start: usize,
 815     ) -> bool {
 816         self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
 817     }
 818
 819     /// Finds the shortest match using an NFA.
 820     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
 821         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
 822     }
 823
 824     /// Like shortest_nfa, but allows specification of the type of NFA engine.
 825     fn shortest_nfa_type(
 826         &self,
 827         ty: MatchNfaType,
 828         text: &[u8],
 829         start: usize,
 830     ) -> Option<usize> {
 831         let mut slots = [None, None];
 832         if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
 833             slots[1]
 834         } else {
 835             None
 836         }
 837     }
 838
 839     /// Like find, but executes an NFA engine.
 840     fn find_nfa(
 841         &self,
 842         ty: MatchNfaType,
 843         text: &[u8],
 844         start: usize,
 845     ) -> Option<(usize, usize)> {
 846         let mut slots = [None, None];
 847         if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
 848             match (slots[0], slots[1]) {
 849                 (Some(s), Some(e)) => Some((s, e)),
 850                 _ => None,
 851             }
 852         } else {
 853             None
 854         }
 855     }
 856
 857     /// Like find_nfa, but fills in captures and restricts the search space
 858     /// using previously found match information.
 859     ///
 860     /// `slots` should have length equal to `2 * nfa.captures.len()`.
 861     fn captures_nfa_with_match(
 862         &self,
 863         slots: &mut [Slot],
 864         text: &[u8],
 865         match_start: usize,
 866         match_end: usize,
 867     ) -> Option<(usize, usize)> {
 868         // We can't use match_end directly, because we may need to examine one
 869         // "character" after the end of a match for lookahead operators. We
 870         // need to move two characters beyond the end, since some look-around
 871         // operations may falsely assume a premature end of text otherwise.
 872         let e = cmp::min(
 873             next_utf8(text, next_utf8(text, match_end)), text.len());
 874         self.captures_nfa(slots, &text[..e], match_start)
 875     }
 876
 877     /// Like find_nfa, but fills in captures.
 878     ///
 879     /// `slots` should have length equal to `2 * nfa.captures.len()`.
 880     fn captures_nfa(
 881         &self,
 882         slots: &mut [Slot],
 883         text: &[u8],
 884         start: usize,
 885     ) -> Option<(usize, usize)> {
 886         self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
 887     }
 888
 889     /// Like captures_nfa, but allows specification of type of NFA engine.
 890     fn captures_nfa_type(
 891         &self,
 892         ty: MatchNfaType,
 893         slots: &mut [Slot],
 894         text: &[u8],
 895         start: usize,
 896     ) -> Option<(usize, usize)> {
 897         if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
 898             match (slots[0], slots[1]) {
 899                 (Some(s), Some(e)) => Some((s, e)),
 900                 _ => None,
 901             }
 902         } else {
 903             None
 904         }
 905     }
 906
 907     fn exec_nfa(
 908         &self,
 909         mut ty: MatchNfaType,
 910         matches: &mut [bool],
 911         slots: &mut [Slot],
 912         quit_after_match: bool,
 913         text: &[u8],
 914         start: usize,
 915     ) -> bool {
 916         use self::MatchNfaType::*;
 917         if let Auto = ty {
 918             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
 919                 ty = Backtrack;
 920             } else {
 921                 ty = PikeVM;
 922             }
 923         }
 924         match ty {
 925             Auto => unreachable!(),
 926             Backtrack => self.exec_backtrack(matches, slots, text, start),
 927             PikeVM => {
 928                 self.exec_pikevm(
 929                     matches, slots, quit_after_match, text, start)
 930             }
 931         }
 932     }
 933
 934     /// Always run the NFA algorithm.
 935     fn exec_pikevm(
 936         &self,
 937         matches: &mut [bool],
 938         slots: &mut [Slot],
 939         quit_after_match: bool,
 940         text: &[u8],
 941         start: usize,
 942     ) -> bool {
 943         if self.ro.nfa.uses_bytes() {
 944             pikevm::Fsm::exec(
 945                 &self.ro.nfa,
 946                 self.cache,
 947                 matches,
 948                 slots,
 949                 quit_after_match,
 950                 ByteInput::new(text, self.ro.nfa.only_utf8),
 951                 start)
 952         } else {
 953             pikevm::Fsm::exec(
 954                 &self.ro.nfa,
 955                 self.cache,
 956                 matches,
 957                 slots,
 958                 quit_after_match,
 959                 CharInput::new(text),
 960                 start)
 961         }
 962     }
 963
 964     /// Always runs the NFA using bounded backtracking.
 965     fn exec_backtrack(
 966         &self,
 967         matches: &mut [bool],
 968         slots: &mut [Slot],
 969         text: &[u8],
 970         start: usize,
 971     ) -> bool {
 972         if self.ro.nfa.uses_bytes() {
 973             backtrack::Bounded::exec(
 974                 &self.ro.nfa,
 975                 self.cache,
 976                 matches,
 977                 slots,
 978                 ByteInput::new(text, self.ro.nfa.only_utf8),
 979                 start)
 980         } else {
 981             backtrack::Bounded::exec(
 982                 &self.ro.nfa,
 983                 self.cache,
 984                 matches,
 985                 slots,
 986                 CharInput::new(text),
 987                 start)
 988         }
 989     }
 990
 991     /// Finds which regular expressions match the given text.
 992     ///
 993     /// `matches` should have length equal to the number of regexes being
 994     /// searched.
 995     ///
 996     /// This is only useful when one wants to know which regexes in a set
 997     /// match some text.
 998     pub fn many_matches_at(
 999         &self,
1000         matches: &mut [bool],
1001         text: &[u8],
1002         start: usize,
1003     ) -> bool {
1004         use self::MatchType::*;
1005         if !self.is_anchor_end_match(text) {
1006             return false;
1007         }
1008         match self.ro.match_type {
1009             Literal(ty) => {
1010                 debug_assert_eq!(matches.len(), 1);
1011                 matches[0] = self.find_literals(ty, text, start).is_some();
1012                 matches[0]
1013             }
1014             Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1015                 match dfa::Fsm::forward_many(
1016                     &self.ro.dfa,
1017                     self.cache,
1018                     matches,
1019                     text,
1020                     start,
1021                 ) {
1022                     dfa::Result::Match(_) => true,
1023                     dfa::Result::NoMatch(_) => false,
1024                     dfa::Result::Quit => {
1025                         self.exec_nfa(
1026                             MatchNfaType::Auto,
1027                             matches,
1028                             &mut [],
1029                             false,
1030                             text,
1031                             start)
1032                     }
1033                 }
1034             }
1035             Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1036             Nothing => false,
1037         }
1038     }
1039
1040     #[inline(always)] // reduces constant overhead
1041     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1042         // Only do this check if the haystack is big (>1MB).
1043         if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1044             let lcs = self.ro.suffixes.lcs();
1045             if lcs.len() >= 1 && !lcs.is_suffix(text) {
1046                 return false;
1047             }
1048         }
1049         true
1050     }
1051
1052     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1053         &self.ro.nfa.capture_name_idx
1054     }
1055 }
1056
1057 impl<'c> ExecNoSyncStr<'c> {
1058     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1059         self.0.capture_name_idx()
1060     }
1061 }
1062
1063 impl Exec {
1064     /// Get a searcher that isn't Sync.
1065     #[inline(always)] // reduces constant overhead
1066     pub fn searcher(&self) -> ExecNoSync {
1067         let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1068         ExecNoSync {
1069             ro: &self.ro, // a clone is too expensive here! (and not needed)
1070             cache: self.cache.get_or(create),
1071         }
1072     }
1073
1074     /// Get a searcher that isn't Sync and can match on &str.
1075     #[inline(always)] // reduces constant overhead
1076     pub fn searcher_str(&self) -> ExecNoSyncStr {
1077         ExecNoSyncStr(self.searcher())
1078     }
1079
1080     /// Build a Regex from this executor.
1081     pub fn into_regex(self) -> re_unicode::Regex {
1082         re_unicode::Regex::from(self)
1083     }
1084
1085     /// Build a RegexSet from this executor.
1086     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1087         re_set::unicode::RegexSet::from(self)
1088     }
1089
1090     /// Build a Regex from this executor that can match arbitrary bytes.
1091     pub fn into_byte_regex(self) -> re_bytes::Regex {
1092         re_bytes::Regex::from(self)
1093     }
1094
1095     /// Build a RegexSet from this executor that can match arbitrary bytes.
1096     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1097         re_set::bytes::RegexSet::from(self)
1098     }
1099
1100     /// The original regular expressions given by the caller that were
1101     /// compiled.
1102     pub fn regex_strings(&self) -> &[String] {
1103         &self.ro.res
1104     }
1105
1106     /// Return a slice of capture names.
1107     ///
1108     /// Any capture that isn't named is None.
1109     pub fn capture_names(&self) -> &[Option<String>] {
1110         &self.ro.nfa.captures
1111     }
1112
1113     /// Return a reference to named groups mapping (from group name to
1114     /// group position).
1115     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1116         &self.ro.nfa.capture_name_idx
1117     }
1118 }
1119
1120 impl Clone for Exec {
1121     fn clone(&self) -> Exec {
1122         Exec {
1123             ro: self.ro.clone(),
1124             cache: CachedThreadLocal::new(),
1125         }
1126     }
1127 }
1128
1129 impl ExecReadOnly {
1130     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1131         use self::MatchType::*;
1132         if let Some(Nfa(_)) = hint {
1133             return hint.unwrap();
1134         }
1135         // If the NFA is empty, then we'll never match anything.
1136         if self.nfa.insts.is_empty() {
1137             return Nothing;
1138         }
1139         // If our set of prefixes is complete, then we can use it to find
1140         // a match in lieu of a regex engine. This doesn't quite work well in
1141         // the presence of multiple regexes, so only do it when there's one.
1142         //
1143         // TODO(burntsushi): Also, don't try to match literals if the regex is
1144         // partially anchored. We could technically do it, but we'd need to
1145         // create two sets of literals: all of them and then the subset that
1146         // aren't anchored. We would then only search for all of them when at
1147         // the beginning of the input and use the subset in all other cases.
1148         if self.res.len() == 1 {
1149             if self.nfa.prefixes.complete() {
1150                 return if self.nfa.is_anchored_start {
1151                     Literal(MatchLiteralType::AnchoredStart)
1152                 } else {
1153                     Literal(MatchLiteralType::Unanchored)
1154                 };
1155             }
1156             if self.suffixes.complete() {
1157                 return if self.nfa.is_anchored_end {
1158                     Literal(MatchLiteralType::AnchoredEnd)
1159                 } else {
1160                     // This case shouldn't happen. When the regex isn't
1161                     // anchored, then complete prefixes should imply complete
1162                     // suffixes.
1163                     Literal(MatchLiteralType::Unanchored)
1164                 };
1165             }
1166         }
1167         // If we can execute the DFA, then we totally should.
1168         if dfa::can_exec(&self.dfa) {
1169             // Regex sets require a slightly specialized path.
1170             if self.res.len() >= 2 {
1171                 return DfaMany;
1172             }
1173             // If the regex is anchored at the end but not the start, then
1174             // just match in reverse from the end of the haystack.
1175             if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1176                 return DfaAnchoredReverse;
1177             }
1178             // If there's a longish suffix literal, then it might be faster
1179             // to look for that first.
1180             if self.should_suffix_scan() {
1181                 return DfaSuffix;
1182             }
1183             // Fall back to your garden variety forward searching lazy DFA.
1184             return Dfa;
1185         }
1186         // We're so totally hosed.
1187         Nfa(MatchNfaType::Auto)
1188     }
1189
1190     /// Returns true if the program is amenable to suffix scanning.
1191     ///
1192     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1193     /// for suffix literals and then do a *reverse* DFA match from any matches
1194     /// produced by the literal scan. (And then followed by a forward DFA
1195     /// search, since the previously found suffix literal maybe not actually be
1196     /// the end of a match.)
1197     ///
1198     /// This is a bit of a specialized optimization, but can result in pretty
1199     /// big performance wins if 1) there are no prefix literals and 2) the
1200     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1201     /// account for but (2) is harder. As a proxy, we assume that longer
1202     /// strings are generally rarer, so we only enable this optimization when
1203     /// we have a meaty suffix.
1204     fn should_suffix_scan(&self) -> bool {
1205         if self.suffixes.is_empty() {
1206             return false;
1207         }
1208         let lcs_len = self.suffixes.lcs().char_len();
1209         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1210     }
1211 }
1212
1213 #[derive(Clone, Copy, Debug)]
1214 enum MatchType {
1215     /// A single or multiple literal search. This is only used when the regex
1216     /// can be decomposed into unambiguous literal search.
1217     Literal(MatchLiteralType),
1218     /// A normal DFA search.
1219     Dfa,
1220     /// A reverse DFA search starting from the end of a haystack.
1221     DfaAnchoredReverse,
1222     /// A reverse DFA search with suffix literal scanning.
1223     DfaSuffix,
1224     /// Use the DFA on two or more regular expressions.
1225     DfaMany,
1226     /// An NFA variant.
1227     Nfa(MatchNfaType),
1228     /// No match is ever possible, so don't ever try to search.
1229     Nothing,
1230 }
1231
1232 #[derive(Clone, Copy, Debug)]
1233 enum MatchLiteralType {
1234     /// Match literals anywhere in text.
1235     Unanchored,
1236     /// Match literals only at the start of text.
1237     AnchoredStart,
1238     /// Match literals only at the end of text.
1239     AnchoredEnd,
1240 }
1241
1242 #[derive(Clone, Copy, Debug)]
1243 enum MatchNfaType {
1244     /// Choose between Backtrack and PikeVM.
1245     Auto,
1246     /// NFA bounded backtracking.
1247     ///
1248     /// (This is only set by tests, since it never makes sense to always want
1249     /// backtracking.)
1250     Backtrack,
1251     /// The Pike VM.
1252     ///
1253     /// (This is only set by tests, since it never makes sense to always want
1254     /// the Pike VM.)
1255     PikeVM,
1256 }
1257
1258 /// `ProgramCache` maintains reusable allocations for each matching engine
1259 /// available to a particular program.
1260 pub type ProgramCache = RefCell<ProgramCacheInner>;
1261
1262 #[derive(Clone, Debug)]
1263 pub struct ProgramCacheInner {
1264     pub pikevm: pikevm::Cache,
1265     pub backtrack: backtrack::Cache,
1266     pub dfa: dfa::Cache,
1267     pub dfa_reverse: dfa::Cache,
1268 }
1269
1270 impl ProgramCacheInner {
1271     fn new(ro: &ExecReadOnly) -> Self {
1272         ProgramCacheInner {
1273             pikevm: pikevm::Cache::new(&ro.nfa),
1274             backtrack: backtrack::Cache::new(&ro.nfa),
1275             dfa: dfa::Cache::new(&ro.dfa),
1276             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1277         }
1278     }
1279 }