1 use std
::cell
::RefCell
;
2 use std
::collections
::HashMap
;
3 use std
::panic
::AssertUnwindSafe
;
6 #[cfg(feature = "perf-literal")]
7 use aho_corasick
::{AhoCorasick, AhoCorasickBuilder, MatchKind}
;
8 use syntax
::hir
::literal
::Literals
;
10 use syntax
::ParserBuilder
;
13 use compile
::Compiler
;
14 #[cfg(feature = "perf-dfa")]
17 use input
::{ByteInput, CharInput}
;
18 use literal
::LiteralSearcher
;
20 use pool
::{Pool, PoolGuard}
;
22 use re_builder
::RegexOptions
;
25 use re_trait
::{Locations, RegularExpression, Slot}
;
29 /// `Exec` manages the execution of a regular expression.
31 /// In particular, this manages the various compiled forms of a single regular
32 /// expression and the choice of which matching engine to use to execute a
33 /// regular expression.
36 /// All read only state.
37 ro
: Arc
<ExecReadOnly
>,
38 /// A pool of reusable values for the various matching engines.
40 /// Note that boxing this value is not strictly necessary, but it is an
41 /// easy way to ensure that T does not bloat the stack sized used by a pool
42 /// in the case where T is big. And this turns out to be the case at the
43 /// time of writing for regex's use of this pool. At the time of writing,
44 /// the size of a Regex on the stack is 856 bytes. Boxing this value
45 /// reduces that size to 16 bytes.
46 pool
: Box
<Pool
<ProgramCache
>>,
49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50 /// means it is no longer Sync, but we can now avoid the overhead of
51 /// synchronization to fetch the cache.
53 pub struct ExecNoSync
<'c
> {
54 /// All read only state.
55 ro
: &'c Arc
<ExecReadOnly
>,
56 /// Caches for the various matching engines.
57 cache
: PoolGuard
<'c
, ProgramCache
>,
60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
62 pub struct ExecNoSyncStr
<'c
>(ExecNoSync
<'c
>);
64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65 /// state is determined at compile time and never changes during search.
68 /// The original regular expressions given by the caller to compile.
70 /// A compiled program that is used in the NFA simulation and backtracking.
71 /// It can be byte-based or Unicode codepoint based.
73 /// N.B. It is not possibly to make this byte-based from the public API.
74 /// It is only used for testing byte based programs in the NFA simulations.
76 /// A compiled byte based program for DFA execution. This is only used
77 /// if a DFA can be executed. (Currently, only word boundary assertions are
78 /// not supported.) Note that this program contains an embedded `.*?`
79 /// preceding the first capture group, unless the regex is anchored at the
82 /// The same as above, except the program is reversed (and there is no
83 /// preceding `.*?`). This is used by the DFA to find the starting location
86 /// A set of suffix literals extracted from the regex.
88 /// Prefix literals are stored on the `Program`, since they are used inside
89 /// the matching engines.
90 suffixes
: LiteralSearcher
,
91 /// An Aho-Corasick automaton with leftmost-first match semantics.
93 /// This is only set when the entire regex is a simple unanchored
94 /// alternation of literals. We could probably use it more circumstances,
95 /// but this is already hacky enough in this architecture.
97 /// N.B. We use u32 as a state ID representation under the assumption that
98 /// if we were to exhaust the ID space, we probably would have long
99 /// surpassed the compilation size limit.
100 #[cfg(feature = "perf-literal")]
101 ac
: Option
<AhoCorasick
<u32>>,
102 /// match_type encodes as much upfront knowledge about how we're going to
103 /// execute a search as possible.
104 match_type
: MatchType
,
107 /// Facilitates the construction of an executor by exposing various knobs
108 /// to control how a regex is executed and what kinds of resources it's
109 /// permitted to use.
110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
112 #[allow(missing_debug_implementations)]
113 pub struct ExecBuilder
{
114 options
: RegexOptions
,
115 match_type
: Option
<MatchType
>,
120 /// Parsed represents a set of parsed regular expressions and their detected
130 /// Create a regex execution builder.
132 /// This uses default settings for everything except the regex itself,
133 /// which must be provided. Further knobs can be set by calling methods,
134 /// and then finally, `build` to actually create the executor.
135 pub fn new(re
: &str) -> Self {
136 Self::new_many(&[re
])
139 /// Like new, but compiles the union of the given regular expressions.
141 /// Note that when compiling 2 or more regular expressions, capture groups
142 /// are completely unsupported. (This means both `find` and `captures`
144 pub fn new_many
<I
, S
>(res
: I
) -> Self
147 I
: IntoIterator
<Item
= S
>,
149 let mut opts
= RegexOptions
::default();
150 opts
.pats
= res
.into_iter().map(|s
| s
.as_ref().to_owned()).collect();
151 Self::new_options(opts
)
154 /// Create a regex execution builder.
155 pub fn new_options(opts
: RegexOptions
) -> Self {
164 /// Set the matching engine to be automatically determined.
166 /// This is the default state and will apply whatever optimizations are
167 /// possible, such as running a DFA.
169 /// This overrides whatever was previously set via the `nfa` or
170 /// `bounded_backtracking` methods.
171 pub fn automatic(mut self) -> Self {
172 self.match_type
= None
;
176 /// Sets the matching engine to use the NFA algorithm no matter what
177 /// optimizations are possible.
179 /// This overrides whatever was previously set via the `automatic` or
180 /// `bounded_backtracking` methods.
181 pub fn nfa(mut self) -> Self {
182 self.match_type
= Some(MatchType
::Nfa(MatchNfaType
::PikeVM
));
186 /// Sets the matching engine to use a bounded backtracking engine no
187 /// matter what optimizations are possible.
189 /// One must use this with care, since the bounded backtracking engine
190 /// uses memory proportion to `len(regex) * len(text)`.
192 /// This overrides whatever was previously set via the `automatic` or
194 pub fn bounded_backtracking(mut self) -> Self {
195 self.match_type
= Some(MatchType
::Nfa(MatchNfaType
::Backtrack
));
199 /// Compiles byte based programs for use with the NFA matching engines.
201 /// By default, the NFA engines match on Unicode scalar values. They can
202 /// be made to use byte based programs instead. In general, the byte based
203 /// programs are slower because of a less efficient encoding of character
206 /// Note that this does not impact DFA matching engines, which always
207 /// execute on bytes.
208 pub fn bytes(mut self, yes
: bool
) -> Self {
213 /// When disabled, the program compiled may match arbitrary bytes.
215 /// When enabled (the default), all compiled programs exclusively match
216 /// valid UTF-8 bytes.
217 pub fn only_utf8(mut self, yes
: bool
) -> Self {
218 self.only_utf8
= yes
;
222 /// Set the Unicode flag.
223 pub fn unicode(mut self, yes
: bool
) -> Self {
224 self.options
.unicode
= yes
;
228 /// Parse the current set of patterns into their AST and extract literals.
229 fn parse(&self) -> Result
<Parsed
, Error
> {
230 let mut exprs
= Vec
::with_capacity(self.options
.pats
.len());
231 let mut prefixes
= Some(Literals
::empty());
232 let mut suffixes
= Some(Literals
::empty());
233 let mut bytes
= false;
234 let is_set
= self.options
.pats
.len() > 1;
235 // If we're compiling a regex set and that set has any anchored
236 // expressions, then disable all literal optimizations.
237 for pat
in &self.options
.pats
{
238 let mut parser
= ParserBuilder
::new()
239 .octal(self.options
.octal
)
240 .case_insensitive(self.options
.case_insensitive
)
241 .multi_line(self.options
.multi_line
)
242 .dot_matches_new_line(self.options
.dot_matches_new_line
)
243 .swap_greed(self.options
.swap_greed
)
244 .ignore_whitespace(self.options
.ignore_whitespace
)
245 .unicode(self.options
.unicode
)
246 .allow_invalid_utf8(!self.only_utf8
)
247 .nest_limit(self.options
.nest_limit
)
250 parser
.parse(pat
).map_err(|e
| Error
::Syntax(e
.to_string()))?
;
251 bytes
= bytes
|| !expr
.is_always_utf8();
253 if cfg
!(feature
= "perf-literal") {
254 if !expr
.is_anchored_start() && expr
.is_any_anchored_start() {
255 // Partial anchors unfortunately make it hard to use
256 // prefixes, so disable them.
258 } else if is_set
&& expr
.is_anchored_start() {
259 // Regex sets with anchors do not go well with literal
263 prefixes
= prefixes
.and_then(|mut prefixes
| {
264 if !prefixes
.union_prefixes(&expr
) {
271 if !expr
.is_anchored_end() && expr
.is_any_anchored_end() {
272 // Partial anchors unfortunately make it hard to use
273 // suffixes, so disable them.
275 } else if is_set
&& expr
.is_anchored_end() {
276 // Regex sets with anchors do not go well with literal
280 suffixes
= suffixes
.and_then(|mut suffixes
| {
281 if !suffixes
.union_suffixes(&expr
) {
292 prefixes
: prefixes
.unwrap_or_else(Literals
::empty
),
293 suffixes
: suffixes
.unwrap_or_else(Literals
::empty
),
298 /// Build an executor that can run a regular expression.
299 pub fn build(self) -> Result
<Exec
, Error
> {
300 // Special case when we have no patterns to compile.
301 // This can happen when compiling a regex set.
302 if self.options
.pats
.is_empty() {
303 let ro
= Arc
::new(ExecReadOnly
{
307 dfa_reverse
: Program
::new(),
308 suffixes
: LiteralSearcher
::empty(),
309 #[cfg(feature = "perf-literal")]
311 match_type
: MatchType
::Nothing
,
313 let pool
= ExecReadOnly
::new_pool(&ro
);
314 return Ok(Exec { ro: ro, pool }
);
316 let parsed
= self.parse()?
;
317 let mut nfa
= Compiler
::new()
318 .size_limit(self.options
.size_limit
)
319 .bytes(self.bytes
|| parsed
.bytes
)
320 .only_utf8(self.only_utf8
)
321 .compile(&parsed
.exprs
)?
;
322 let mut dfa
= Compiler
::new()
323 .size_limit(self.options
.size_limit
)
325 .only_utf8(self.only_utf8
)
326 .compile(&parsed
.exprs
)?
;
327 let mut dfa_reverse
= Compiler
::new()
328 .size_limit(self.options
.size_limit
)
330 .only_utf8(self.only_utf8
)
332 .compile(&parsed
.exprs
)?
;
334 #[cfg(feature = "perf-literal")]
335 let ac
= self.build_aho_corasick(&parsed
);
336 nfa
.prefixes
= LiteralSearcher
::prefixes(parsed
.prefixes
);
337 dfa
.prefixes
= nfa
.prefixes
.clone();
338 dfa
.dfa_size_limit
= self.options
.dfa_size_limit
;
339 dfa_reverse
.dfa_size_limit
= self.options
.dfa_size_limit
;
341 let mut ro
= ExecReadOnly
{
342 res
: self.options
.pats
,
345 dfa_reverse
: dfa_reverse
,
346 suffixes
: LiteralSearcher
::suffixes(parsed
.suffixes
),
347 #[cfg(feature = "perf-literal")]
349 match_type
: MatchType
::Nothing
,
351 ro
.match_type
= ro
.choose_match_type(self.match_type
);
353 let ro
= Arc
::new(ro
);
354 let pool
= ExecReadOnly
::new_pool(&ro
);
355 Ok(Exec { ro, pool }
)
358 #[cfg(feature = "perf-literal")]
359 fn build_aho_corasick(&self, parsed
: &Parsed
) -> Option
<AhoCorasick
<u32>> {
360 if parsed
.exprs
.len() != 1 {
363 let lits
= match alternation_literals(&parsed
.exprs
[0]) {
367 // If we have a small number of literals, then let Teddy handle
368 // things (see literal/mod.rs).
369 if lits
.len() <= 32 {
373 AhoCorasickBuilder
::new()
374 .match_kind(MatchKind
::LeftmostFirst
)
375 .auto_configure(&lits
)
376 // We always want this to reduce size, regardless
377 // of what auto-configure does.
379 .build_with_size
::<u32, _
, _
>(&lits
)
380 // This should never happen because we'd long exceed the
381 // compilation limit for regexes first.
382 .expect("AC automaton too big"),
387 impl<'c
> RegularExpression
for ExecNoSyncStr
<'c
> {
390 fn slots_len(&self) -> usize {
394 fn next_after_empty(&self, text
: &str, i
: usize) -> usize {
395 next_utf8(text
.as_bytes(), i
)
398 #[cfg_attr(feature = "perf-inline", inline(always))]
399 fn shortest_match_at(&self, text
: &str, start
: usize) -> Option
<usize> {
400 self.0.shortest_match_at(text
.as_bytes(), start
)
403 #[cfg_attr(feature = "perf-inline", inline(always))]
404 fn is_match_at(&self, text
: &str, start
: usize) -> bool
{
405 self.0.is_match_at
(text
.as_bytes(), start
)
408 #[cfg_attr(feature = "perf-inline", inline(always))]
409 fn find_at(&self, text
: &str, start
: usize) -> Option
<(usize, usize)> {
410 self.0.find_at
(text
.as_bytes(), start
)
413 #[cfg_attr(feature = "perf-inline", inline(always))]
416 locs
: &mut Locations
,
419 ) -> Option
<(usize, usize)> {
420 self.0.captures_read_at(locs
, text
.as_bytes(), start
)
424 impl<'c
> RegularExpression
for ExecNoSync
<'c
> {
427 /// Returns the number of capture slots in the regular expression. (There
428 /// are two slots for every capture group, corresponding to possibly empty
429 /// start and end locations of the capture.)
430 fn slots_len(&self) -> usize {
431 self.ro
.nfa
.captures
.len() * 2
434 fn next_after_empty(&self, _text
: &[u8], i
: usize) -> usize {
438 /// Returns the end of a match location, possibly occurring before the
439 /// end location of the correct leftmost-first match.
440 #[cfg_attr(feature = "perf-inline", inline(always))]
441 fn shortest_match_at(&self, text
: &[u8], start
: usize) -> Option
<usize> {
442 if !self.is_anchor_end_match(text
) {
445 match self.ro
.match_type
{
446 #[cfg(feature = "perf-literal")]
447 MatchType
::Literal(ty
) => {
448 self.find_literals(ty
, text
, start
).map(|(_
, e
)| e
)
450 #[cfg(feature = "perf-dfa")]
451 MatchType
::Dfa
| MatchType
::DfaMany
=> {
452 match self.shortest_dfa(text
, start
) {
453 dfa
::Result
::Match(end
) => Some(end
),
454 dfa
::Result
::NoMatch(_
) => None
,
455 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
458 #[cfg(feature = "perf-dfa")]
459 MatchType
::DfaAnchoredReverse
=> {
460 match dfa
::Fsm
::reverse(
461 &self.ro
.dfa_reverse
,
467 dfa
::Result
::Match(_
) => Some(text
.len()),
468 dfa
::Result
::NoMatch(_
) => None
,
469 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
472 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
473 MatchType
::DfaSuffix
=> {
474 match self.shortest_dfa_reverse_suffix(text
, start
) {
475 dfa
::Result
::Match(e
) => Some(e
),
476 dfa
::Result
::NoMatch(_
) => None
,
477 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
480 MatchType
::Nfa(ty
) => self.shortest_nfa_type(ty
, text
, start
),
481 MatchType
::Nothing
=> None
,
485 /// Returns true if and only if the regex matches text.
487 /// For single regular expressions, this is equivalent to calling
488 /// shortest_match(...).is_some().
489 #[cfg_attr(feature = "perf-inline", inline(always))]
490 fn is_match_at(&self, text
: &[u8], start
: usize) -> bool
{
491 if !self.is_anchor_end_match(text
) {
494 // We need to do this dance because shortest_match relies on the NFA
495 // filling in captures[1], but a RegexSet has no captures. In other
496 // words, a RegexSet can't (currently) use shortest_match. ---AG
497 match self.ro
.match_type
{
498 #[cfg(feature = "perf-literal")]
499 MatchType
::Literal(ty
) => {
500 self.find_literals(ty
, text
, start
).is_some()
502 #[cfg(feature = "perf-dfa")]
503 MatchType
::Dfa
| MatchType
::DfaMany
=> {
504 match self.shortest_dfa(text
, start
) {
505 dfa
::Result
::Match(_
) => true,
506 dfa
::Result
::NoMatch(_
) => false,
507 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
510 #[cfg(feature = "perf-dfa")]
511 MatchType
::DfaAnchoredReverse
=> {
512 match dfa
::Fsm
::reverse(
513 &self.ro
.dfa_reverse
,
519 dfa
::Result
::Match(_
) => true,
520 dfa
::Result
::NoMatch(_
) => false,
521 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
524 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
525 MatchType
::DfaSuffix
=> {
526 match self.shortest_dfa_reverse_suffix(text
, start
) {
527 dfa
::Result
::Match(_
) => true,
528 dfa
::Result
::NoMatch(_
) => false,
529 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
532 MatchType
::Nfa(ty
) => self.match_nfa_type(ty
, text
, start
),
533 MatchType
::Nothing
=> false,
537 /// Finds the start and end location of the leftmost-first match, starting
538 /// at the given location.
539 #[cfg_attr(feature = "perf-inline", inline(always))]
540 fn find_at(&self, text
: &[u8], start
: usize) -> Option
<(usize, usize)> {
541 if !self.is_anchor_end_match(text
) {
544 match self.ro
.match_type
{
545 #[cfg(feature = "perf-literal")]
546 MatchType
::Literal(ty
) => self.find_literals(ty
, text
, start
),
547 #[cfg(feature = "perf-dfa")]
548 MatchType
::Dfa
=> match self.find_dfa_forward(text
, start
) {
549 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
550 dfa
::Result
::NoMatch(_
) => None
,
551 dfa
::Result
::Quit
=> {
552 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
555 #[cfg(feature = "perf-dfa")]
556 MatchType
::DfaAnchoredReverse
=> {
557 match self.find_dfa_anchored_reverse(text
, start
) {
558 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
559 dfa
::Result
::NoMatch(_
) => None
,
560 dfa
::Result
::Quit
=> {
561 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
565 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
566 MatchType
::DfaSuffix
=> {
567 match self.find_dfa_reverse_suffix(text
, start
) {
568 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
569 dfa
::Result
::NoMatch(_
) => None
,
570 dfa
::Result
::Quit
=> {
571 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
575 MatchType
::Nfa(ty
) => self.find_nfa(ty
, text
, start
),
576 MatchType
::Nothing
=> None
,
577 #[cfg(feature = "perf-dfa")]
578 MatchType
::DfaMany
=> {
579 unreachable
!("BUG: RegexSet cannot be used with find")
584 /// Finds the start and end location of the leftmost-first match and also
585 /// fills in all matching capture groups.
587 /// The number of capture slots given should be equal to the total number
588 /// of capture slots in the compiled program.
590 /// Note that the first two slots always correspond to the start and end
591 /// locations of the overall match.
594 locs
: &mut Locations
,
597 ) -> Option
<(usize, usize)> {
598 let slots
= locs
.as_slots();
599 for slot
in slots
.iter_mut() {
602 // If the caller unnecessarily uses this, then we try to save them
605 0 => return self.find_at(text
, start
),
607 return self.find_at(text
, start
).map(|(s
, e
)| {
613 _
=> {}
// fallthrough
615 if !self.is_anchor_end_match(text
) {
618 match self.ro
.match_type
{
619 #[cfg(feature = "perf-literal")]
620 MatchType
::Literal(ty
) => {
621 self.find_literals(ty
, text
, start
).and_then(|(s
, e
)| {
622 self.captures_nfa_type(
631 #[cfg(feature = "perf-dfa")]
633 if self.ro
.nfa
.is_anchored_start
{
634 self.captures_nfa(slots
, text
, start
)
636 match self.find_dfa_forward(text
, start
) {
637 dfa
::Result
::Match((s
, e
)) => self.captures_nfa_type(
644 dfa
::Result
::NoMatch(_
) => None
,
645 dfa
::Result
::Quit
=> {
646 self.captures_nfa(slots
, text
, start
)
651 #[cfg(feature = "perf-dfa")]
652 MatchType
::DfaAnchoredReverse
=> {
653 match self.find_dfa_anchored_reverse(text
, start
) {
654 dfa
::Result
::Match((s
, e
)) => self.captures_nfa_type(
661 dfa
::Result
::NoMatch(_
) => None
,
662 dfa
::Result
::Quit
=> self.captures_nfa(slots
, text
, start
),
665 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
666 MatchType
::DfaSuffix
=> {
667 match self.find_dfa_reverse_suffix(text
, start
) {
668 dfa
::Result
::Match((s
, e
)) => self.captures_nfa_type(
675 dfa
::Result
::NoMatch(_
) => None
,
676 dfa
::Result
::Quit
=> self.captures_nfa(slots
, text
, start
),
679 MatchType
::Nfa(ty
) => {
680 self.captures_nfa_type(ty
, slots
, text
, start
, text
.len())
682 MatchType
::Nothing
=> None
,
683 #[cfg(feature = "perf-dfa")]
684 MatchType
::DfaMany
=> {
685 unreachable
!("BUG: RegexSet cannot be used with captures")
691 impl<'c
> ExecNoSync
<'c
> {
692 /// Finds the leftmost-first match using only literal search.
693 #[cfg(feature = "perf-literal")]
694 #[cfg_attr(feature = "perf-inline", inline(always))]
697 ty
: MatchLiteralType
,
700 ) -> Option
<(usize, usize)> {
701 use self::MatchLiteralType
::*;
704 let lits
= &self.ro
.nfa
.prefixes
;
705 lits
.find(&text
[start
..]).map(|(s
, e
)| (start
+ s
, start
+ e
))
708 let lits
= &self.ro
.nfa
.prefixes
;
709 if start
== 0 || !self.ro
.nfa
.is_anchored_start
{
710 lits
.find_start(&text
[start
..])
711 .map(|(s
, e
)| (start
+ s
, start
+ e
))
717 let lits
= &self.ro
.suffixes
;
718 lits
.find_end(&text
[start
..])
719 .map(|(s
, e
)| (start
+ s
, start
+ e
))
726 .find(&text
[start
..])
727 .map(|m
| (start
+ m
.start(), start
+ m
.end())),
731 /// Finds the leftmost-first match (start and end) using only the DFA.
733 /// If the result returned indicates that the DFA quit, then another
734 /// matching engine should be used.
735 #[cfg(feature = "perf-dfa")]
736 #[cfg_attr(feature = "perf-inline", inline(always))]
741 ) -> dfa
::Result
<(usize, usize)> {
743 let end
= match dfa
::Fsm
::forward(
750 NoMatch(i
) => return NoMatch(i
),
752 Match(end
) if start
== end
=> return Match((start
, start
)),
755 // Now run the DFA in reverse to find the start of the match.
756 match dfa
::Fsm
::reverse(
757 &self.ro
.dfa_reverse
,
763 Match(s
) => Match((start
+ s
, end
)),
764 NoMatch(i
) => NoMatch(i
),
769 /// Finds the leftmost-first match (start and end) using only the DFA,
770 /// but assumes the regex is anchored at the end and therefore starts at
771 /// the end of the regex and matches in reverse.
773 /// If the result returned indicates that the DFA quit, then another
774 /// matching engine should be used.
775 #[cfg(feature = "perf-dfa")]
776 #[cfg_attr(feature = "perf-inline", inline(always))]
777 fn find_dfa_anchored_reverse(
781 ) -> dfa
::Result
<(usize, usize)> {
783 match dfa
::Fsm
::reverse(
784 &self.ro
.dfa_reverse
,
790 Match(s
) => Match((start
+ s
, text
.len())),
791 NoMatch(i
) => NoMatch(i
),
796 /// Finds the end of the shortest match using only the DFA.
797 #[cfg(feature = "perf-dfa")]
798 #[cfg_attr(feature = "perf-inline", inline(always))]
799 fn shortest_dfa(&self, text
: &[u8], start
: usize) -> dfa
::Result
<usize> {
800 dfa
::Fsm
::forward(&self.ro
.dfa
, self.cache
.value(), true, text
, start
)
803 /// Finds the end of the shortest match using only the DFA by scanning for
805 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
806 #[cfg_attr(feature = "perf-inline", inline(always))]
807 fn shortest_dfa_reverse_suffix(
811 ) -> dfa
::Result
<usize> {
812 match self.exec_dfa_reverse_suffix(text
, start
) {
813 None
=> self.shortest_dfa(text
, start
),
814 Some(r
) => r
.map(|(_
, end
)| end
),
818 /// Finds the end of the shortest match using only the DFA by scanning for
819 /// suffix literals. It also reports the start of the match.
821 /// Note that if None is returned, then the optimization gave up to avoid
822 /// worst case quadratic behavior. A forward scanning DFA should be tried
825 /// If a match is returned and the full leftmost-first match is desired,
826 /// then a forward scan starting from the beginning of the match must be
829 /// If the result returned indicates that the DFA quit, then another
830 /// matching engine should be used.
831 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
832 #[cfg_attr(feature = "perf-inline", inline(always))]
833 fn exec_dfa_reverse_suffix(
836 original_start
: usize,
837 ) -> Option
<dfa
::Result
<(usize, usize)>> {
840 let lcs
= self.ro
.suffixes
.lcs();
841 debug_assert
!(lcs
.len() >= 1);
842 let mut start
= original_start
;
844 let mut last_literal
= start
;
845 while end
<= text
.len() {
846 last_literal
+= match lcs
.find(&text
[last_literal
..]) {
847 None
=> return Some(NoMatch(text
.len())),
850 end
= last_literal
+ lcs
.len();
851 match dfa
::Fsm
::reverse(
852 &self.ro
.dfa_reverse
,
858 Match(0) | NoMatch(0) => return None
,
859 Match(i
) => return Some(Match((start
+ i
, end
))),
865 Quit
=> return Some(Quit
),
868 Some(NoMatch(text
.len()))
871 /// Finds the leftmost-first match (start and end) using only the DFA
872 /// by scanning for suffix literals.
874 /// If the result returned indicates that the DFA quit, then another
875 /// matching engine should be used.
876 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
877 #[cfg_attr(feature = "perf-inline", inline(always))]
878 fn find_dfa_reverse_suffix(
882 ) -> dfa
::Result
<(usize, usize)> {
885 let match_start
= match self.exec_dfa_reverse_suffix(text
, start
) {
886 None
=> return self.find_dfa_forward(text
, start
),
887 Some(Match((start
, _
))) => start
,
890 // At this point, we've found a match. The only way to quit now
891 // without a match is if the DFA gives up (seems unlikely).
893 // Now run the DFA forwards to find the proper end of the match.
894 // (The suffix literal match can only indicate the earliest
895 // possible end location, which may appear before the end of the
896 // leftmost-first match.)
897 match dfa
::Fsm
::forward(
904 NoMatch(_
) => panic
!("BUG: reverse match implies forward match"),
906 Match(e
) => Match((match_start
, e
)),
910 /// Executes the NFA engine to return whether there is a match or not.
912 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
913 /// performance characteristics, but regex sets don't have captures, which
914 /// shortest_nfa depends on.
915 #[cfg(feature = "perf-dfa")]
916 fn match_nfa(&self, text
: &[u8], start
: usize) -> bool
{
917 self.match_nfa_type(MatchNfaType
::Auto
, text
, start
)
920 /// Like match_nfa, but allows specification of the type of NFA engine.
939 /// Finds the shortest match using an NFA.
940 #[cfg(feature = "perf-dfa")]
941 fn shortest_nfa(&self, text
: &[u8], start
: usize) -> Option
<usize> {
942 self.shortest_nfa_type(MatchNfaType
::Auto
, text
, start
)
945 /// Like shortest_nfa, but allows specification of the type of NFA engine.
946 fn shortest_nfa_type(
952 let mut slots
= [None
, None
];
969 /// Like find, but executes an NFA engine.
975 ) -> Option
<(usize, usize)> {
976 let mut slots
= [None
, None
];
987 match (slots
[0], slots
[1]) {
988 (Some(s
), Some(e
)) => Some((s
, e
)),
996 /// Like find_nfa, but fills in captures.
998 /// `slots` should have length equal to `2 * nfa.captures.len()`.
999 #[cfg(feature = "perf-dfa")]
1005 ) -> Option
<(usize, usize)> {
1006 self.captures_nfa_type(
1015 /// Like captures_nfa, but allows specification of type of NFA engine.
1016 fn captures_nfa_type(
1023 ) -> Option
<(usize, usize)> {
1034 match (slots
[0], slots
[1]) {
1035 (Some(s
), Some(e
)) => Some((s
, e
)),
1045 mut ty
: MatchNfaType
,
1046 matches
: &mut [bool
],
1048 quit_after_match
: bool
,
1049 quit_after_match_with_pos
: bool
,
1054 use self::MatchNfaType
::*;
1056 if backtrack
::should_exec(self.ro
.nfa
.len(), text
.len()) {
1062 // The backtracker can't return the shortest match position as it is
1063 // implemented today. So if someone calls `shortest_match` and we need
1064 // to run an NFA, then use the PikeVM.
1065 if quit_after_match_with_pos
|| ty
== PikeVM
{
1075 self.exec_backtrack(matches
, slots
, text
, start
, end
)
1079 /// Always run the NFA algorithm.
1082 matches
: &mut [bool
],
1084 quit_after_match
: bool
,
1089 if self.ro
.nfa
.uses_bytes() {
1096 ByteInput
::new(text
, self.ro
.nfa
.only_utf8
),
1107 CharInput
::new(text
),
1114 /// Always runs the NFA using bounded backtracking.
1117 matches
: &mut [bool
],
1123 if self.ro
.nfa
.uses_bytes() {
1124 backtrack
::Bounded
::exec(
1129 ByteInput
::new(text
, self.ro
.nfa
.only_utf8
),
1134 backtrack
::Bounded
::exec(
1139 CharInput
::new(text
),
1146 /// Finds which regular expressions match the given text.
1148 /// `matches` should have length equal to the number of regexes being
1151 /// This is only useful when one wants to know which regexes in a set
1152 /// match some text.
1153 pub fn many_matches_at(
1155 matches
: &mut [bool
],
1159 use self::MatchType
::*;
1160 if !self.is_anchor_end_match(text
) {
1163 match self.ro
.match_type
{
1164 #[cfg(feature = "perf-literal")]
1166 debug_assert_eq
!(matches
.len(), 1);
1167 matches
[0] = self.find_literals(ty
, text
, start
).is_some();
1170 #[cfg(feature = "perf-dfa")]
1171 Dfa
| DfaAnchoredReverse
| DfaMany
=> {
1172 match dfa
::Fsm
::forward_many(
1179 dfa
::Result
::Match(_
) => true,
1180 dfa
::Result
::NoMatch(_
) => false,
1181 dfa
::Result
::Quit
=> self.exec_nfa(
1193 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1195 match dfa
::Fsm
::forward_many(
1202 dfa
::Result
::Match(_
) => true,
1203 dfa
::Result
::NoMatch(_
) => false,
1204 dfa
::Result
::Quit
=> self.exec_nfa(
1216 Nfa(ty
) => self.exec_nfa(
1230 #[cfg_attr(feature = "perf-inline", inline(always))]
1231 fn is_anchor_end_match(&self, text
: &[u8]) -> bool
{
1232 #[cfg(not(feature = "perf-literal"))]
1233 fn imp(_
: &ExecReadOnly
, _
: &[u8]) -> bool
{
1237 #[cfg(feature = "perf-literal")]
1238 fn imp(ro
: &ExecReadOnly
, text
: &[u8]) -> bool
{
1239 // Only do this check if the haystack is big (>1MB).
1240 if text
.len() > (1 << 20) && ro
.nfa
.is_anchored_end
{
1241 let lcs
= ro
.suffixes
.lcs();
1242 if lcs
.len() >= 1 && !lcs
.is_suffix(text
) {
1252 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1253 &self.ro
.nfa
.capture_name_idx
1257 impl<'c
> ExecNoSyncStr
<'c
> {
1258 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1259 self.0.capture_name_idx()
1264 /// Get a searcher that isn't Sync.
1265 #[cfg_attr(feature = "perf-inline", inline(always))]
1266 pub fn searcher(&self) -> ExecNoSync
{
1268 ro
: &self.ro
, // a clone is too expensive here! (and not needed)
1269 cache
: self.pool
.get(),
1273 /// Get a searcher that isn't Sync and can match on &str.
1274 #[cfg_attr(feature = "perf-inline", inline(always))]
1275 pub fn searcher_str(&self) -> ExecNoSyncStr
{
1276 ExecNoSyncStr(self.searcher())
1279 /// Build a Regex from this executor.
1280 pub fn into_regex(self) -> re_unicode
::Regex
{
1281 re_unicode
::Regex
::from(self)
1284 /// Build a RegexSet from this executor.
1285 pub fn into_regex_set(self) -> re_set
::unicode
::RegexSet
{
1286 re_set
::unicode
::RegexSet
::from(self)
1289 /// Build a Regex from this executor that can match arbitrary bytes.
1290 pub fn into_byte_regex(self) -> re_bytes
::Regex
{
1291 re_bytes
::Regex
::from(self)
1294 /// Build a RegexSet from this executor that can match arbitrary bytes.
1295 pub fn into_byte_regex_set(self) -> re_set
::bytes
::RegexSet
{
1296 re_set
::bytes
::RegexSet
::from(self)
1299 /// The original regular expressions given by the caller that were
1301 pub fn regex_strings(&self) -> &[String
] {
1305 /// Return a slice of capture names.
1307 /// Any capture that isn't named is None.
1308 pub fn capture_names(&self) -> &[Option
<String
>] {
1309 &self.ro
.nfa
.captures
1312 /// Return a reference to named groups mapping (from group name to
1313 /// group position).
1314 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1315 &self.ro
.nfa
.capture_name_idx
1319 impl Clone
for Exec
{
1320 fn clone(&self) -> Exec
{
1321 let pool
= ExecReadOnly
::new_pool(&self.ro
);
1322 Exec { ro: self.ro.clone(), pool }
1327 fn choose_match_type(&self, hint
: Option
<MatchType
>) -> MatchType
{
1328 if let Some(MatchType
::Nfa(_
)) = hint
{
1329 return hint
.unwrap();
1331 // If the NFA is empty, then we'll never match anything.
1332 if self.nfa
.insts
.is_empty() {
1333 return MatchType
::Nothing
;
1335 if let Some(literalty
) = self.choose_literal_match_type() {
1338 if let Some(dfaty
) = self.choose_dfa_match_type() {
1341 // We're so totally hosed.
1342 MatchType
::Nfa(MatchNfaType
::Auto
)
1345 /// If a plain literal scan can be used, then a corresponding literal
1346 /// search type is returned.
1347 fn choose_literal_match_type(&self) -> Option
<MatchType
> {
1348 #[cfg(not(feature = "perf-literal"))]
1349 fn imp(_
: &ExecReadOnly
) -> Option
<MatchType
> {
1353 #[cfg(feature = "perf-literal")]
1354 fn imp(ro
: &ExecReadOnly
) -> Option
<MatchType
> {
1355 // If our set of prefixes is complete, then we can use it to find
1356 // a match in lieu of a regex engine. This doesn't quite work well
1357 // in the presence of multiple regexes, so only do it when there's
1360 // TODO(burntsushi): Also, don't try to match literals if the regex
1361 // is partially anchored. We could technically do it, but we'd need
1362 // to create two sets of literals: all of them and then the subset
1363 // that aren't anchored. We would then only search for all of them
1364 // when at the beginning of the input and use the subset in all
1366 if ro
.res
.len() != 1 {
1369 if ro
.ac
.is_some() {
1370 return Some(MatchType
::Literal(
1371 MatchLiteralType
::AhoCorasick
,
1374 if ro
.nfa
.prefixes
.complete() {
1375 return if ro
.nfa
.is_anchored_start
{
1376 Some(MatchType
::Literal(MatchLiteralType
::AnchoredStart
))
1378 Some(MatchType
::Literal(MatchLiteralType
::Unanchored
))
1381 if ro
.suffixes
.complete() {
1382 return if ro
.nfa
.is_anchored_end
{
1383 Some(MatchType
::Literal(MatchLiteralType
::AnchoredEnd
))
1385 // This case shouldn't happen. When the regex isn't
1386 // anchored, then complete prefixes should imply complete
1388 Some(MatchType
::Literal(MatchLiteralType
::Unanchored
))
1397 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
1398 fn choose_dfa_match_type(&self) -> Option
<MatchType
> {
1399 #[cfg(not(feature = "perf-dfa"))]
1400 fn imp(_
: &ExecReadOnly
) -> Option
<MatchType
> {
1404 #[cfg(feature = "perf-dfa")]
1405 fn imp(ro
: &ExecReadOnly
) -> Option
<MatchType
> {
1406 if !dfa
::can_exec(&ro
.dfa
) {
1409 // Regex sets require a slightly specialized path.
1410 if ro
.res
.len() >= 2 {
1411 return Some(MatchType
::DfaMany
);
1413 // If the regex is anchored at the end but not the start, then
1414 // just match in reverse from the end of the haystack.
1415 if !ro
.nfa
.is_anchored_start
&& ro
.nfa
.is_anchored_end
{
1416 return Some(MatchType
::DfaAnchoredReverse
);
1418 #[cfg(feature = "perf-literal")]
1420 // If there's a longish suffix literal, then it might be faster
1421 // to look for that first.
1422 if ro
.should_suffix_scan() {
1423 return Some(MatchType
::DfaSuffix
);
1426 // Fall back to your garden variety forward searching lazy DFA.
1427 Some(MatchType
::Dfa
)
1433 /// Returns true if the program is amenable to suffix scanning.
1435 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1436 /// for suffix literals and then do a *reverse* DFA match from any matches
1437 /// produced by the literal scan. (And then followed by a forward DFA
1438 /// search, since the previously found suffix literal maybe not actually be
1439 /// the end of a match.)
1441 /// This is a bit of a specialized optimization, but can result in pretty
1442 /// big performance wins if 1) there are no prefix literals and 2) the
1443 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1444 /// account for but (2) is harder. As a proxy, we assume that longer
1445 /// strings are generally rarer, so we only enable this optimization when
1446 /// we have a meaty suffix.
1447 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1448 fn should_suffix_scan(&self) -> bool
{
1449 if self.suffixes
.is_empty() {
1452 let lcs_len
= self.suffixes
.lcs().char_len();
1453 lcs_len
>= 3 && lcs_len
> self.dfa
.prefixes
.lcp().char_len()
1456 fn new_pool(ro
: &Arc
<ExecReadOnly
>) -> Box
<Pool
<ProgramCache
>> {
1457 let ro
= ro
.clone();
1458 Box
::new(Pool
::new(Box
::new(move || {
1459 AssertUnwindSafe(RefCell
::new(ProgramCacheInner
::new(&ro
)))
1464 #[derive(Clone, Copy, Debug)]
1466 /// A single or multiple literal search. This is only used when the regex
1467 /// can be decomposed into a literal search.
1468 #[cfg(feature = "perf-literal")]
1469 Literal(MatchLiteralType
),
1470 /// A normal DFA search.
1471 #[cfg(feature = "perf-dfa")]
1473 /// A reverse DFA search starting from the end of a haystack.
1474 #[cfg(feature = "perf-dfa")]
1476 /// A reverse DFA search with suffix literal scanning.
1477 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1479 /// Use the DFA on two or more regular expressions.
1480 #[cfg(feature = "perf-dfa")]
1484 /// No match is ever possible, so don't ever try to search.
1488 #[derive(Clone, Copy, Debug)]
1489 #[cfg(feature = "perf-literal")]
1490 enum MatchLiteralType
{
1491 /// Match literals anywhere in text.
1493 /// Match literals only at the start of text.
1495 /// Match literals only at the end of text.
1497 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1502 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1504 /// Choose between Backtrack and PikeVM.
1506 /// NFA bounded backtracking.
1508 /// (This is only set by tests, since it never makes sense to always want
1513 /// (This is only set by tests, since it never makes sense to always want
1518 /// `ProgramCache` maintains reusable allocations for each matching engine
1519 /// available to a particular program.
1521 /// We declare this as unwind safe since it's a cache that's only used for
1522 /// performance purposes. If a panic occurs, it is (or should be) always safe
1523 /// to continue using the same regex object.
1524 pub type ProgramCache
= AssertUnwindSafe
<RefCell
<ProgramCacheInner
>>;
1527 pub struct ProgramCacheInner
{
1528 pub pikevm
: pikevm
::Cache
,
1529 pub backtrack
: backtrack
::Cache
,
1530 #[cfg(feature = "perf-dfa")]
1531 pub dfa
: dfa
::Cache
,
1532 #[cfg(feature = "perf-dfa")]
1533 pub dfa_reverse
: dfa
::Cache
,
1536 impl ProgramCacheInner
{
1537 fn new(ro
: &ExecReadOnly
) -> Self {
1539 pikevm
: pikevm
::Cache
::new(&ro
.nfa
),
1540 backtrack
: backtrack
::Cache
::new(&ro
.nfa
),
1541 #[cfg(feature = "perf-dfa")]
1542 dfa
: dfa
::Cache
::new(&ro
.dfa
),
1543 #[cfg(feature = "perf-dfa")]
1544 dfa_reverse
: dfa
::Cache
::new(&ro
.dfa_reverse
),
1549 /// Alternation literals checks if the given HIR is a simple alternation of
1550 /// literals, and if so, returns them. Otherwise, this returns None.
1551 #[cfg(feature = "perf-literal")]
1552 fn alternation_literals(expr
: &Hir
) -> Option
<Vec
<Vec
<u8>>> {
1553 use syntax
::hir
::{HirKind, Literal}
;
1555 // This is pretty hacky, but basically, if `is_alternation_literal` is
1556 // true, then we can make several assumptions about the structure of our
1557 // HIR. This is what justifies the `unreachable!` statements below.
1559 // This code should be refactored once we overhaul this crate's
1560 // optimization pipeline, because this is a terribly inflexible way to go
1563 if !expr
.is_alternation_literal() {
1566 let alts
= match *expr
.kind() {
1567 HirKind
::Alternation(ref alts
) => alts
,
1568 _
=> return None
, // one literal isn't worth it
1571 let extendlit
= |lit
: &Literal
, dst
: &mut Vec
<u8>| match *lit
{
1572 Literal
::Unicode(c
) => {
1573 let mut buf
= [0; 4];
1574 dst
.extend_from_slice(c
.encode_utf8(&mut buf
).as_bytes());
1576 Literal
::Byte(b
) => {
1581 let mut lits
= vec
![];
1583 let mut lit
= vec
![];
1585 HirKind
::Literal(ref x
) => extendlit(x
, &mut lit
),
1586 HirKind
::Concat(ref exprs
) => {
1589 HirKind
::Literal(ref x
) => extendlit(x
, &mut lit
),
1590 _
=> unreachable
!("expected literal, got {:?}", e
),
1594 _
=> unreachable
!("expected literal or concat, got {:?}", alt
),
1604 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1605 use internal
::ExecBuilder
;
1607 let backtrack_bytes_re
= ExecBuilder
::new("^S")
1608 .bounded_backtracking()
1611 .map(|exec
| exec
.into_byte_regex())
1612 .map_err(|err
| format
!("{}", err
))
1615 let default_bytes_re
= ExecBuilder
::new("^S")
1618 .map(|exec
| exec
.into_byte_regex())
1619 .map_err(|err
| format
!("{}", err
))
1622 let input
= vec
![83, 83];
1624 let s1
= backtrack_bytes_re
.split(&input
);
1625 let s2
= default_bytes_re
.split(&input
);
1626 for (chunk1
, chunk2
) in s1
.zip(s2
) {
1627 assert_eq
!(chunk1
, chunk2
);
1632 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1633 use internal
::ExecBuilder
;
1635 let backtrack_bytes_re
= ExecBuilder
::new(r
"^(?u:\*)")
1636 .bounded_backtracking()
1639 .map(|exec
| exec
.into_regex())
1640 .map_err(|err
| format
!("{}", err
))
1643 let default_bytes_re
= ExecBuilder
::new(r
"^(?u:\*)")
1646 .map(|exec
| exec
.into_regex())
1647 .map_err(|err
| format
!("{}", err
))
1652 let s1
= backtrack_bytes_re
.split(input
);
1653 let s2
= default_bytes_re
.split(input
);
1654 for (chunk1
, chunk2
) in s1
.zip(s2
) {
1655 assert_eq
!(chunk1
, chunk2
);