1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 use std
::cell
::RefCell
;
12 use std
::collections
::HashMap
;
16 use thread_local
::CachedThreadLocal
;
17 use syntax
::{Expr, ExprBuilder, Literals}
;
20 use compile
::Compiler
;
23 use input
::{ByteInput, CharInput}
;
24 use literals
::LiteralSearcher
;
27 use re_builder
::RegexOptions
;
30 use re_trait
::{RegularExpression, Slot, Locations, as_slots}
;
34 /// `Exec` manages the execution of a regular expression.
36 /// In particular, this manages the various compiled forms of a single regular
37 /// expression and the choice of which matching engine to use to execute a
38 /// regular expression.
40 /// All read only state.
41 ro
: Arc
<ExecReadOnly
>,
42 /// Caches for the various matching engines.
43 cache
: CachedThreadLocal
<ProgramCache
>,
46 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
47 /// means it is no longer Sync, but we can now avoid the overhead of
48 /// synchronization to fetch the cache.
50 pub struct ExecNoSync
<'c
> {
51 /// All read only state.
52 ro
: &'c Arc
<ExecReadOnly
>,
53 /// Caches for the various matching engines.
54 cache
: &'c ProgramCache
,
57 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
58 pub struct ExecNoSyncStr
<'c
>(ExecNoSync
<'c
>);
60 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
61 /// state is determined at compile time and never changes during search.
64 /// The original regular expressions given by the caller to compile.
66 /// A compiled program that is used in the NFA simulation and backtracking.
67 /// It can be byte-based or Unicode codepoint based.
69 /// N.B. It is not possibly to make this byte-based from the public API.
70 /// It is only used for testing byte based programs in the NFA simulations.
72 /// A compiled byte based program for DFA execution. This is only used
73 /// if a DFA can be executed. (Currently, only word boundary assertions are
74 /// not supported.) Note that this program contains an embedded `.*?`
75 /// preceding the first capture group, unless the regex is anchored at the
78 /// The same as above, except the program is reversed (and there is no
79 /// preceding `.*?`). This is used by the DFA to find the starting location
82 /// A set of suffix literals extracted from the regex.
84 /// Prefix literals are stored on the `Program`, since they are used inside
85 /// the matching engines.
86 suffixes
: LiteralSearcher
,
87 /// match_type encodes as much upfront knowledge about how we're going to
88 /// execute a search as possible.
89 match_type
: MatchType
,
92 /// Facilitates the construction of an executor by exposing various knobs
93 /// to control how a regex is executed and what kinds of resources it's
95 pub struct ExecBuilder
{
96 options
: RegexOptions
,
97 match_type
: Option
<MatchType
>,
102 /// Parsed represents a set of parsed regular expressions and their detected
112 /// Create a regex execution builder.
114 /// This uses default settings for everything except the regex itself,
115 /// which must be provided. Further knobs can be set by calling methods,
116 /// and then finally, `build` to actually create the executor.
117 pub fn new(re
: &str) -> Self {
118 Self::new_many(&[re
])
121 /// Like new, but compiles the union of the given regular expressions.
123 /// Note that when compiling 2 or more regular expressions, capture groups
124 /// are completely unsupported. (This means both `find` and `captures`
126 pub fn new_many
<I
, S
>(res
: I
) -> Self
127 where S
: AsRef
<str>, I
: IntoIterator
<Item
=S
> {
128 let mut opts
= RegexOptions
::default();
129 opts
.pats
= res
.into_iter().map(|s
| s
.as_ref().to_owned()).collect();
130 Self::new_options(opts
)
133 /// Create a regex execution builder.
134 pub fn new_options(opts
: RegexOptions
) -> Self {
143 /// Set the matching engine to be automatically determined.
145 /// This is the default state and will apply whatever optimizations are
146 /// possible, such as running a DFA.
148 /// This overrides whatever was previously set via the `nfa` or
149 /// `bounded_backtracking` methods.
150 pub fn automatic(mut self) -> Self {
151 self.match_type
= None
;
155 /// Sets the matching engine to use the NFA algorithm no matter what
156 /// optimizations are possible.
158 /// This overrides whatever was previously set via the `automatic` or
159 /// `bounded_backtracking` methods.
160 pub fn nfa(mut self) -> Self {
161 self.match_type
= Some(MatchType
::Nfa(MatchNfaType
::PikeVM
));
165 /// Sets the matching engine to use a bounded backtracking engine no
166 /// matter what optimizations are possible.
168 /// One must use this with care, since the bounded backtracking engine
169 /// uses memory proportion to `len(regex) * len(text)`.
171 /// This overrides whatever was previously set via the `automatic` or
173 pub fn bounded_backtracking(mut self) -> Self {
174 self.match_type
= Some(MatchType
::Nfa(MatchNfaType
::Backtrack
));
178 /// Compiles byte based programs for use with the NFA matching engines.
180 /// By default, the NFA engines match on Unicode scalar values. They can
181 /// be made to use byte based programs instead. In general, the byte based
182 /// programs are slower because of a less efficient encoding of character
185 /// Note that this does not impact DFA matching engines, which always
186 /// execute on bytes.
187 pub fn bytes(mut self, yes
: bool
) -> Self {
192 /// When disabled, the program compiled may match arbitrary bytes.
194 /// When enabled (the default), all compiled programs exclusively match
195 /// valid UTF-8 bytes.
196 pub fn only_utf8(mut self, yes
: bool
) -> Self {
197 self.only_utf8
= yes
;
201 /// Set the Unicode flag.
202 pub fn unicode(mut self, yes
: bool
) -> Self {
203 self.options
.unicode
= yes
;
207 /// Parse the current set of patterns into their AST and extract literals.
208 fn parse(&self) -> Result
<Parsed
, Error
> {
209 let mut exprs
= Vec
::with_capacity(self.options
.pats
.len());
210 let mut prefixes
= Some(Literals
::empty());
211 let mut suffixes
= Some(Literals
::empty());
212 let mut bytes
= false;
213 let is_set
= self.options
.pats
.len() > 1;
214 // If we're compiling a regex set and that set has any anchored
215 // expressions, then disable all literal optimizations.
216 for pat
in &self.options
.pats
{
219 .case_insensitive(self.options
.case_insensitive
)
220 .multi_line(self.options
.multi_line
)
221 .dot_matches_new_line(self.options
.dot_matches_new_line
)
222 .swap_greed(self.options
.swap_greed
)
223 .ignore_whitespace(self.options
.ignore_whitespace
)
224 .unicode(self.options
.unicode
)
225 .allow_bytes(!self.only_utf8
);
226 let expr
= try
!(parser
.parse(pat
));
227 bytes
= bytes
|| expr
.has_bytes();
229 if !expr
.is_anchored_start() && expr
.has_anchored_start() {
230 // Partial anchors unfortunately make it hard to use prefixes,
233 } else if is_set
&& expr
.is_anchored_start() {
234 // Regex sets with anchors do not go well with literal
238 prefixes
= prefixes
.and_then(|mut prefixes
| {
239 if !prefixes
.union_prefixes(&expr
) {
246 if !expr
.is_anchored_end() && expr
.has_anchored_end() {
247 // Partial anchors unfortunately make it hard to use suffixes,
250 } else if is_set
&& expr
.is_anchored_end() {
251 // Regex sets with anchors do not go well with literal
255 suffixes
= suffixes
.and_then(|mut suffixes
| {
256 if !suffixes
.union_suffixes(&expr
) {
266 prefixes
: prefixes
.unwrap_or_else(Literals
::empty
),
267 suffixes
: suffixes
.unwrap_or_else(Literals
::empty
),
272 /// Build an executor that can run a regular expression.
273 pub fn build(self) -> Result
<Exec
, Error
> {
274 // Special case when we have no patterns to compile.
275 // This can happen when compiling a regex set.
276 if self.options
.pats
.is_empty() {
277 let ro
= Arc
::new(ExecReadOnly
{
281 dfa_reverse
: Program
::new(),
282 suffixes
: LiteralSearcher
::empty(),
283 match_type
: MatchType
::Nothing
,
285 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }
);
287 let parsed
= try
!(self.parse());
290 .size_limit(self.options
.size_limit
)
291 .bytes(self.bytes
|| parsed
.bytes
)
292 .only_utf8(self.only_utf8
)
293 .compile(&parsed
.exprs
));
296 .size_limit(self.options
.size_limit
)
298 .only_utf8(self.only_utf8
)
299 .compile(&parsed
.exprs
));
300 let mut dfa_reverse
= try
!(
302 .size_limit(self.options
.size_limit
)
304 .only_utf8(self.only_utf8
)
306 .compile(&parsed
.exprs
));
308 let prefixes
= parsed
.prefixes
.unambiguous_prefixes();
309 let suffixes
= parsed
.suffixes
.unambiguous_suffixes();
310 nfa
.prefixes
= LiteralSearcher
::prefixes(prefixes
);
311 dfa
.prefixes
= nfa
.prefixes
.clone();
312 dfa
.dfa_size_limit
= self.options
.dfa_size_limit
;
313 dfa_reverse
.dfa_size_limit
= self.options
.dfa_size_limit
;
315 let mut ro
= ExecReadOnly
{
316 res
: self.options
.pats
,
319 dfa_reverse
: dfa_reverse
,
320 suffixes
: LiteralSearcher
::suffixes(suffixes
),
321 match_type
: MatchType
::Nothing
,
323 ro
.match_type
= ro
.choose_match_type(self.match_type
);
325 let ro
= Arc
::new(ro
);
326 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }
)
330 impl<'c
> RegularExpression
for ExecNoSyncStr
<'c
> {
333 fn slots_len(&self) -> usize { self.0.slots_len() }
335 fn next_after_empty(&self, text
: &str, i
: usize) -> usize {
336 next_utf8(text
.as_bytes(), i
)
339 #[inline(always)] // reduces constant overhead
340 fn shortest_match_at(&self, text
: &str, start
: usize) -> Option
<usize> {
341 self.0.shortest_match_at(text
.as_bytes(), start
)
344 #[inline(always)] // reduces constant overhead
345 fn is_match_at(&self, text
: &str, start
: usize) -> bool
{
346 self.0.is_match_at
(text
.as_bytes(), start
)
349 #[inline(always)] // reduces constant overhead
350 fn find_at(&self, text
: &str, start
: usize) -> Option
<(usize, usize)> {
351 self.0.find_at
(text
.as_bytes(), start
)
354 #[inline(always)] // reduces constant overhead
357 locs
: &mut Locations
,
360 ) -> Option
<(usize, usize)> {
361 self.0.read_captures_at(locs
, text
.as_bytes(), start
)
365 impl<'c
> RegularExpression
for ExecNoSync
<'c
> {
368 /// Returns the number of capture slots in the regular expression. (There
369 /// are two slots for every capture group, corresponding to possibly empty
370 /// start and end locations of the capture.)
371 fn slots_len(&self) -> usize {
372 self.ro
.nfa
.captures
.len() * 2
375 fn next_after_empty(&self, _text
: &[u8], i
: usize) -> usize {
379 /// Returns the end of a match location, possibly occurring before the
380 /// end location of the correct leftmost-first match.
381 #[inline(always)] // reduces constant overhead
382 fn shortest_match_at(&self, text
: &[u8], start
: usize) -> Option
<usize> {
383 if !self.is_anchor_end_match(text
) {
386 match self.ro
.match_type
{
387 MatchType
::Literal(ty
) => {
388 self.find_literals(ty
, text
, start
).map(|(_
, e
)| e
)
390 MatchType
::Dfa
| MatchType
::DfaMany
=> {
391 match self.shortest_dfa(text
, start
) {
392 dfa
::Result
::Match(end
) => Some(end
),
393 dfa
::Result
::NoMatch(_
) => None
,
394 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
397 MatchType
::DfaAnchoredReverse
=> {
398 match dfa
::Fsm
::reverse(
399 &self.ro
.dfa_reverse
,
405 dfa
::Result
::Match(_
) => Some(text
.len()),
406 dfa
::Result
::NoMatch(_
) => None
,
407 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
410 MatchType
::DfaSuffix
=> {
411 match self.shortest_dfa_reverse_suffix(text
, start
) {
412 dfa
::Result
::Match(e
) => Some(e
),
413 dfa
::Result
::NoMatch(_
) => None
,
414 dfa
::Result
::Quit
=> self.shortest_nfa(text
, start
),
417 MatchType
::Nfa(ty
) => self.shortest_nfa_type(ty
, text
, start
),
418 MatchType
::Nothing
=> None
,
422 /// Returns true if and only if the regex matches text.
424 /// For single regular expressions, this is equivalent to calling
425 /// shortest_match(...).is_some().
426 #[inline(always)] // reduces constant overhead
427 fn is_match_at(&self, text
: &[u8], start
: usize) -> bool
{
428 if !self.is_anchor_end_match(text
) {
431 // We need to do this dance because shortest_match relies on the NFA
432 // filling in captures[1], but a RegexSet has no captures. In other
433 // words, a RegexSet can't (currently) use shortest_match. ---AG
434 match self.ro
.match_type
{
435 MatchType
::Literal(ty
) => {
436 self.find_literals(ty
, text
, start
).is_some()
438 MatchType
::Dfa
| MatchType
::DfaMany
=> {
439 match self.shortest_dfa(text
, start
) {
440 dfa
::Result
::Match(_
) => true,
441 dfa
::Result
::NoMatch(_
) => false,
442 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
445 MatchType
::DfaAnchoredReverse
=> {
446 match dfa
::Fsm
::reverse(
447 &self.ro
.dfa_reverse
,
453 dfa
::Result
::Match(_
) => true,
454 dfa
::Result
::NoMatch(_
) => false,
455 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
458 MatchType
::DfaSuffix
=> {
459 match self.shortest_dfa_reverse_suffix(text
, start
) {
460 dfa
::Result
::Match(_
) => true,
461 dfa
::Result
::NoMatch(_
) => false,
462 dfa
::Result
::Quit
=> self.match_nfa(text
, start
),
465 MatchType
::Nfa(ty
) => self.match_nfa_type(ty
, text
, start
),
466 MatchType
::Nothing
=> false,
470 /// Finds the start and end location of the leftmost-first match, starting
471 /// at the given location.
472 #[inline(always)] // reduces constant overhead
473 fn find_at(&self, text
: &[u8], start
: usize) -> Option
<(usize, usize)> {
474 if !self.is_anchor_end_match(text
) {
477 match self.ro
.match_type
{
478 MatchType
::Literal(ty
) => {
479 self.find_literals(ty
, text
, start
)
482 match self.find_dfa_forward(text
, start
) {
483 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
484 dfa
::Result
::NoMatch(_
) => None
,
485 dfa
::Result
::Quit
=> {
486 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
490 MatchType
::DfaAnchoredReverse
=> {
491 match self.find_dfa_anchored_reverse(text
, start
) {
492 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
493 dfa
::Result
::NoMatch(_
) => None
,
494 dfa
::Result
::Quit
=> {
495 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
499 MatchType
::DfaSuffix
=> {
500 match self.find_dfa_reverse_suffix(text
, start
) {
501 dfa
::Result
::Match((s
, e
)) => Some((s
, e
)),
502 dfa
::Result
::NoMatch(_
) => None
,
503 dfa
::Result
::Quit
=> {
504 self.find_nfa(MatchNfaType
::Auto
, text
, start
)
508 MatchType
::Nfa(ty
) => self.find_nfa(ty
, text
, start
),
509 MatchType
::Nothing
=> None
,
510 MatchType
::DfaMany
=> {
511 unreachable
!("BUG: RegexSet cannot be used with find")
516 /// Finds the start and end location of the leftmost-first match and also
517 /// fills in all matching capture groups.
519 /// The number of capture slots given should be equal to the total number
520 /// of capture slots in the compiled program.
522 /// Note that the first two slots always correspond to the start and end
523 /// locations of the overall match.
526 locs
: &mut Locations
,
529 ) -> Option
<(usize, usize)> {
530 let slots
= as_slots(locs
);
531 for slot
in slots
.iter_mut() {
534 // If the caller unnecessarily uses this, then we try to save them
537 0 => return self.find_at(text
, start
),
539 return self.find_at(text
, start
).map(|(s
, e
)| {
545 _
=> {}
// fallthrough
547 if !self.is_anchor_end_match(text
) {
550 match self.ro
.match_type
{
551 MatchType
::Literal(ty
) => {
552 self.find_literals(ty
, text
, start
).and_then(|(s
, e
)| {
553 self.captures_nfa_with_match(slots
, text
, s
, e
)
557 if self.ro
.nfa
.is_anchored_start
{
558 self.captures_nfa(slots
, text
, start
)
560 match self.find_dfa_forward(text
, start
) {
561 dfa
::Result
::Match((s
, e
)) => {
562 self.captures_nfa_with_match(slots
, text
, s
, e
)
564 dfa
::Result
::NoMatch(_
) => None
,
565 dfa
::Result
::Quit
=> self.captures_nfa(slots
, text
, start
),
569 MatchType
::DfaAnchoredReverse
=> {
570 match self.find_dfa_anchored_reverse(text
, start
) {
571 dfa
::Result
::Match((s
, e
)) => {
572 self.captures_nfa_with_match(slots
, text
, s
, e
)
574 dfa
::Result
::NoMatch(_
) => None
,
575 dfa
::Result
::Quit
=> self.captures_nfa(slots
, text
, start
),
578 MatchType
::DfaSuffix
=> {
579 match self.find_dfa_reverse_suffix(text
, start
) {
580 dfa
::Result
::Match((s
, e
)) => {
581 self.captures_nfa_with_match(slots
, text
, s
, e
)
583 dfa
::Result
::NoMatch(_
) => None
,
584 dfa
::Result
::Quit
=> self.captures_nfa(slots
, text
, start
),
587 MatchType
::Nfa(ty
) => {
588 self.captures_nfa_type(ty
, slots
, text
, start
)
590 MatchType
::Nothing
=> None
,
591 MatchType
::DfaMany
=> {
592 unreachable
!("BUG: RegexSet cannot be used with captures")
598 impl<'c
> ExecNoSync
<'c
> {
599 /// Finds the leftmost-first match using only literal search.
600 #[inline(always)] // reduces constant overhead
603 ty
: MatchLiteralType
,
606 ) -> Option
<(usize, usize)> {
607 use self::MatchLiteralType
::*;
610 let lits
= &self.ro
.nfa
.prefixes
;
611 lits
.find(&text
[start
..])
612 .map(|(s
, e
)| (start
+ s
, start
+ e
))
615 let lits
= &self.ro
.nfa
.prefixes
;
616 lits
.find_start(&text
[start
..])
617 .map(|(s
, e
)| (start
+ s
, start
+ e
))
620 let lits
= &self.ro
.suffixes
;
621 lits
.find_end(&text
[start
..])
622 .map(|(s
, e
)| (start
+ s
, start
+ e
))
627 /// Finds the leftmost-first match (start and end) using only the DFA.
629 /// If the result returned indicates that the DFA quit, then another
630 /// matching engine should be used.
631 #[inline(always)] // reduces constant overhead
636 ) -> dfa
::Result
<(usize, usize)> {
638 let end
= match dfa
::Fsm
::forward(
645 NoMatch(i
) => return NoMatch(i
),
647 Match(end
) if start
== end
=> return Match((start
, start
)),
650 // Now run the DFA in reverse to find the start of the match.
651 match dfa
::Fsm
::reverse(
652 &self.ro
.dfa_reverse
,
658 Match(s
) => Match((start
+ s
, end
)),
659 NoMatch(i
) => NoMatch(i
),
664 /// Finds the leftmost-first match (start and end) using only the DFA,
665 /// but assumes the regex is anchored at the end and therefore starts at
666 /// the end of the regex and matches in reverse.
668 /// If the result returned indicates that the DFA quit, then another
669 /// matching engine should be used.
670 #[inline(always)] // reduces constant overhead
671 fn find_dfa_anchored_reverse(
675 ) -> dfa
::Result
<(usize, usize)> {
677 match dfa
::Fsm
::reverse(
678 &self.ro
.dfa_reverse
,
684 Match(s
) => Match((start
+ s
, text
.len())),
685 NoMatch(i
) => NoMatch(i
),
690 /// Finds the end of the shortest match using only the DFA.
691 #[inline(always)] // reduces constant overhead
692 fn shortest_dfa(&self, text
: &[u8], start
: usize) -> dfa
::Result
<usize> {
693 dfa
::Fsm
::forward(&self.ro
.dfa
, self.cache
, true, text
, start
)
696 /// Finds the end of the shortest match using only the DFA by scanning for
699 #[inline(always)] // reduces constant overhead
700 fn shortest_dfa_reverse_suffix(
704 ) -> dfa
::Result
<usize> {
705 match self.exec_dfa_reverse_suffix(text
, start
) {
706 None
=> self.shortest_dfa(text
, start
),
707 Some(r
) => r
.map(|(_
, end
)| end
),
711 /// Finds the end of the shortest match using only the DFA by scanning for
712 /// suffix literals. It also reports the start of the match.
714 /// Note that if None is returned, then the optimization gave up to avoid
715 /// worst case quadratic behavior. A forward scanning DFA should be tried
718 /// If a match is returned and the full leftmost-first match is desired,
719 /// then a forward scan starting from the beginning of the match must be
722 /// If the result returned indicates that the DFA quit, then another
723 /// matching engine should be used.
724 #[inline(always)] // reduces constant overhead
725 fn exec_dfa_reverse_suffix(
728 original_start
: usize,
729 ) -> Option
<dfa
::Result
<(usize, usize)>> {
732 let lcs
= self.ro
.suffixes
.lcs();
733 debug_assert
!(lcs
.len() >= 1);
734 let mut start
= original_start
;
736 while end
<= text
.len() {
738 end
+= match lcs
.find(&text
[end
..]) {
739 None
=> return Some(NoMatch(text
.len())),
740 Some(start
) => start
+ lcs
.len(),
742 match dfa
::Fsm
::reverse(
743 &self.ro
.dfa_reverse
,
749 Match(0) | NoMatch(0) => return None
,
750 Match(s
) => return Some(Match((s
+ start
, end
))),
751 NoMatch(_
) => continue,
752 Quit
=> return Some(Quit
),
755 Some(NoMatch(text
.len()))
758 /// Finds the leftmost-first match (start and end) using only the DFA
759 /// by scanning for suffix literals.
761 /// If the result returned indicates that the DFA quit, then another
762 /// matching engine should be used.
763 #[inline(always)] // reduces constant overhead
764 fn find_dfa_reverse_suffix(
768 ) -> dfa
::Result
<(usize, usize)> {
771 let match_start
= match self.exec_dfa_reverse_suffix(text
, start
) {
772 None
=> return self.find_dfa_forward(text
, start
),
773 Some(Match((start
, _
))) => start
,
776 // At this point, we've found a match. The only way to quit now
777 // without a match is if the DFA gives up (seems unlikely).
779 // Now run the DFA forwards to find the proper end of the match.
780 // (The suffix literal match can only indicate the earliest
781 // possible end location, which may appear before the end of the
782 // leftmost-first match.)
783 match dfa
::Fsm
::forward(
790 NoMatch(_
) => panic
!("BUG: reverse match implies forward match"),
792 Match(e
) => Match((match_start
, e
)),
796 /// Executes the NFA engine to return whether there is a match or not.
798 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
799 /// performance characteristics, but regex sets don't have captures, which
800 /// shortest_nfa depends on.
806 self.match_nfa_type(MatchNfaType
::Auto
, text
, start
)
809 /// Like match_nfa, but allows specification of the type of NFA engine.
816 self.exec_nfa(ty
, &mut [false], &mut [], true, text
, start
)
819 /// Finds the shortest match using an NFA.
820 fn shortest_nfa(&self, text
: &[u8], start
: usize) -> Option
<usize> {
821 self.shortest_nfa_type(MatchNfaType
::Auto
, text
, start
)
824 /// Like shortest_nfa, but allows specification of the type of NFA engine.
825 fn shortest_nfa_type(
831 let mut slots
= [None
, None
];
832 if self.exec_nfa(ty
, &mut [false], &mut slots
, true, text
, start
) {
839 /// Like find, but executes an NFA engine.
845 ) -> Option
<(usize, usize)> {
846 let mut slots
= [None
, None
];
847 if self.exec_nfa(ty
, &mut [false], &mut slots
, false, text
, start
) {
848 match (slots
[0], slots
[1]) {
849 (Some(s
), Some(e
)) => Some((s
, e
)),
857 /// Like find_nfa, but fills in captures and restricts the search space
858 /// using previously found match information.
860 /// `slots` should have length equal to `2 * nfa.captures.len()`.
861 fn captures_nfa_with_match(
867 ) -> Option
<(usize, usize)> {
868 // We can't use match_end directly, because we may need to examine one
869 // "character" after the end of a match for lookahead operators. We
870 // need to move two characters beyond the end, since some look-around
871 // operations may falsely assume a premature end of text otherwise.
873 next_utf8(text
, next_utf8(text
, match_end
)), text
.len());
874 self.captures_nfa(slots
, &text
[..e
], match_start
)
877 /// Like find_nfa, but fills in captures.
879 /// `slots` should have length equal to `2 * nfa.captures.len()`.
885 ) -> Option
<(usize, usize)> {
886 self.captures_nfa_type(MatchNfaType
::Auto
, slots
, text
, start
)
889 /// Like captures_nfa, but allows specification of type of NFA engine.
890 fn captures_nfa_type(
896 ) -> Option
<(usize, usize)> {
897 if self.exec_nfa(ty
, &mut [false], slots
, false, text
, start
) {
898 match (slots
[0], slots
[1]) {
899 (Some(s
), Some(e
)) => Some((s
, e
)),
909 mut ty
: MatchNfaType
,
910 matches
: &mut [bool
],
912 quit_after_match
: bool
,
916 use self::MatchNfaType
::*;
918 if backtrack
::should_exec(self.ro
.nfa
.len(), text
.len()) {
925 Auto
=> unreachable
!(),
926 Backtrack
=> self.exec_backtrack(matches
, slots
, text
, start
),
929 matches
, slots
, quit_after_match
, text
, start
)
934 /// Always run the NFA algorithm.
937 matches
: &mut [bool
],
939 quit_after_match
: bool
,
943 if self.ro
.nfa
.uses_bytes() {
950 ByteInput
::new(text
, self.ro
.nfa
.only_utf8
),
959 CharInput
::new(text
),
964 /// Always runs the NFA using bounded backtracking.
967 matches
: &mut [bool
],
972 if self.ro
.nfa
.uses_bytes() {
973 backtrack
::Bounded
::exec(
978 ByteInput
::new(text
, self.ro
.nfa
.only_utf8
),
981 backtrack
::Bounded
::exec(
986 CharInput
::new(text
),
991 /// Finds which regular expressions match the given text.
993 /// `matches` should have length equal to the number of regexes being
996 /// This is only useful when one wants to know which regexes in a set
998 pub fn many_matches_at(
1000 matches
: &mut [bool
],
1004 use self::MatchType
::*;
1005 if !self.is_anchor_end_match(text
) {
1008 match self.ro
.match_type
{
1010 debug_assert_eq
!(matches
.len(), 1);
1011 matches
[0] = self.find_literals(ty
, text
, start
).is_some();
1014 Dfa
| DfaAnchoredReverse
| DfaSuffix
| DfaMany
=> {
1015 match dfa
::Fsm
::forward_many(
1022 dfa
::Result
::Match(_
) => true,
1023 dfa
::Result
::NoMatch(_
) => false,
1024 dfa
::Result
::Quit
=> {
1035 Nfa(ty
) => self.exec_nfa(ty
, matches
, &mut [], false, text
, start
),
1040 #[inline(always)] // reduces constant overhead
1041 fn is_anchor_end_match(&self, text
: &[u8]) -> bool
{
1042 // Only do this check if the haystack is big (>1MB).
1043 if text
.len() > (1<<20) && self.ro
.nfa
.is_anchored_end
{
1044 let lcs
= self.ro
.suffixes
.lcs();
1045 if lcs
.len() >= 1 && !lcs
.is_suffix(text
) {
1052 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1053 &self.ro
.nfa
.capture_name_idx
1057 impl<'c
> ExecNoSyncStr
<'c
> {
1058 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1059 self.0.capture_name_idx()
1064 /// Get a searcher that isn't Sync.
1065 #[inline(always)] // reduces constant overhead
1066 pub fn searcher(&self) -> ExecNoSync
{
1067 let create
= || Box
::new(RefCell
::new(ProgramCacheInner
::new(&self.ro
)));
1069 ro
: &self.ro
, // a clone is too expensive here! (and not needed)
1070 cache
: self.cache
.get_or(create
),
1074 /// Get a searcher that isn't Sync and can match on &str.
1075 #[inline(always)] // reduces constant overhead
1076 pub fn searcher_str(&self) -> ExecNoSyncStr
{
1077 ExecNoSyncStr(self.searcher())
1080 /// Build a Regex from this executor.
1081 pub fn into_regex(self) -> re_unicode
::Regex
{
1082 re_unicode
::Regex
::from(self)
1085 /// Build a RegexSet from this executor.
1086 pub fn into_regex_set(self) -> re_set
::unicode
::RegexSet
{
1087 re_set
::unicode
::RegexSet
::from(self)
1090 /// Build a Regex from this executor that can match arbitrary bytes.
1091 pub fn into_byte_regex(self) -> re_bytes
::Regex
{
1092 re_bytes
::Regex
::from(self)
1095 /// Build a RegexSet from this executor that can match arbitrary bytes.
1096 pub fn into_byte_regex_set(self) -> re_set
::bytes
::RegexSet
{
1097 re_set
::bytes
::RegexSet
::from(self)
1100 /// The original regular expressions given by the caller that were
1102 pub fn regex_strings(&self) -> &[String
] {
1106 /// Return a slice of capture names.
1108 /// Any capture that isn't named is None.
1109 pub fn capture_names(&self) -> &[Option
<String
>] {
1110 &self.ro
.nfa
.captures
1113 /// Return a reference to named groups mapping (from group name to
1114 /// group position).
1115 pub fn capture_name_idx(&self) -> &Arc
<HashMap
<String
, usize>> {
1116 &self.ro
.nfa
.capture_name_idx
1120 impl Clone
for Exec
{
1121 fn clone(&self) -> Exec
{
1123 ro
: self.ro
.clone(),
1124 cache
: CachedThreadLocal
::new(),
1130 fn choose_match_type(&self, hint
: Option
<MatchType
>) -> MatchType
{
1131 use self::MatchType
::*;
1132 if let Some(Nfa(_
)) = hint
{
1133 return hint
.unwrap();
1135 // If the NFA is empty, then we'll never match anything.
1136 if self.nfa
.insts
.is_empty() {
1139 // If our set of prefixes is complete, then we can use it to find
1140 // a match in lieu of a regex engine. This doesn't quite work well in
1141 // the presence of multiple regexes, so only do it when there's one.
1143 // TODO(burntsushi): Also, don't try to match literals if the regex is
1144 // partially anchored. We could technically do it, but we'd need to
1145 // create two sets of literals: all of them and then the subset that
1146 // aren't anchored. We would then only search for all of them when at
1147 // the beginning of the input and use the subset in all other cases.
1148 if self.res
.len() == 1 {
1149 if self.nfa
.prefixes
.complete() {
1150 return if self.nfa
.is_anchored_start
{
1151 Literal(MatchLiteralType
::AnchoredStart
)
1153 Literal(MatchLiteralType
::Unanchored
)
1156 if self.suffixes
.complete() {
1157 return if self.nfa
.is_anchored_end
{
1158 Literal(MatchLiteralType
::AnchoredEnd
)
1160 // This case shouldn't happen. When the regex isn't
1161 // anchored, then complete prefixes should imply complete
1163 Literal(MatchLiteralType
::Unanchored
)
1167 // If we can execute the DFA, then we totally should.
1168 if dfa
::can_exec(&self.dfa
) {
1169 // Regex sets require a slightly specialized path.
1170 if self.res
.len() >= 2 {
1173 // If the regex is anchored at the end but not the start, then
1174 // just match in reverse from the end of the haystack.
1175 if !self.nfa
.is_anchored_start
&& self.nfa
.is_anchored_end
{
1176 return DfaAnchoredReverse
;
1178 // If there's a longish suffix literal, then it might be faster
1179 // to look for that first.
1180 if self.should_suffix_scan() {
1183 // Fall back to your garden variety forward searching lazy DFA.
1186 // We're so totally hosed.
1187 Nfa(MatchNfaType
::Auto
)
1190 /// Returns true if the program is amenable to suffix scanning.
1192 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1193 /// for suffix literals and then do a *reverse* DFA match from any matches
1194 /// produced by the literal scan. (And then followed by a forward DFA
1195 /// search, since the previously found suffix literal maybe not actually be
1196 /// the end of a match.)
1198 /// This is a bit of a specialized optimization, but can result in pretty
1199 /// big performance wins if 1) there are no prefix literals and 2) the
1200 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1201 /// account for but (2) is harder. As a proxy, we assume that longer
1202 /// strings are generally rarer, so we only enable this optimization when
1203 /// we have a meaty suffix.
1204 fn should_suffix_scan(&self) -> bool
{
1205 if self.suffixes
.is_empty() {
1208 let lcs_len
= self.suffixes
.lcs().char_len();
1209 lcs_len
>= 3 && lcs_len
> self.dfa
.prefixes
.lcp().char_len()
1213 #[derive(Clone, Copy, Debug)]
1215 /// A single or multiple literal search. This is only used when the regex
1216 /// can be decomposed into unambiguous literal search.
1217 Literal(MatchLiteralType
),
1218 /// A normal DFA search.
1220 /// A reverse DFA search starting from the end of a haystack.
1222 /// A reverse DFA search with suffix literal scanning.
1224 /// Use the DFA on two or more regular expressions.
1228 /// No match is ever possible, so don't ever try to search.
1232 #[derive(Clone, Copy, Debug)]
1233 enum MatchLiteralType
{
1234 /// Match literals anywhere in text.
1236 /// Match literals only at the start of text.
1238 /// Match literals only at the end of text.
1242 #[derive(Clone, Copy, Debug)]
1244 /// Choose between Backtrack and PikeVM.
1246 /// NFA bounded backtracking.
1248 /// (This is only set by tests, since it never makes sense to always want
1253 /// (This is only set by tests, since it never makes sense to always want
1258 /// `ProgramCache` maintains reusable allocations for each matching engine
1259 /// available to a particular program.
1260 pub type ProgramCache
= RefCell
<ProgramCacheInner
>;
1262 #[derive(Clone, Debug)]
1263 pub struct ProgramCacheInner
{
1264 pub pikevm
: pikevm
::Cache
,
1265 pub backtrack
: backtrack
::Cache
,
1266 pub dfa
: dfa
::Cache
,
1267 pub dfa_reverse
: dfa
::Cache
,
1270 impl ProgramCacheInner
{
1271 fn new(ro
: &ExecReadOnly
) -> Self {
1273 pikevm
: pikevm
::Cache
::new(&ro
.nfa
),
1274 backtrack
: backtrack
::Cache
::new(&ro
.nfa
),
1275 dfa
: dfa
::Cache
::new(&ro
.dfa
),
1276 dfa_reverse
: dfa
::Cache
::new(&ro
.dfa_reverse
),