]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex/src/exec.rs
New upstream version 1.25.0+dfsg1
[rustc.git] / src / vendor / regex / src / exec.rs
1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::cell::RefCell;
12 use std::collections::HashMap;
13 use std::cmp;
14 use std::sync::Arc;
15
16 use thread_local::CachedThreadLocal;
17 use syntax::{Expr, ExprBuilder, Literals};
18
19 use backtrack;
20 use compile::Compiler;
21 use dfa;
22 use error::Error;
23 use input::{ByteInput, CharInput};
24 use literals::LiteralSearcher;
25 use pikevm;
26 use prog::Program;
27 use re_builder::RegexOptions;
28 use re_bytes;
29 use re_set;
30 use re_trait::{RegularExpression, Slot, Locations, as_slots};
31 use re_unicode;
32 use utf8::next_utf8;
33
34 /// `Exec` manages the execution of a regular expression.
35 ///
36 /// In particular, this manages the various compiled forms of a single regular
37 /// expression and the choice of which matching engine to use to execute a
38 /// regular expression.
39 pub struct Exec {
40 /// All read only state.
41 ro: Arc<ExecReadOnly>,
42 /// Caches for the various matching engines.
43 cache: CachedThreadLocal<ProgramCache>,
44 }
45
46 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
47 /// means it is no longer Sync, but we can now avoid the overhead of
48 /// synchronization to fetch the cache.
49 #[derive(Debug)]
50 pub struct ExecNoSync<'c> {
51 /// All read only state.
52 ro: &'c Arc<ExecReadOnly>,
53 /// Caches for the various matching engines.
54 cache: &'c ProgramCache,
55 }
56
57 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
58 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
59
60 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
61 /// state is determined at compile time and never changes during search.
62 #[derive(Debug)]
63 struct ExecReadOnly {
64 /// The original regular expressions given by the caller to compile.
65 res: Vec<String>,
66 /// A compiled program that is used in the NFA simulation and backtracking.
67 /// It can be byte-based or Unicode codepoint based.
68 ///
69 /// N.B. It is not possibly to make this byte-based from the public API.
70 /// It is only used for testing byte based programs in the NFA simulations.
71 nfa: Program,
72 /// A compiled byte based program for DFA execution. This is only used
73 /// if a DFA can be executed. (Currently, only word boundary assertions are
74 /// not supported.) Note that this program contains an embedded `.*?`
75 /// preceding the first capture group, unless the regex is anchored at the
76 /// beginning.
77 dfa: Program,
78 /// The same as above, except the program is reversed (and there is no
79 /// preceding `.*?`). This is used by the DFA to find the starting location
80 /// of matches.
81 dfa_reverse: Program,
82 /// A set of suffix literals extracted from the regex.
83 ///
84 /// Prefix literals are stored on the `Program`, since they are used inside
85 /// the matching engines.
86 suffixes: LiteralSearcher,
87 /// match_type encodes as much upfront knowledge about how we're going to
88 /// execute a search as possible.
89 match_type: MatchType,
90 }
91
92 /// Facilitates the construction of an executor by exposing various knobs
93 /// to control how a regex is executed and what kinds of resources it's
94 /// permitted to use.
95 pub struct ExecBuilder {
96 options: RegexOptions,
97 match_type: Option<MatchType>,
98 bytes: bool,
99 only_utf8: bool,
100 }
101
102 /// Parsed represents a set of parsed regular expressions and their detected
103 /// literals.
104 struct Parsed {
105 exprs: Vec<Expr>,
106 prefixes: Literals,
107 suffixes: Literals,
108 bytes: bool,
109 }
110
111 impl ExecBuilder {
112 /// Create a regex execution builder.
113 ///
114 /// This uses default settings for everything except the regex itself,
115 /// which must be provided. Further knobs can be set by calling methods,
116 /// and then finally, `build` to actually create the executor.
117 pub fn new(re: &str) -> Self {
118 Self::new_many(&[re])
119 }
120
121 /// Like new, but compiles the union of the given regular expressions.
122 ///
123 /// Note that when compiling 2 or more regular expressions, capture groups
124 /// are completely unsupported. (This means both `find` and `captures`
125 /// wont work.)
126 pub fn new_many<I, S>(res: I) -> Self
127 where S: AsRef<str>, I: IntoIterator<Item=S> {
128 let mut opts = RegexOptions::default();
129 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
130 Self::new_options(opts)
131 }
132
133 /// Create a regex execution builder.
134 pub fn new_options(opts: RegexOptions) -> Self {
135 ExecBuilder {
136 options: opts,
137 match_type: None,
138 bytes: false,
139 only_utf8: true,
140 }
141 }
142
143 /// Set the matching engine to be automatically determined.
144 ///
145 /// This is the default state and will apply whatever optimizations are
146 /// possible, such as running a DFA.
147 ///
148 /// This overrides whatever was previously set via the `nfa` or
149 /// `bounded_backtracking` methods.
150 pub fn automatic(mut self) -> Self {
151 self.match_type = None;
152 self
153 }
154
155 /// Sets the matching engine to use the NFA algorithm no matter what
156 /// optimizations are possible.
157 ///
158 /// This overrides whatever was previously set via the `automatic` or
159 /// `bounded_backtracking` methods.
160 pub fn nfa(mut self) -> Self {
161 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
162 self
163 }
164
165 /// Sets the matching engine to use a bounded backtracking engine no
166 /// matter what optimizations are possible.
167 ///
168 /// One must use this with care, since the bounded backtracking engine
169 /// uses memory proportion to `len(regex) * len(text)`.
170 ///
171 /// This overrides whatever was previously set via the `automatic` or
172 /// `nfa` methods.
173 pub fn bounded_backtracking(mut self) -> Self {
174 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
175 self
176 }
177
178 /// Compiles byte based programs for use with the NFA matching engines.
179 ///
180 /// By default, the NFA engines match on Unicode scalar values. They can
181 /// be made to use byte based programs instead. In general, the byte based
182 /// programs are slower because of a less efficient encoding of character
183 /// classes.
184 ///
185 /// Note that this does not impact DFA matching engines, which always
186 /// execute on bytes.
187 pub fn bytes(mut self, yes: bool) -> Self {
188 self.bytes = yes;
189 self
190 }
191
192 /// When disabled, the program compiled may match arbitrary bytes.
193 ///
194 /// When enabled (the default), all compiled programs exclusively match
195 /// valid UTF-8 bytes.
196 pub fn only_utf8(mut self, yes: bool) -> Self {
197 self.only_utf8 = yes;
198 self
199 }
200
201 /// Set the Unicode flag.
202 pub fn unicode(mut self, yes: bool) -> Self {
203 self.options.unicode = yes;
204 self
205 }
206
207 /// Parse the current set of patterns into their AST and extract literals.
208 fn parse(&self) -> Result<Parsed, Error> {
209 let mut exprs = Vec::with_capacity(self.options.pats.len());
210 let mut prefixes = Some(Literals::empty());
211 let mut suffixes = Some(Literals::empty());
212 let mut bytes = false;
213 let is_set = self.options.pats.len() > 1;
214 // If we're compiling a regex set and that set has any anchored
215 // expressions, then disable all literal optimizations.
216 for pat in &self.options.pats {
217 let parser =
218 ExprBuilder::new()
219 .case_insensitive(self.options.case_insensitive)
220 .multi_line(self.options.multi_line)
221 .dot_matches_new_line(self.options.dot_matches_new_line)
222 .swap_greed(self.options.swap_greed)
223 .ignore_whitespace(self.options.ignore_whitespace)
224 .unicode(self.options.unicode)
225 .allow_bytes(!self.only_utf8);
226 let expr = try!(parser.parse(pat));
227 bytes = bytes || expr.has_bytes();
228
229 if !expr.is_anchored_start() && expr.has_anchored_start() {
230 // Partial anchors unfortunately make it hard to use prefixes,
231 // so disable them.
232 prefixes = None;
233 } else if is_set && expr.is_anchored_start() {
234 // Regex sets with anchors do not go well with literal
235 // optimizations.
236 prefixes = None;
237 }
238 prefixes = prefixes.and_then(|mut prefixes| {
239 if !prefixes.union_prefixes(&expr) {
240 None
241 } else {
242 Some(prefixes)
243 }
244 });
245
246 if !expr.is_anchored_end() && expr.has_anchored_end() {
247 // Partial anchors unfortunately make it hard to use suffixes,
248 // so disable them.
249 suffixes = None;
250 } else if is_set && expr.is_anchored_end() {
251 // Regex sets with anchors do not go well with literal
252 // optimizations.
253 prefixes = None;
254 }
255 suffixes = suffixes.and_then(|mut suffixes| {
256 if !suffixes.union_suffixes(&expr) {
257 None
258 } else {
259 Some(suffixes)
260 }
261 });
262 exprs.push(expr);
263 }
264 Ok(Parsed {
265 exprs: exprs,
266 prefixes: prefixes.unwrap_or_else(Literals::empty),
267 suffixes: suffixes.unwrap_or_else(Literals::empty),
268 bytes: bytes,
269 })
270 }
271
272 /// Build an executor that can run a regular expression.
273 pub fn build(self) -> Result<Exec, Error> {
274 // Special case when we have no patterns to compile.
275 // This can happen when compiling a regex set.
276 if self.options.pats.is_empty() {
277 let ro = Arc::new(ExecReadOnly {
278 res: vec![],
279 nfa: Program::new(),
280 dfa: Program::new(),
281 dfa_reverse: Program::new(),
282 suffixes: LiteralSearcher::empty(),
283 match_type: MatchType::Nothing,
284 });
285 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
286 }
287 let parsed = try!(self.parse());
288 let mut nfa = try!(
289 Compiler::new()
290 .size_limit(self.options.size_limit)
291 .bytes(self.bytes || parsed.bytes)
292 .only_utf8(self.only_utf8)
293 .compile(&parsed.exprs));
294 let mut dfa = try!(
295 Compiler::new()
296 .size_limit(self.options.size_limit)
297 .dfa(true)
298 .only_utf8(self.only_utf8)
299 .compile(&parsed.exprs));
300 let mut dfa_reverse = try!(
301 Compiler::new()
302 .size_limit(self.options.size_limit)
303 .dfa(true)
304 .only_utf8(self.only_utf8)
305 .reverse(true)
306 .compile(&parsed.exprs));
307
308 let prefixes = parsed.prefixes.unambiguous_prefixes();
309 let suffixes = parsed.suffixes.unambiguous_suffixes();
310 nfa.prefixes = LiteralSearcher::prefixes(prefixes);
311 dfa.prefixes = nfa.prefixes.clone();
312 dfa.dfa_size_limit = self.options.dfa_size_limit;
313 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
314
315 let mut ro = ExecReadOnly {
316 res: self.options.pats,
317 nfa: nfa,
318 dfa: dfa,
319 dfa_reverse: dfa_reverse,
320 suffixes: LiteralSearcher::suffixes(suffixes),
321 match_type: MatchType::Nothing,
322 };
323 ro.match_type = ro.choose_match_type(self.match_type);
324
325 let ro = Arc::new(ro);
326 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
327 }
328 }
329
330 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
331 type Text = str;
332
333 fn slots_len(&self) -> usize { self.0.slots_len() }
334
335 fn next_after_empty(&self, text: &str, i: usize) -> usize {
336 next_utf8(text.as_bytes(), i)
337 }
338
339 #[inline(always)] // reduces constant overhead
340 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
341 self.0.shortest_match_at(text.as_bytes(), start)
342 }
343
344 #[inline(always)] // reduces constant overhead
345 fn is_match_at(&self, text: &str, start: usize) -> bool {
346 self.0.is_match_at(text.as_bytes(), start)
347 }
348
349 #[inline(always)] // reduces constant overhead
350 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
351 self.0.find_at(text.as_bytes(), start)
352 }
353
354 #[inline(always)] // reduces constant overhead
355 fn read_captures_at(
356 &self,
357 locs: &mut Locations,
358 text: &str,
359 start: usize,
360 ) -> Option<(usize, usize)> {
361 self.0.read_captures_at(locs, text.as_bytes(), start)
362 }
363 }
364
365 impl<'c> RegularExpression for ExecNoSync<'c> {
366 type Text = [u8];
367
368 /// Returns the number of capture slots in the regular expression. (There
369 /// are two slots for every capture group, corresponding to possibly empty
370 /// start and end locations of the capture.)
371 fn slots_len(&self) -> usize {
372 self.ro.nfa.captures.len() * 2
373 }
374
375 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
376 i + 1
377 }
378
379 /// Returns the end of a match location, possibly occurring before the
380 /// end location of the correct leftmost-first match.
381 #[inline(always)] // reduces constant overhead
382 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
383 if !self.is_anchor_end_match(text) {
384 return None;
385 }
386 match self.ro.match_type {
387 MatchType::Literal(ty) => {
388 self.find_literals(ty, text, start).map(|(_, e)| e)
389 }
390 MatchType::Dfa | MatchType::DfaMany => {
391 match self.shortest_dfa(text, start) {
392 dfa::Result::Match(end) => Some(end),
393 dfa::Result::NoMatch(_) => None,
394 dfa::Result::Quit => self.shortest_nfa(text, start),
395 }
396 }
397 MatchType::DfaAnchoredReverse => {
398 match dfa::Fsm::reverse(
399 &self.ro.dfa_reverse,
400 self.cache,
401 true,
402 &text[start..],
403 text.len(),
404 ) {
405 dfa::Result::Match(_) => Some(text.len()),
406 dfa::Result::NoMatch(_) => None,
407 dfa::Result::Quit => self.shortest_nfa(text, start),
408 }
409 }
410 MatchType::DfaSuffix => {
411 match self.shortest_dfa_reverse_suffix(text, start) {
412 dfa::Result::Match(e) => Some(e),
413 dfa::Result::NoMatch(_) => None,
414 dfa::Result::Quit => self.shortest_nfa(text, start),
415 }
416 }
417 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
418 MatchType::Nothing => None,
419 }
420 }
421
422 /// Returns true if and only if the regex matches text.
423 ///
424 /// For single regular expressions, this is equivalent to calling
425 /// shortest_match(...).is_some().
426 #[inline(always)] // reduces constant overhead
427 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
428 if !self.is_anchor_end_match(text) {
429 return false;
430 }
431 // We need to do this dance because shortest_match relies on the NFA
432 // filling in captures[1], but a RegexSet has no captures. In other
433 // words, a RegexSet can't (currently) use shortest_match. ---AG
434 match self.ro.match_type {
435 MatchType::Literal(ty) => {
436 self.find_literals(ty, text, start).is_some()
437 }
438 MatchType::Dfa | MatchType::DfaMany => {
439 match self.shortest_dfa(text, start) {
440 dfa::Result::Match(_) => true,
441 dfa::Result::NoMatch(_) => false,
442 dfa::Result::Quit => self.match_nfa(text, start),
443 }
444 }
445 MatchType::DfaAnchoredReverse => {
446 match dfa::Fsm::reverse(
447 &self.ro.dfa_reverse,
448 self.cache,
449 true,
450 &text[start..],
451 text.len(),
452 ) {
453 dfa::Result::Match(_) => true,
454 dfa::Result::NoMatch(_) => false,
455 dfa::Result::Quit => self.match_nfa(text, start),
456 }
457 }
458 MatchType::DfaSuffix => {
459 match self.shortest_dfa_reverse_suffix(text, start) {
460 dfa::Result::Match(_) => true,
461 dfa::Result::NoMatch(_) => false,
462 dfa::Result::Quit => self.match_nfa(text, start),
463 }
464 }
465 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
466 MatchType::Nothing => false,
467 }
468 }
469
470 /// Finds the start and end location of the leftmost-first match, starting
471 /// at the given location.
472 #[inline(always)] // reduces constant overhead
473 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
474 if !self.is_anchor_end_match(text) {
475 return None;
476 }
477 match self.ro.match_type {
478 MatchType::Literal(ty) => {
479 self.find_literals(ty, text, start)
480 }
481 MatchType::Dfa => {
482 match self.find_dfa_forward(text, start) {
483 dfa::Result::Match((s, e)) => Some((s, e)),
484 dfa::Result::NoMatch(_) => None,
485 dfa::Result::Quit => {
486 self.find_nfa(MatchNfaType::Auto, text, start)
487 }
488 }
489 }
490 MatchType::DfaAnchoredReverse => {
491 match self.find_dfa_anchored_reverse(text, start) {
492 dfa::Result::Match((s, e)) => Some((s, e)),
493 dfa::Result::NoMatch(_) => None,
494 dfa::Result::Quit => {
495 self.find_nfa(MatchNfaType::Auto, text, start)
496 }
497 }
498 }
499 MatchType::DfaSuffix => {
500 match self.find_dfa_reverse_suffix(text, start) {
501 dfa::Result::Match((s, e)) => Some((s, e)),
502 dfa::Result::NoMatch(_) => None,
503 dfa::Result::Quit => {
504 self.find_nfa(MatchNfaType::Auto, text, start)
505 }
506 }
507 }
508 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
509 MatchType::Nothing => None,
510 MatchType::DfaMany => {
511 unreachable!("BUG: RegexSet cannot be used with find")
512 }
513 }
514 }
515
516 /// Finds the start and end location of the leftmost-first match and also
517 /// fills in all matching capture groups.
518 ///
519 /// The number of capture slots given should be equal to the total number
520 /// of capture slots in the compiled program.
521 ///
522 /// Note that the first two slots always correspond to the start and end
523 /// locations of the overall match.
524 fn read_captures_at(
525 &self,
526 locs: &mut Locations,
527 text: &[u8],
528 start: usize,
529 ) -> Option<(usize, usize)> {
530 let slots = as_slots(locs);
531 for slot in slots.iter_mut() {
532 *slot = None;
533 }
534 // If the caller unnecessarily uses this, then we try to save them
535 // from themselves.
536 match slots.len() {
537 0 => return self.find_at(text, start),
538 2 => {
539 return self.find_at(text, start).map(|(s, e)| {
540 slots[0] = Some(s);
541 slots[1] = Some(e);
542 (s, e)
543 });
544 }
545 _ => {} // fallthrough
546 }
547 if !self.is_anchor_end_match(text) {
548 return None;
549 }
550 match self.ro.match_type {
551 MatchType::Literal(ty) => {
552 self.find_literals(ty, text, start).and_then(|(s, e)| {
553 self.captures_nfa_with_match(slots, text, s, e)
554 })
555 }
556 MatchType::Dfa => {
557 if self.ro.nfa.is_anchored_start {
558 self.captures_nfa(slots, text, start)
559 } else {
560 match self.find_dfa_forward(text, start) {
561 dfa::Result::Match((s, e)) => {
562 self.captures_nfa_with_match(slots, text, s, e)
563 }
564 dfa::Result::NoMatch(_) => None,
565 dfa::Result::Quit => self.captures_nfa(slots, text, start),
566 }
567 }
568 }
569 MatchType::DfaAnchoredReverse => {
570 match self.find_dfa_anchored_reverse(text, start) {
571 dfa::Result::Match((s, e)) => {
572 self.captures_nfa_with_match(slots, text, s, e)
573 }
574 dfa::Result::NoMatch(_) => None,
575 dfa::Result::Quit => self.captures_nfa(slots, text, start),
576 }
577 }
578 MatchType::DfaSuffix => {
579 match self.find_dfa_reverse_suffix(text, start) {
580 dfa::Result::Match((s, e)) => {
581 self.captures_nfa_with_match(slots, text, s, e)
582 }
583 dfa::Result::NoMatch(_) => None,
584 dfa::Result::Quit => self.captures_nfa(slots, text, start),
585 }
586 }
587 MatchType::Nfa(ty) => {
588 self.captures_nfa_type(ty, slots, text, start)
589 }
590 MatchType::Nothing => None,
591 MatchType::DfaMany => {
592 unreachable!("BUG: RegexSet cannot be used with captures")
593 }
594 }
595 }
596 }
597
598 impl<'c> ExecNoSync<'c> {
599 /// Finds the leftmost-first match using only literal search.
600 #[inline(always)] // reduces constant overhead
601 fn find_literals(
602 &self,
603 ty: MatchLiteralType,
604 text: &[u8],
605 start: usize,
606 ) -> Option<(usize, usize)> {
607 use self::MatchLiteralType::*;
608 match ty {
609 Unanchored => {
610 let lits = &self.ro.nfa.prefixes;
611 lits.find(&text[start..])
612 .map(|(s, e)| (start + s, start + e))
613 }
614 AnchoredStart => {
615 let lits = &self.ro.nfa.prefixes;
616 lits.find_start(&text[start..])
617 .map(|(s, e)| (start + s, start + e))
618 }
619 AnchoredEnd => {
620 let lits = &self.ro.suffixes;
621 lits.find_end(&text[start..])
622 .map(|(s, e)| (start + s, start + e))
623 }
624 }
625 }
626
627 /// Finds the leftmost-first match (start and end) using only the DFA.
628 ///
629 /// If the result returned indicates that the DFA quit, then another
630 /// matching engine should be used.
631 #[inline(always)] // reduces constant overhead
632 fn find_dfa_forward(
633 &self,
634 text: &[u8],
635 start: usize,
636 ) -> dfa::Result<(usize, usize)> {
637 use dfa::Result::*;
638 let end = match dfa::Fsm::forward(
639 &self.ro.dfa,
640 self.cache,
641 false,
642 text,
643 start,
644 ) {
645 NoMatch(i) => return NoMatch(i),
646 Quit => return Quit,
647 Match(end) if start == end => return Match((start, start)),
648 Match(end) => end,
649 };
650 // Now run the DFA in reverse to find the start of the match.
651 match dfa::Fsm::reverse(
652 &self.ro.dfa_reverse,
653 self.cache,
654 false,
655 &text[start..],
656 end - start,
657 ) {
658 Match(s) => Match((start + s, end)),
659 NoMatch(i) => NoMatch(i),
660 Quit => Quit,
661 }
662 }
663
664 /// Finds the leftmost-first match (start and end) using only the DFA,
665 /// but assumes the regex is anchored at the end and therefore starts at
666 /// the end of the regex and matches in reverse.
667 ///
668 /// If the result returned indicates that the DFA quit, then another
669 /// matching engine should be used.
670 #[inline(always)] // reduces constant overhead
671 fn find_dfa_anchored_reverse(
672 &self,
673 text: &[u8],
674 start: usize,
675 ) -> dfa::Result<(usize, usize)> {
676 use dfa::Result::*;
677 match dfa::Fsm::reverse(
678 &self.ro.dfa_reverse,
679 self.cache,
680 false,
681 &text[start..],
682 text.len() - start,
683 ) {
684 Match(s) => Match((start + s, text.len())),
685 NoMatch(i) => NoMatch(i),
686 Quit => Quit,
687 }
688 }
689
690 /// Finds the end of the shortest match using only the DFA.
691 #[inline(always)] // reduces constant overhead
692 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
693 dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
694 }
695
696 /// Finds the end of the shortest match using only the DFA by scanning for
697 /// suffix literals.
698 ///
699 #[inline(always)] // reduces constant overhead
700 fn shortest_dfa_reverse_suffix(
701 &self,
702 text: &[u8],
703 start: usize,
704 ) -> dfa::Result<usize> {
705 match self.exec_dfa_reverse_suffix(text, start) {
706 None => self.shortest_dfa(text, start),
707 Some(r) => r.map(|(_, end)| end),
708 }
709 }
710
711 /// Finds the end of the shortest match using only the DFA by scanning for
712 /// suffix literals. It also reports the start of the match.
713 ///
714 /// Note that if None is returned, then the optimization gave up to avoid
715 /// worst case quadratic behavior. A forward scanning DFA should be tried
716 /// next.
717 ///
718 /// If a match is returned and the full leftmost-first match is desired,
719 /// then a forward scan starting from the beginning of the match must be
720 /// done.
721 ///
722 /// If the result returned indicates that the DFA quit, then another
723 /// matching engine should be used.
724 #[inline(always)] // reduces constant overhead
725 fn exec_dfa_reverse_suffix(
726 &self,
727 text: &[u8],
728 original_start: usize,
729 ) -> Option<dfa::Result<(usize, usize)>> {
730 use dfa::Result::*;
731
732 let lcs = self.ro.suffixes.lcs();
733 debug_assert!(lcs.len() >= 1);
734 let mut start = original_start;
735 let mut end = start;
736 while end <= text.len() {
737 start = end;
738 end += match lcs.find(&text[end..]) {
739 None => return Some(NoMatch(text.len())),
740 Some(start) => start + lcs.len(),
741 };
742 match dfa::Fsm::reverse(
743 &self.ro.dfa_reverse,
744 self.cache,
745 false,
746 &text[start..end],
747 end - start,
748 ) {
749 Match(0) | NoMatch(0) => return None,
750 Match(s) => return Some(Match((s + start, end))),
751 NoMatch(_) => continue,
752 Quit => return Some(Quit),
753 };
754 }
755 Some(NoMatch(text.len()))
756 }
757
758 /// Finds the leftmost-first match (start and end) using only the DFA
759 /// by scanning for suffix literals.
760 ///
761 /// If the result returned indicates that the DFA quit, then another
762 /// matching engine should be used.
763 #[inline(always)] // reduces constant overhead
764 fn find_dfa_reverse_suffix(
765 &self,
766 text: &[u8],
767 start: usize,
768 ) -> dfa::Result<(usize, usize)> {
769 use dfa::Result::*;
770
771 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
772 None => return self.find_dfa_forward(text, start),
773 Some(Match((start, _))) => start,
774 Some(r) => return r,
775 };
776 // At this point, we've found a match. The only way to quit now
777 // without a match is if the DFA gives up (seems unlikely).
778 //
779 // Now run the DFA forwards to find the proper end of the match.
780 // (The suffix literal match can only indicate the earliest
781 // possible end location, which may appear before the end of the
782 // leftmost-first match.)
783 match dfa::Fsm::forward(
784 &self.ro.dfa,
785 self.cache,
786 false,
787 text,
788 match_start,
789 ) {
790 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
791 Quit => Quit,
792 Match(e) => Match((match_start, e)),
793 }
794 }
795
796 /// Executes the NFA engine to return whether there is a match or not.
797 ///
798 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
799 /// performance characteristics, but regex sets don't have captures, which
800 /// shortest_nfa depends on.
801 fn match_nfa(
802 &self,
803 text: &[u8],
804 start: usize,
805 ) -> bool {
806 self.match_nfa_type(MatchNfaType::Auto, text, start)
807 }
808
809 /// Like match_nfa, but allows specification of the type of NFA engine.
810 fn match_nfa_type(
811 &self,
812 ty: MatchNfaType,
813 text: &[u8],
814 start: usize,
815 ) -> bool {
816 self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
817 }
818
819 /// Finds the shortest match using an NFA.
820 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
821 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
822 }
823
824 /// Like shortest_nfa, but allows specification of the type of NFA engine.
825 fn shortest_nfa_type(
826 &self,
827 ty: MatchNfaType,
828 text: &[u8],
829 start: usize,
830 ) -> Option<usize> {
831 let mut slots = [None, None];
832 if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
833 slots[1]
834 } else {
835 None
836 }
837 }
838
839 /// Like find, but executes an NFA engine.
840 fn find_nfa(
841 &self,
842 ty: MatchNfaType,
843 text: &[u8],
844 start: usize,
845 ) -> Option<(usize, usize)> {
846 let mut slots = [None, None];
847 if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
848 match (slots[0], slots[1]) {
849 (Some(s), Some(e)) => Some((s, e)),
850 _ => None,
851 }
852 } else {
853 None
854 }
855 }
856
857 /// Like find_nfa, but fills in captures and restricts the search space
858 /// using previously found match information.
859 ///
860 /// `slots` should have length equal to `2 * nfa.captures.len()`.
861 fn captures_nfa_with_match(
862 &self,
863 slots: &mut [Slot],
864 text: &[u8],
865 match_start: usize,
866 match_end: usize,
867 ) -> Option<(usize, usize)> {
868 // We can't use match_end directly, because we may need to examine one
869 // "character" after the end of a match for lookahead operators. We
870 // need to move two characters beyond the end, since some look-around
871 // operations may falsely assume a premature end of text otherwise.
872 let e = cmp::min(
873 next_utf8(text, next_utf8(text, match_end)), text.len());
874 self.captures_nfa(slots, &text[..e], match_start)
875 }
876
877 /// Like find_nfa, but fills in captures.
878 ///
879 /// `slots` should have length equal to `2 * nfa.captures.len()`.
880 fn captures_nfa(
881 &self,
882 slots: &mut [Slot],
883 text: &[u8],
884 start: usize,
885 ) -> Option<(usize, usize)> {
886 self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
887 }
888
889 /// Like captures_nfa, but allows specification of type of NFA engine.
890 fn captures_nfa_type(
891 &self,
892 ty: MatchNfaType,
893 slots: &mut [Slot],
894 text: &[u8],
895 start: usize,
896 ) -> Option<(usize, usize)> {
897 if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
898 match (slots[0], slots[1]) {
899 (Some(s), Some(e)) => Some((s, e)),
900 _ => None,
901 }
902 } else {
903 None
904 }
905 }
906
907 fn exec_nfa(
908 &self,
909 mut ty: MatchNfaType,
910 matches: &mut [bool],
911 slots: &mut [Slot],
912 quit_after_match: bool,
913 text: &[u8],
914 start: usize,
915 ) -> bool {
916 use self::MatchNfaType::*;
917 if let Auto = ty {
918 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
919 ty = Backtrack;
920 } else {
921 ty = PikeVM;
922 }
923 }
924 match ty {
925 Auto => unreachable!(),
926 Backtrack => self.exec_backtrack(matches, slots, text, start),
927 PikeVM => {
928 self.exec_pikevm(
929 matches, slots, quit_after_match, text, start)
930 }
931 }
932 }
933
934 /// Always run the NFA algorithm.
935 fn exec_pikevm(
936 &self,
937 matches: &mut [bool],
938 slots: &mut [Slot],
939 quit_after_match: bool,
940 text: &[u8],
941 start: usize,
942 ) -> bool {
943 if self.ro.nfa.uses_bytes() {
944 pikevm::Fsm::exec(
945 &self.ro.nfa,
946 self.cache,
947 matches,
948 slots,
949 quit_after_match,
950 ByteInput::new(text, self.ro.nfa.only_utf8),
951 start)
952 } else {
953 pikevm::Fsm::exec(
954 &self.ro.nfa,
955 self.cache,
956 matches,
957 slots,
958 quit_after_match,
959 CharInput::new(text),
960 start)
961 }
962 }
963
964 /// Always runs the NFA using bounded backtracking.
965 fn exec_backtrack(
966 &self,
967 matches: &mut [bool],
968 slots: &mut [Slot],
969 text: &[u8],
970 start: usize,
971 ) -> bool {
972 if self.ro.nfa.uses_bytes() {
973 backtrack::Bounded::exec(
974 &self.ro.nfa,
975 self.cache,
976 matches,
977 slots,
978 ByteInput::new(text, self.ro.nfa.only_utf8),
979 start)
980 } else {
981 backtrack::Bounded::exec(
982 &self.ro.nfa,
983 self.cache,
984 matches,
985 slots,
986 CharInput::new(text),
987 start)
988 }
989 }
990
991 /// Finds which regular expressions match the given text.
992 ///
993 /// `matches` should have length equal to the number of regexes being
994 /// searched.
995 ///
996 /// This is only useful when one wants to know which regexes in a set
997 /// match some text.
998 pub fn many_matches_at(
999 &self,
1000 matches: &mut [bool],
1001 text: &[u8],
1002 start: usize,
1003 ) -> bool {
1004 use self::MatchType::*;
1005 if !self.is_anchor_end_match(text) {
1006 return false;
1007 }
1008 match self.ro.match_type {
1009 Literal(ty) => {
1010 debug_assert_eq!(matches.len(), 1);
1011 matches[0] = self.find_literals(ty, text, start).is_some();
1012 matches[0]
1013 }
1014 Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1015 match dfa::Fsm::forward_many(
1016 &self.ro.dfa,
1017 self.cache,
1018 matches,
1019 text,
1020 start,
1021 ) {
1022 dfa::Result::Match(_) => true,
1023 dfa::Result::NoMatch(_) => false,
1024 dfa::Result::Quit => {
1025 self.exec_nfa(
1026 MatchNfaType::Auto,
1027 matches,
1028 &mut [],
1029 false,
1030 text,
1031 start)
1032 }
1033 }
1034 }
1035 Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1036 Nothing => false,
1037 }
1038 }
1039
1040 #[inline(always)] // reduces constant overhead
1041 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1042 // Only do this check if the haystack is big (>1MB).
1043 if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1044 let lcs = self.ro.suffixes.lcs();
1045 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1046 return false;
1047 }
1048 }
1049 true
1050 }
1051
1052 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1053 &self.ro.nfa.capture_name_idx
1054 }
1055 }
1056
1057 impl<'c> ExecNoSyncStr<'c> {
1058 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1059 self.0.capture_name_idx()
1060 }
1061 }
1062
1063 impl Exec {
1064 /// Get a searcher that isn't Sync.
1065 #[inline(always)] // reduces constant overhead
1066 pub fn searcher(&self) -> ExecNoSync {
1067 let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1068 ExecNoSync {
1069 ro: &self.ro, // a clone is too expensive here! (and not needed)
1070 cache: self.cache.get_or(create),
1071 }
1072 }
1073
1074 /// Get a searcher that isn't Sync and can match on &str.
1075 #[inline(always)] // reduces constant overhead
1076 pub fn searcher_str(&self) -> ExecNoSyncStr {
1077 ExecNoSyncStr(self.searcher())
1078 }
1079
1080 /// Build a Regex from this executor.
1081 pub fn into_regex(self) -> re_unicode::Regex {
1082 re_unicode::Regex::from(self)
1083 }
1084
1085 /// Build a RegexSet from this executor.
1086 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1087 re_set::unicode::RegexSet::from(self)
1088 }
1089
1090 /// Build a Regex from this executor that can match arbitrary bytes.
1091 pub fn into_byte_regex(self) -> re_bytes::Regex {
1092 re_bytes::Regex::from(self)
1093 }
1094
1095 /// Build a RegexSet from this executor that can match arbitrary bytes.
1096 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1097 re_set::bytes::RegexSet::from(self)
1098 }
1099
1100 /// The original regular expressions given by the caller that were
1101 /// compiled.
1102 pub fn regex_strings(&self) -> &[String] {
1103 &self.ro.res
1104 }
1105
1106 /// Return a slice of capture names.
1107 ///
1108 /// Any capture that isn't named is None.
1109 pub fn capture_names(&self) -> &[Option<String>] {
1110 &self.ro.nfa.captures
1111 }
1112
1113 /// Return a reference to named groups mapping (from group name to
1114 /// group position).
1115 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1116 &self.ro.nfa.capture_name_idx
1117 }
1118 }
1119
1120 impl Clone for Exec {
1121 fn clone(&self) -> Exec {
1122 Exec {
1123 ro: self.ro.clone(),
1124 cache: CachedThreadLocal::new(),
1125 }
1126 }
1127 }
1128
1129 impl ExecReadOnly {
1130 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1131 use self::MatchType::*;
1132 if let Some(Nfa(_)) = hint {
1133 return hint.unwrap();
1134 }
1135 // If the NFA is empty, then we'll never match anything.
1136 if self.nfa.insts.is_empty() {
1137 return Nothing;
1138 }
1139 // If our set of prefixes is complete, then we can use it to find
1140 // a match in lieu of a regex engine. This doesn't quite work well in
1141 // the presence of multiple regexes, so only do it when there's one.
1142 //
1143 // TODO(burntsushi): Also, don't try to match literals if the regex is
1144 // partially anchored. We could technically do it, but we'd need to
1145 // create two sets of literals: all of them and then the subset that
1146 // aren't anchored. We would then only search for all of them when at
1147 // the beginning of the input and use the subset in all other cases.
1148 if self.res.len() == 1 {
1149 if self.nfa.prefixes.complete() {
1150 return if self.nfa.is_anchored_start {
1151 Literal(MatchLiteralType::AnchoredStart)
1152 } else {
1153 Literal(MatchLiteralType::Unanchored)
1154 };
1155 }
1156 if self.suffixes.complete() {
1157 return if self.nfa.is_anchored_end {
1158 Literal(MatchLiteralType::AnchoredEnd)
1159 } else {
1160 // This case shouldn't happen. When the regex isn't
1161 // anchored, then complete prefixes should imply complete
1162 // suffixes.
1163 Literal(MatchLiteralType::Unanchored)
1164 };
1165 }
1166 }
1167 // If we can execute the DFA, then we totally should.
1168 if dfa::can_exec(&self.dfa) {
1169 // Regex sets require a slightly specialized path.
1170 if self.res.len() >= 2 {
1171 return DfaMany;
1172 }
1173 // If the regex is anchored at the end but not the start, then
1174 // just match in reverse from the end of the haystack.
1175 if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1176 return DfaAnchoredReverse;
1177 }
1178 // If there's a longish suffix literal, then it might be faster
1179 // to look for that first.
1180 if self.should_suffix_scan() {
1181 return DfaSuffix;
1182 }
1183 // Fall back to your garden variety forward searching lazy DFA.
1184 return Dfa;
1185 }
1186 // We're so totally hosed.
1187 Nfa(MatchNfaType::Auto)
1188 }
1189
1190 /// Returns true if the program is amenable to suffix scanning.
1191 ///
1192 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1193 /// for suffix literals and then do a *reverse* DFA match from any matches
1194 /// produced by the literal scan. (And then followed by a forward DFA
1195 /// search, since the previously found suffix literal maybe not actually be
1196 /// the end of a match.)
1197 ///
1198 /// This is a bit of a specialized optimization, but can result in pretty
1199 /// big performance wins if 1) there are no prefix literals and 2) the
1200 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1201 /// account for but (2) is harder. As a proxy, we assume that longer
1202 /// strings are generally rarer, so we only enable this optimization when
1203 /// we have a meaty suffix.
1204 fn should_suffix_scan(&self) -> bool {
1205 if self.suffixes.is_empty() {
1206 return false;
1207 }
1208 let lcs_len = self.suffixes.lcs().char_len();
1209 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1210 }
1211 }
1212
1213 #[derive(Clone, Copy, Debug)]
1214 enum MatchType {
1215 /// A single or multiple literal search. This is only used when the regex
1216 /// can be decomposed into unambiguous literal search.
1217 Literal(MatchLiteralType),
1218 /// A normal DFA search.
1219 Dfa,
1220 /// A reverse DFA search starting from the end of a haystack.
1221 DfaAnchoredReverse,
1222 /// A reverse DFA search with suffix literal scanning.
1223 DfaSuffix,
1224 /// Use the DFA on two or more regular expressions.
1225 DfaMany,
1226 /// An NFA variant.
1227 Nfa(MatchNfaType),
1228 /// No match is ever possible, so don't ever try to search.
1229 Nothing,
1230 }
1231
1232 #[derive(Clone, Copy, Debug)]
1233 enum MatchLiteralType {
1234 /// Match literals anywhere in text.
1235 Unanchored,
1236 /// Match literals only at the start of text.
1237 AnchoredStart,
1238 /// Match literals only at the end of text.
1239 AnchoredEnd,
1240 }
1241
1242 #[derive(Clone, Copy, Debug)]
1243 enum MatchNfaType {
1244 /// Choose between Backtrack and PikeVM.
1245 Auto,
1246 /// NFA bounded backtracking.
1247 ///
1248 /// (This is only set by tests, since it never makes sense to always want
1249 /// backtracking.)
1250 Backtrack,
1251 /// The Pike VM.
1252 ///
1253 /// (This is only set by tests, since it never makes sense to always want
1254 /// the Pike VM.)
1255 PikeVM,
1256 }
1257
1258 /// `ProgramCache` maintains reusable allocations for each matching engine
1259 /// available to a particular program.
1260 pub type ProgramCache = RefCell<ProgramCacheInner>;
1261
1262 #[derive(Clone, Debug)]
1263 pub struct ProgramCacheInner {
1264 pub pikevm: pikevm::Cache,
1265 pub backtrack: backtrack::Cache,
1266 pub dfa: dfa::Cache,
1267 pub dfa_reverse: dfa::Cache,
1268 }
1269
1270 impl ProgramCacheInner {
1271 fn new(ro: &ExecReadOnly) -> Self {
1272 ProgramCacheInner {
1273 pikevm: pikevm::Cache::new(&ro.nfa),
1274 backtrack: backtrack::Cache::new(&ro.nfa),
1275 dfa: dfa::Cache::new(&ro.dfa),
1276 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1277 }
1278 }
1279 }