]> git.proxmox.com Git - rustc.git/blame - src/vendor/regex-0.2.11/src/exec.rs
New upstream version 1.31.0+dfsg1
[rustc.git] / src / vendor / regex-0.2.11 / src / exec.rs
CommitLineData
94b46f34
XL
1// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use std::cell::RefCell;
12use std::collections::HashMap;
13use std::cmp;
14use std::sync::Arc;
15
16use thread_local::CachedThreadLocal;
17use syntax::ParserBuilder;
18use syntax::hir::Hir;
19use syntax::hir::literal::Literals;
20
21use backtrack;
22use compile::Compiler;
23use dfa;
24use error::Error;
25use input::{ByteInput, CharInput};
26use literal::LiteralSearcher;
27use pikevm;
28use prog::Program;
29use re_builder::RegexOptions;
30use re_bytes;
31use re_set;
32use re_trait::{RegularExpression, Slot, Locations, as_slots};
33use re_unicode;
34use utf8::next_utf8;
35
36/// `Exec` manages the execution of a regular expression.
37///
38/// In particular, this manages the various compiled forms of a single regular
39/// expression and the choice of which matching engine to use to execute a
40/// regular expression.
41pub struct Exec {
42 /// All read only state.
43 ro: Arc<ExecReadOnly>,
44 /// Caches for the various matching engines.
45 cache: CachedThreadLocal<ProgramCache>,
46}
47
48/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
49/// means it is no longer Sync, but we can now avoid the overhead of
50/// synchronization to fetch the cache.
51#[derive(Debug)]
52pub struct ExecNoSync<'c> {
53 /// All read only state.
54 ro: &'c Arc<ExecReadOnly>,
55 /// Caches for the various matching engines.
56 cache: &'c ProgramCache,
57}
58
59/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
60pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
61
62/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
63/// state is determined at compile time and never changes during search.
64#[derive(Debug)]
65struct ExecReadOnly {
66 /// The original regular expressions given by the caller to compile.
67 res: Vec<String>,
68 /// A compiled program that is used in the NFA simulation and backtracking.
69 /// It can be byte-based or Unicode codepoint based.
70 ///
71 /// N.B. It is not possibly to make this byte-based from the public API.
72 /// It is only used for testing byte based programs in the NFA simulations.
73 nfa: Program,
74 /// A compiled byte based program for DFA execution. This is only used
75 /// if a DFA can be executed. (Currently, only word boundary assertions are
76 /// not supported.) Note that this program contains an embedded `.*?`
77 /// preceding the first capture group, unless the regex is anchored at the
78 /// beginning.
79 dfa: Program,
80 /// The same as above, except the program is reversed (and there is no
81 /// preceding `.*?`). This is used by the DFA to find the starting location
82 /// of matches.
83 dfa_reverse: Program,
84 /// A set of suffix literals extracted from the regex.
85 ///
86 /// Prefix literals are stored on the `Program`, since they are used inside
87 /// the matching engines.
88 suffixes: LiteralSearcher,
89 /// match_type encodes as much upfront knowledge about how we're going to
90 /// execute a search as possible.
91 match_type: MatchType,
92}
93
94/// Facilitates the construction of an executor by exposing various knobs
95/// to control how a regex is executed and what kinds of resources it's
96/// permitted to use.
97pub struct ExecBuilder {
98 options: RegexOptions,
99 match_type: Option<MatchType>,
100 bytes: bool,
101 only_utf8: bool,
102}
103
104/// Parsed represents a set of parsed regular expressions and their detected
105/// literals.
106struct Parsed {
107 exprs: Vec<Hir>,
108 prefixes: Literals,
109 suffixes: Literals,
110 bytes: bool,
111}
112
113impl ExecBuilder {
114 /// Create a regex execution builder.
115 ///
116 /// This uses default settings for everything except the regex itself,
117 /// which must be provided. Further knobs can be set by calling methods,
118 /// and then finally, `build` to actually create the executor.
119 pub fn new(re: &str) -> Self {
120 Self::new_many(&[re])
121 }
122
123 /// Like new, but compiles the union of the given regular expressions.
124 ///
125 /// Note that when compiling 2 or more regular expressions, capture groups
126 /// are completely unsupported. (This means both `find` and `captures`
127 /// wont work.)
128 pub fn new_many<I, S>(res: I) -> Self
129 where S: AsRef<str>, I: IntoIterator<Item=S> {
130 let mut opts = RegexOptions::default();
131 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
132 Self::new_options(opts)
133 }
134
135 /// Create a regex execution builder.
136 pub fn new_options(opts: RegexOptions) -> Self {
137 ExecBuilder {
138 options: opts,
139 match_type: None,
140 bytes: false,
141 only_utf8: true,
142 }
143 }
144
145 /// Set the matching engine to be automatically determined.
146 ///
147 /// This is the default state and will apply whatever optimizations are
148 /// possible, such as running a DFA.
149 ///
150 /// This overrides whatever was previously set via the `nfa` or
151 /// `bounded_backtracking` methods.
152 pub fn automatic(mut self) -> Self {
153 self.match_type = None;
154 self
155 }
156
157 /// Sets the matching engine to use the NFA algorithm no matter what
158 /// optimizations are possible.
159 ///
160 /// This overrides whatever was previously set via the `automatic` or
161 /// `bounded_backtracking` methods.
162 pub fn nfa(mut self) -> Self {
163 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
164 self
165 }
166
167 /// Sets the matching engine to use a bounded backtracking engine no
168 /// matter what optimizations are possible.
169 ///
170 /// One must use this with care, since the bounded backtracking engine
171 /// uses memory proportion to `len(regex) * len(text)`.
172 ///
173 /// This overrides whatever was previously set via the `automatic` or
174 /// `nfa` methods.
175 pub fn bounded_backtracking(mut self) -> Self {
176 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
177 self
178 }
179
180 /// Compiles byte based programs for use with the NFA matching engines.
181 ///
182 /// By default, the NFA engines match on Unicode scalar values. They can
183 /// be made to use byte based programs instead. In general, the byte based
184 /// programs are slower because of a less efficient encoding of character
185 /// classes.
186 ///
187 /// Note that this does not impact DFA matching engines, which always
188 /// execute on bytes.
189 pub fn bytes(mut self, yes: bool) -> Self {
190 self.bytes = yes;
191 self
192 }
193
194 /// When disabled, the program compiled may match arbitrary bytes.
195 ///
196 /// When enabled (the default), all compiled programs exclusively match
197 /// valid UTF-8 bytes.
198 pub fn only_utf8(mut self, yes: bool) -> Self {
199 self.only_utf8 = yes;
200 self
201 }
202
203 /// Set the Unicode flag.
204 pub fn unicode(mut self, yes: bool) -> Self {
205 self.options.unicode = yes;
206 self
207 }
208
209 /// Parse the current set of patterns into their AST and extract literals.
210 fn parse(&self) -> Result<Parsed, Error> {
211 let mut exprs = Vec::with_capacity(self.options.pats.len());
212 let mut prefixes = Some(Literals::empty());
213 let mut suffixes = Some(Literals::empty());
214 let mut bytes = false;
215 let is_set = self.options.pats.len() > 1;
216 // If we're compiling a regex set and that set has any anchored
217 // expressions, then disable all literal optimizations.
218 for pat in &self.options.pats {
219 let mut parser =
220 ParserBuilder::new()
221 // TODO(burntsushi): Disable octal in regex 1.0. Nobody
222 // uses it, and we'll get better error messages when
223 // someone tries to use a backreference. Provide a new
224 // opt-in toggle for it though.
225 .octal(true)
226 .case_insensitive(self.options.case_insensitive)
227 .multi_line(self.options.multi_line)
228 .dot_matches_new_line(self.options.dot_matches_new_line)
229 .swap_greed(self.options.swap_greed)
230 .ignore_whitespace(self.options.ignore_whitespace)
231 .unicode(self.options.unicode)
232 .allow_invalid_utf8(!self.only_utf8)
233 .nest_limit(self.options.nest_limit)
234 .build();
235 let expr = try!(parser.parse(pat));
236 bytes = bytes || !expr.is_always_utf8();
237
238 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
239 // Partial anchors unfortunately make it hard to use prefixes,
240 // so disable them.
241 prefixes = None;
242 } else if is_set && expr.is_anchored_start() {
243 // Regex sets with anchors do not go well with literal
244 // optimizations.
245 prefixes = None;
246 }
247 prefixes = prefixes.and_then(|mut prefixes| {
248 if !prefixes.union_prefixes(&expr) {
249 None
250 } else {
251 Some(prefixes)
252 }
253 });
254
255 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
256 // Partial anchors unfortunately make it hard to use suffixes,
257 // so disable them.
258 suffixes = None;
259 } else if is_set && expr.is_anchored_end() {
260 // Regex sets with anchors do not go well with literal
261 // optimizations.
262 suffixes = None;
263 }
264 suffixes = suffixes.and_then(|mut suffixes| {
265 if !suffixes.union_suffixes(&expr) {
266 None
267 } else {
268 Some(suffixes)
269 }
270 });
271 exprs.push(expr);
272 }
273 Ok(Parsed {
274 exprs: exprs,
275 prefixes: prefixes.unwrap_or_else(Literals::empty),
276 suffixes: suffixes.unwrap_or_else(Literals::empty),
277 bytes: bytes,
278 })
279 }
280
281 /// Build an executor that can run a regular expression.
282 pub fn build(self) -> Result<Exec, Error> {
283 // Special case when we have no patterns to compile.
284 // This can happen when compiling a regex set.
285 if self.options.pats.is_empty() {
286 let ro = Arc::new(ExecReadOnly {
287 res: vec![],
288 nfa: Program::new(),
289 dfa: Program::new(),
290 dfa_reverse: Program::new(),
291 suffixes: LiteralSearcher::empty(),
292 match_type: MatchType::Nothing,
293 });
294 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
295 }
296 let parsed = try!(self.parse());
297 let mut nfa = try!(
298 Compiler::new()
299 .size_limit(self.options.size_limit)
300 .bytes(self.bytes || parsed.bytes)
301 .only_utf8(self.only_utf8)
302 .compile(&parsed.exprs));
303 let mut dfa = try!(
304 Compiler::new()
305 .size_limit(self.options.size_limit)
306 .dfa(true)
307 .only_utf8(self.only_utf8)
308 .compile(&parsed.exprs));
309 let mut dfa_reverse = try!(
310 Compiler::new()
311 .size_limit(self.options.size_limit)
312 .dfa(true)
313 .only_utf8(self.only_utf8)
314 .reverse(true)
315 .compile(&parsed.exprs));
316
317 let prefixes = parsed.prefixes.unambiguous_prefixes();
318 let suffixes = parsed.suffixes.unambiguous_suffixes();
319 nfa.prefixes = LiteralSearcher::prefixes(prefixes);
320 dfa.prefixes = nfa.prefixes.clone();
321 dfa.dfa_size_limit = self.options.dfa_size_limit;
322 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
323
324 let mut ro = ExecReadOnly {
325 res: self.options.pats,
326 nfa: nfa,
327 dfa: dfa,
328 dfa_reverse: dfa_reverse,
329 suffixes: LiteralSearcher::suffixes(suffixes),
330 match_type: MatchType::Nothing,
331 };
332 ro.match_type = ro.choose_match_type(self.match_type);
333
334 let ro = Arc::new(ro);
335 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
336 }
337}
338
339impl<'c> RegularExpression for ExecNoSyncStr<'c> {
340 type Text = str;
341
342 fn slots_len(&self) -> usize { self.0.slots_len() }
343
344 fn next_after_empty(&self, text: &str, i: usize) -> usize {
345 next_utf8(text.as_bytes(), i)
346 }
347
348 #[inline(always)] // reduces constant overhead
349 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
350 self.0.shortest_match_at(text.as_bytes(), start)
351 }
352
353 #[inline(always)] // reduces constant overhead
354 fn is_match_at(&self, text: &str, start: usize) -> bool {
355 self.0.is_match_at(text.as_bytes(), start)
356 }
357
358 #[inline(always)] // reduces constant overhead
359 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
360 self.0.find_at(text.as_bytes(), start)
361 }
362
363 #[inline(always)] // reduces constant overhead
364 fn read_captures_at(
365 &self,
366 locs: &mut Locations,
367 text: &str,
368 start: usize,
369 ) -> Option<(usize, usize)> {
370 self.0.read_captures_at(locs, text.as_bytes(), start)
371 }
372}
373
374impl<'c> RegularExpression for ExecNoSync<'c> {
375 type Text = [u8];
376
377 /// Returns the number of capture slots in the regular expression. (There
378 /// are two slots for every capture group, corresponding to possibly empty
379 /// start and end locations of the capture.)
380 fn slots_len(&self) -> usize {
381 self.ro.nfa.captures.len() * 2
382 }
383
384 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
385 i + 1
386 }
387
388 /// Returns the end of a match location, possibly occurring before the
389 /// end location of the correct leftmost-first match.
390 #[inline(always)] // reduces constant overhead
391 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
392 if !self.is_anchor_end_match(text) {
393 return None;
394 }
395 match self.ro.match_type {
396 MatchType::Literal(ty) => {
397 self.find_literals(ty, text, start).map(|(_, e)| e)
398 }
399 MatchType::Dfa | MatchType::DfaMany => {
400 match self.shortest_dfa(text, start) {
401 dfa::Result::Match(end) => Some(end),
402 dfa::Result::NoMatch(_) => None,
403 dfa::Result::Quit => self.shortest_nfa(text, start),
404 }
405 }
406 MatchType::DfaAnchoredReverse => {
407 match dfa::Fsm::reverse(
408 &self.ro.dfa_reverse,
409 self.cache,
410 true,
411 &text[start..],
412 text.len(),
413 ) {
414 dfa::Result::Match(_) => Some(text.len()),
415 dfa::Result::NoMatch(_) => None,
416 dfa::Result::Quit => self.shortest_nfa(text, start),
417 }
418 }
419 MatchType::DfaSuffix => {
420 match self.shortest_dfa_reverse_suffix(text, start) {
421 dfa::Result::Match(e) => Some(e),
422 dfa::Result::NoMatch(_) => None,
423 dfa::Result::Quit => self.shortest_nfa(text, start),
424 }
425 }
426 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
427 MatchType::Nothing => None,
428 }
429 }
430
431 /// Returns true if and only if the regex matches text.
432 ///
433 /// For single regular expressions, this is equivalent to calling
434 /// shortest_match(...).is_some().
435 #[inline(always)] // reduces constant overhead
436 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
437 if !self.is_anchor_end_match(text) {
438 return false;
439 }
440 // We need to do this dance because shortest_match relies on the NFA
441 // filling in captures[1], but a RegexSet has no captures. In other
442 // words, a RegexSet can't (currently) use shortest_match. ---AG
443 match self.ro.match_type {
444 MatchType::Literal(ty) => {
445 self.find_literals(ty, text, start).is_some()
446 }
447 MatchType::Dfa | MatchType::DfaMany => {
448 match self.shortest_dfa(text, start) {
449 dfa::Result::Match(_) => true,
450 dfa::Result::NoMatch(_) => false,
451 dfa::Result::Quit => self.match_nfa(text, start),
452 }
453 }
454 MatchType::DfaAnchoredReverse => {
455 match dfa::Fsm::reverse(
456 &self.ro.dfa_reverse,
457 self.cache,
458 true,
459 &text[start..],
460 text.len(),
461 ) {
462 dfa::Result::Match(_) => true,
463 dfa::Result::NoMatch(_) => false,
464 dfa::Result::Quit => self.match_nfa(text, start),
465 }
466 }
467 MatchType::DfaSuffix => {
468 match self.shortest_dfa_reverse_suffix(text, start) {
469 dfa::Result::Match(_) => true,
470 dfa::Result::NoMatch(_) => false,
471 dfa::Result::Quit => self.match_nfa(text, start),
472 }
473 }
474 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
475 MatchType::Nothing => false,
476 }
477 }
478
479 /// Finds the start and end location of the leftmost-first match, starting
480 /// at the given location.
481 #[inline(always)] // reduces constant overhead
482 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
483 if !self.is_anchor_end_match(text) {
484 return None;
485 }
486 match self.ro.match_type {
487 MatchType::Literal(ty) => {
488 self.find_literals(ty, text, start)
489 }
490 MatchType::Dfa => {
491 match self.find_dfa_forward(text, start) {
492 dfa::Result::Match((s, e)) => Some((s, e)),
493 dfa::Result::NoMatch(_) => None,
494 dfa::Result::Quit => {
495 self.find_nfa(MatchNfaType::Auto, text, start)
496 }
497 }
498 }
499 MatchType::DfaAnchoredReverse => {
500 match self.find_dfa_anchored_reverse(text, start) {
501 dfa::Result::Match((s, e)) => Some((s, e)),
502 dfa::Result::NoMatch(_) => None,
503 dfa::Result::Quit => {
504 self.find_nfa(MatchNfaType::Auto, text, start)
505 }
506 }
507 }
508 MatchType::DfaSuffix => {
509 match self.find_dfa_reverse_suffix(text, start) {
510 dfa::Result::Match((s, e)) => Some((s, e)),
511 dfa::Result::NoMatch(_) => None,
512 dfa::Result::Quit => {
513 self.find_nfa(MatchNfaType::Auto, text, start)
514 }
515 }
516 }
517 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
518 MatchType::Nothing => None,
519 MatchType::DfaMany => {
520 unreachable!("BUG: RegexSet cannot be used with find")
521 }
522 }
523 }
524
525 /// Finds the start and end location of the leftmost-first match and also
526 /// fills in all matching capture groups.
527 ///
528 /// The number of capture slots given should be equal to the total number
529 /// of capture slots in the compiled program.
530 ///
531 /// Note that the first two slots always correspond to the start and end
532 /// locations of the overall match.
533 fn read_captures_at(
534 &self,
535 locs: &mut Locations,
536 text: &[u8],
537 start: usize,
538 ) -> Option<(usize, usize)> {
539 let slots = as_slots(locs);
540 for slot in slots.iter_mut() {
541 *slot = None;
542 }
543 // If the caller unnecessarily uses this, then we try to save them
544 // from themselves.
545 match slots.len() {
546 0 => return self.find_at(text, start),
547 2 => {
548 return self.find_at(text, start).map(|(s, e)| {
549 slots[0] = Some(s);
550 slots[1] = Some(e);
551 (s, e)
552 });
553 }
554 _ => {} // fallthrough
555 }
556 if !self.is_anchor_end_match(text) {
557 return None;
558 }
559 match self.ro.match_type {
560 MatchType::Literal(ty) => {
561 self.find_literals(ty, text, start).and_then(|(s, e)| {
562 self.captures_nfa_with_match(slots, text, s, e)
563 })
564 }
565 MatchType::Dfa => {
566 if self.ro.nfa.is_anchored_start {
567 self.captures_nfa(slots, text, start)
568 } else {
569 match self.find_dfa_forward(text, start) {
570 dfa::Result::Match((s, e)) => {
571 self.captures_nfa_with_match(slots, text, s, e)
572 }
573 dfa::Result::NoMatch(_) => None,
574 dfa::Result::Quit => self.captures_nfa(slots, text, start),
575 }
576 }
577 }
578 MatchType::DfaAnchoredReverse => {
579 match self.find_dfa_anchored_reverse(text, start) {
580 dfa::Result::Match((s, e)) => {
581 self.captures_nfa_with_match(slots, text, s, e)
582 }
583 dfa::Result::NoMatch(_) => None,
584 dfa::Result::Quit => self.captures_nfa(slots, text, start),
585 }
586 }
587 MatchType::DfaSuffix => {
588 match self.find_dfa_reverse_suffix(text, start) {
589 dfa::Result::Match((s, e)) => {
590 self.captures_nfa_with_match(slots, text, s, e)
591 }
592 dfa::Result::NoMatch(_) => None,
593 dfa::Result::Quit => self.captures_nfa(slots, text, start),
594 }
595 }
596 MatchType::Nfa(ty) => {
597 self.captures_nfa_type(ty, slots, text, start)
598 }
599 MatchType::Nothing => None,
600 MatchType::DfaMany => {
601 unreachable!("BUG: RegexSet cannot be used with captures")
602 }
603 }
604 }
605}
606
607impl<'c> ExecNoSync<'c> {
608 /// Finds the leftmost-first match using only literal search.
609 #[inline(always)] // reduces constant overhead
610 fn find_literals(
611 &self,
612 ty: MatchLiteralType,
613 text: &[u8],
614 start: usize,
615 ) -> Option<(usize, usize)> {
616 use self::MatchLiteralType::*;
617 match ty {
618 Unanchored => {
619 let lits = &self.ro.nfa.prefixes;
620 lits.find(&text[start..])
621 .map(|(s, e)| (start + s, start + e))
622 }
623 AnchoredStart => {
624 let lits = &self.ro.nfa.prefixes;
8faf50e0
XL
625 if !self.ro.nfa.is_anchored_start
626 || (self.ro.nfa.is_anchored_start && start == 0) {
627 lits.find_start(&text[start..])
628 .map(|(s, e)| (start + s, start + e))
629 } else {
630 None
631 }
94b46f34
XL
632 }
633 AnchoredEnd => {
634 let lits = &self.ro.suffixes;
635 lits.find_end(&text[start..])
636 .map(|(s, e)| (start + s, start + e))
637 }
638 }
639 }
640
641 /// Finds the leftmost-first match (start and end) using only the DFA.
642 ///
643 /// If the result returned indicates that the DFA quit, then another
644 /// matching engine should be used.
645 #[inline(always)] // reduces constant overhead
646 fn find_dfa_forward(
647 &self,
648 text: &[u8],
649 start: usize,
650 ) -> dfa::Result<(usize, usize)> {
651 use dfa::Result::*;
652 let end = match dfa::Fsm::forward(
653 &self.ro.dfa,
654 self.cache,
655 false,
656 text,
657 start,
658 ) {
659 NoMatch(i) => return NoMatch(i),
660 Quit => return Quit,
661 Match(end) if start == end => return Match((start, start)),
662 Match(end) => end,
663 };
664 // Now run the DFA in reverse to find the start of the match.
665 match dfa::Fsm::reverse(
666 &self.ro.dfa_reverse,
667 self.cache,
668 false,
669 &text[start..],
670 end - start,
671 ) {
672 Match(s) => Match((start + s, end)),
673 NoMatch(i) => NoMatch(i),
674 Quit => Quit,
675 }
676 }
677
678 /// Finds the leftmost-first match (start and end) using only the DFA,
679 /// but assumes the regex is anchored at the end and therefore starts at
680 /// the end of the regex and matches in reverse.
681 ///
682 /// If the result returned indicates that the DFA quit, then another
683 /// matching engine should be used.
684 #[inline(always)] // reduces constant overhead
685 fn find_dfa_anchored_reverse(
686 &self,
687 text: &[u8],
688 start: usize,
689 ) -> dfa::Result<(usize, usize)> {
690 use dfa::Result::*;
691 match dfa::Fsm::reverse(
692 &self.ro.dfa_reverse,
693 self.cache,
694 false,
695 &text[start..],
696 text.len() - start,
697 ) {
698 Match(s) => Match((start + s, text.len())),
699 NoMatch(i) => NoMatch(i),
700 Quit => Quit,
701 }
702 }
703
704 /// Finds the end of the shortest match using only the DFA.
705 #[inline(always)] // reduces constant overhead
706 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
707 dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
708 }
709
710 /// Finds the end of the shortest match using only the DFA by scanning for
711 /// suffix literals.
712 ///
713 #[inline(always)] // reduces constant overhead
714 fn shortest_dfa_reverse_suffix(
715 &self,
716 text: &[u8],
717 start: usize,
718 ) -> dfa::Result<usize> {
719 match self.exec_dfa_reverse_suffix(text, start) {
720 None => self.shortest_dfa(text, start),
721 Some(r) => r.map(|(_, end)| end),
722 }
723 }
724
725 /// Finds the end of the shortest match using only the DFA by scanning for
726 /// suffix literals. It also reports the start of the match.
727 ///
728 /// Note that if None is returned, then the optimization gave up to avoid
729 /// worst case quadratic behavior. A forward scanning DFA should be tried
730 /// next.
731 ///
732 /// If a match is returned and the full leftmost-first match is desired,
733 /// then a forward scan starting from the beginning of the match must be
734 /// done.
735 ///
736 /// If the result returned indicates that the DFA quit, then another
737 /// matching engine should be used.
738 #[inline(always)] // reduces constant overhead
739 fn exec_dfa_reverse_suffix(
740 &self,
741 text: &[u8],
742 original_start: usize,
743 ) -> Option<dfa::Result<(usize, usize)>> {
744 use dfa::Result::*;
745
746 let lcs = self.ro.suffixes.lcs();
747 debug_assert!(lcs.len() >= 1);
748 let mut start = original_start;
749 let mut end = start;
750 while end <= text.len() {
751 start = end;
752 end += match lcs.find(&text[end..]) {
753 None => return Some(NoMatch(text.len())),
754 Some(start) => start + lcs.len(),
755 };
756 match dfa::Fsm::reverse(
757 &self.ro.dfa_reverse,
758 self.cache,
759 false,
760 &text[start..end],
761 end - start,
762 ) {
763 Match(0) | NoMatch(0) => return None,
764 Match(s) => return Some(Match((s + start, end))),
765 NoMatch(_) => continue,
766 Quit => return Some(Quit),
767 };
768 }
769 Some(NoMatch(text.len()))
770 }
771
772 /// Finds the leftmost-first match (start and end) using only the DFA
773 /// by scanning for suffix literals.
774 ///
775 /// If the result returned indicates that the DFA quit, then another
776 /// matching engine should be used.
777 #[inline(always)] // reduces constant overhead
778 fn find_dfa_reverse_suffix(
779 &self,
780 text: &[u8],
781 start: usize,
782 ) -> dfa::Result<(usize, usize)> {
783 use dfa::Result::*;
784
785 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
786 None => return self.find_dfa_forward(text, start),
787 Some(Match((start, _))) => start,
788 Some(r) => return r,
789 };
790 // At this point, we've found a match. The only way to quit now
791 // without a match is if the DFA gives up (seems unlikely).
792 //
793 // Now run the DFA forwards to find the proper end of the match.
794 // (The suffix literal match can only indicate the earliest
795 // possible end location, which may appear before the end of the
796 // leftmost-first match.)
797 match dfa::Fsm::forward(
798 &self.ro.dfa,
799 self.cache,
800 false,
801 text,
802 match_start,
803 ) {
804 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
805 Quit => Quit,
806 Match(e) => Match((match_start, e)),
807 }
808 }
809
810 /// Executes the NFA engine to return whether there is a match or not.
811 ///
812 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
813 /// performance characteristics, but regex sets don't have captures, which
814 /// shortest_nfa depends on.
815 fn match_nfa(
816 &self,
817 text: &[u8],
818 start: usize,
819 ) -> bool {
820 self.match_nfa_type(MatchNfaType::Auto, text, start)
821 }
822
823 /// Like match_nfa, but allows specification of the type of NFA engine.
824 fn match_nfa_type(
825 &self,
826 ty: MatchNfaType,
827 text: &[u8],
828 start: usize,
829 ) -> bool {
830 self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
831 }
832
833 /// Finds the shortest match using an NFA.
834 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
835 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
836 }
837
838 /// Like shortest_nfa, but allows specification of the type of NFA engine.
839 fn shortest_nfa_type(
840 &self,
841 ty: MatchNfaType,
842 text: &[u8],
843 start: usize,
844 ) -> Option<usize> {
845 let mut slots = [None, None];
846 if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
847 slots[1]
848 } else {
849 None
850 }
851 }
852
853 /// Like find, but executes an NFA engine.
854 fn find_nfa(
855 &self,
856 ty: MatchNfaType,
857 text: &[u8],
858 start: usize,
859 ) -> Option<(usize, usize)> {
860 let mut slots = [None, None];
861 if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
862 match (slots[0], slots[1]) {
863 (Some(s), Some(e)) => Some((s, e)),
864 _ => None,
865 }
866 } else {
867 None
868 }
869 }
870
871 /// Like find_nfa, but fills in captures and restricts the search space
872 /// using previously found match information.
873 ///
874 /// `slots` should have length equal to `2 * nfa.captures.len()`.
875 fn captures_nfa_with_match(
876 &self,
877 slots: &mut [Slot],
878 text: &[u8],
879 match_start: usize,
880 match_end: usize,
881 ) -> Option<(usize, usize)> {
882 // We can't use match_end directly, because we may need to examine one
883 // "character" after the end of a match for lookahead operators. We
884 // need to move two characters beyond the end, since some look-around
885 // operations may falsely assume a premature end of text otherwise.
886 let e = cmp::min(
887 next_utf8(text, next_utf8(text, match_end)), text.len());
888 self.captures_nfa(slots, &text[..e], match_start)
889 }
890
891 /// Like find_nfa, but fills in captures.
892 ///
893 /// `slots` should have length equal to `2 * nfa.captures.len()`.
894 fn captures_nfa(
895 &self,
896 slots: &mut [Slot],
897 text: &[u8],
898 start: usize,
899 ) -> Option<(usize, usize)> {
900 self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
901 }
902
903 /// Like captures_nfa, but allows specification of type of NFA engine.
904 fn captures_nfa_type(
905 &self,
906 ty: MatchNfaType,
907 slots: &mut [Slot],
908 text: &[u8],
909 start: usize,
910 ) -> Option<(usize, usize)> {
911 if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
912 match (slots[0], slots[1]) {
913 (Some(s), Some(e)) => Some((s, e)),
914 _ => None,
915 }
916 } else {
917 None
918 }
919 }
920
921 fn exec_nfa(
922 &self,
923 mut ty: MatchNfaType,
924 matches: &mut [bool],
925 slots: &mut [Slot],
926 quit_after_match: bool,
927 text: &[u8],
928 start: usize,
929 ) -> bool {
930 use self::MatchNfaType::*;
931 if let Auto = ty {
932 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
933 ty = Backtrack;
934 } else {
935 ty = PikeVM;
936 }
937 }
938 match ty {
939 Auto => unreachable!(),
940 Backtrack => self.exec_backtrack(matches, slots, text, start),
941 PikeVM => {
942 self.exec_pikevm(
943 matches, slots, quit_after_match, text, start)
944 }
945 }
946 }
947
948 /// Always run the NFA algorithm.
949 fn exec_pikevm(
950 &self,
951 matches: &mut [bool],
952 slots: &mut [Slot],
953 quit_after_match: bool,
954 text: &[u8],
955 start: usize,
956 ) -> bool {
957 if self.ro.nfa.uses_bytes() {
958 pikevm::Fsm::exec(
959 &self.ro.nfa,
960 self.cache,
961 matches,
962 slots,
963 quit_after_match,
964 ByteInput::new(text, self.ro.nfa.only_utf8),
965 start)
966 } else {
967 pikevm::Fsm::exec(
968 &self.ro.nfa,
969 self.cache,
970 matches,
971 slots,
972 quit_after_match,
973 CharInput::new(text),
974 start)
975 }
976 }
977
978 /// Always runs the NFA using bounded backtracking.
979 fn exec_backtrack(
980 &self,
981 matches: &mut [bool],
982 slots: &mut [Slot],
983 text: &[u8],
984 start: usize,
985 ) -> bool {
986 if self.ro.nfa.uses_bytes() {
987 backtrack::Bounded::exec(
988 &self.ro.nfa,
989 self.cache,
990 matches,
991 slots,
992 ByteInput::new(text, self.ro.nfa.only_utf8),
993 start)
994 } else {
995 backtrack::Bounded::exec(
996 &self.ro.nfa,
997 self.cache,
998 matches,
999 slots,
1000 CharInput::new(text),
1001 start)
1002 }
1003 }
1004
1005 /// Finds which regular expressions match the given text.
1006 ///
1007 /// `matches` should have length equal to the number of regexes being
1008 /// searched.
1009 ///
1010 /// This is only useful when one wants to know which regexes in a set
1011 /// match some text.
1012 pub fn many_matches_at(
1013 &self,
1014 matches: &mut [bool],
1015 text: &[u8],
1016 start: usize,
1017 ) -> bool {
1018 use self::MatchType::*;
1019 if !self.is_anchor_end_match(text) {
1020 return false;
1021 }
1022 match self.ro.match_type {
1023 Literal(ty) => {
1024 debug_assert_eq!(matches.len(), 1);
1025 matches[0] = self.find_literals(ty, text, start).is_some();
1026 matches[0]
1027 }
1028 Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1029 match dfa::Fsm::forward_many(
1030 &self.ro.dfa,
1031 self.cache,
1032 matches,
1033 text,
1034 start,
1035 ) {
1036 dfa::Result::Match(_) => true,
1037 dfa::Result::NoMatch(_) => false,
1038 dfa::Result::Quit => {
1039 self.exec_nfa(
1040 MatchNfaType::Auto,
1041 matches,
1042 &mut [],
1043 false,
1044 text,
1045 start)
1046 }
1047 }
1048 }
1049 Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1050 Nothing => false,
1051 }
1052 }
1053
1054 #[inline(always)] // reduces constant overhead
1055 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1056 // Only do this check if the haystack is big (>1MB).
1057 if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1058 let lcs = self.ro.suffixes.lcs();
1059 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1060 return false;
1061 }
1062 }
1063 true
1064 }
1065
1066 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1067 &self.ro.nfa.capture_name_idx
1068 }
1069}
1070
1071impl<'c> ExecNoSyncStr<'c> {
1072 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1073 self.0.capture_name_idx()
1074 }
1075}
1076
1077impl Exec {
1078 /// Get a searcher that isn't Sync.
1079 #[inline(always)] // reduces constant overhead
1080 pub fn searcher(&self) -> ExecNoSync {
1081 let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1082 ExecNoSync {
1083 ro: &self.ro, // a clone is too expensive here! (and not needed)
1084 cache: self.cache.get_or(create),
1085 }
1086 }
1087
1088 /// Get a searcher that isn't Sync and can match on &str.
1089 #[inline(always)] // reduces constant overhead
1090 pub fn searcher_str(&self) -> ExecNoSyncStr {
1091 ExecNoSyncStr(self.searcher())
1092 }
1093
1094 /// Build a Regex from this executor.
1095 pub fn into_regex(self) -> re_unicode::Regex {
1096 re_unicode::Regex::from(self)
1097 }
1098
1099 /// Build a RegexSet from this executor.
1100 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1101 re_set::unicode::RegexSet::from(self)
1102 }
1103
1104 /// Build a Regex from this executor that can match arbitrary bytes.
1105 pub fn into_byte_regex(self) -> re_bytes::Regex {
1106 re_bytes::Regex::from(self)
1107 }
1108
1109 /// Build a RegexSet from this executor that can match arbitrary bytes.
1110 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1111 re_set::bytes::RegexSet::from(self)
1112 }
1113
1114 /// The original regular expressions given by the caller that were
1115 /// compiled.
1116 pub fn regex_strings(&self) -> &[String] {
1117 &self.ro.res
1118 }
1119
1120 /// Return a slice of capture names.
1121 ///
1122 /// Any capture that isn't named is None.
1123 pub fn capture_names(&self) -> &[Option<String>] {
1124 &self.ro.nfa.captures
1125 }
1126
1127 /// Return a reference to named groups mapping (from group name to
1128 /// group position).
1129 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1130 &self.ro.nfa.capture_name_idx
1131 }
1132}
1133
1134impl Clone for Exec {
1135 fn clone(&self) -> Exec {
1136 Exec {
1137 ro: self.ro.clone(),
1138 cache: CachedThreadLocal::new(),
1139 }
1140 }
1141}
1142
1143impl ExecReadOnly {
1144 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1145 use self::MatchType::*;
1146 if let Some(Nfa(_)) = hint {
1147 return hint.unwrap();
1148 }
1149 // If the NFA is empty, then we'll never match anything.
1150 if self.nfa.insts.is_empty() {
1151 return Nothing;
1152 }
1153 // If our set of prefixes is complete, then we can use it to find
1154 // a match in lieu of a regex engine. This doesn't quite work well in
1155 // the presence of multiple regexes, so only do it when there's one.
1156 //
1157 // TODO(burntsushi): Also, don't try to match literals if the regex is
1158 // partially anchored. We could technically do it, but we'd need to
1159 // create two sets of literals: all of them and then the subset that
1160 // aren't anchored. We would then only search for all of them when at
1161 // the beginning of the input and use the subset in all other cases.
1162 if self.res.len() == 1 {
1163 if self.nfa.prefixes.complete() {
1164 return if self.nfa.is_anchored_start {
1165 Literal(MatchLiteralType::AnchoredStart)
1166 } else {
1167 Literal(MatchLiteralType::Unanchored)
1168 };
1169 }
1170 if self.suffixes.complete() {
1171 return if self.nfa.is_anchored_end {
1172 Literal(MatchLiteralType::AnchoredEnd)
1173 } else {
1174 // This case shouldn't happen. When the regex isn't
1175 // anchored, then complete prefixes should imply complete
1176 // suffixes.
1177 Literal(MatchLiteralType::Unanchored)
1178 };
1179 }
1180 }
1181 // If we can execute the DFA, then we totally should.
1182 if dfa::can_exec(&self.dfa) {
1183 // Regex sets require a slightly specialized path.
1184 if self.res.len() >= 2 {
1185 return DfaMany;
1186 }
1187 // If the regex is anchored at the end but not the start, then
1188 // just match in reverse from the end of the haystack.
1189 if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1190 return DfaAnchoredReverse;
1191 }
1192 // If there's a longish suffix literal, then it might be faster
1193 // to look for that first.
1194 if self.should_suffix_scan() {
1195 return DfaSuffix;
1196 }
1197 // Fall back to your garden variety forward searching lazy DFA.
1198 return Dfa;
1199 }
1200 // We're so totally hosed.
1201 Nfa(MatchNfaType::Auto)
1202 }
1203
1204 /// Returns true if the program is amenable to suffix scanning.
1205 ///
1206 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1207 /// for suffix literals and then do a *reverse* DFA match from any matches
1208 /// produced by the literal scan. (And then followed by a forward DFA
1209 /// search, since the previously found suffix literal maybe not actually be
1210 /// the end of a match.)
1211 ///
1212 /// This is a bit of a specialized optimization, but can result in pretty
1213 /// big performance wins if 1) there are no prefix literals and 2) the
1214 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1215 /// account for but (2) is harder. As a proxy, we assume that longer
1216 /// strings are generally rarer, so we only enable this optimization when
1217 /// we have a meaty suffix.
1218 fn should_suffix_scan(&self) -> bool {
1219 if self.suffixes.is_empty() {
1220 return false;
1221 }
1222 let lcs_len = self.suffixes.lcs().char_len();
1223 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1224 }
1225}
1226
1227#[derive(Clone, Copy, Debug)]
1228enum MatchType {
1229 /// A single or multiple literal search. This is only used when the regex
1230 /// can be decomposed into unambiguous literal search.
1231 Literal(MatchLiteralType),
1232 /// A normal DFA search.
1233 Dfa,
1234 /// A reverse DFA search starting from the end of a haystack.
1235 DfaAnchoredReverse,
1236 /// A reverse DFA search with suffix literal scanning.
1237 DfaSuffix,
1238 /// Use the DFA on two or more regular expressions.
1239 DfaMany,
1240 /// An NFA variant.
1241 Nfa(MatchNfaType),
1242 /// No match is ever possible, so don't ever try to search.
1243 Nothing,
1244}
1245
1246#[derive(Clone, Copy, Debug)]
1247enum MatchLiteralType {
1248 /// Match literals anywhere in text.
1249 Unanchored,
1250 /// Match literals only at the start of text.
1251 AnchoredStart,
1252 /// Match literals only at the end of text.
1253 AnchoredEnd,
1254}
1255
1256#[derive(Clone, Copy, Debug)]
1257enum MatchNfaType {
1258 /// Choose between Backtrack and PikeVM.
1259 Auto,
1260 /// NFA bounded backtracking.
1261 ///
1262 /// (This is only set by tests, since it never makes sense to always want
1263 /// backtracking.)
1264 Backtrack,
1265 /// The Pike VM.
1266 ///
1267 /// (This is only set by tests, since it never makes sense to always want
1268 /// the Pike VM.)
1269 PikeVM,
1270}
1271
1272/// `ProgramCache` maintains reusable allocations for each matching engine
1273/// available to a particular program.
1274pub type ProgramCache = RefCell<ProgramCacheInner>;
1275
1276#[derive(Clone, Debug)]
1277pub struct ProgramCacheInner {
1278 pub pikevm: pikevm::Cache,
1279 pub backtrack: backtrack::Cache,
1280 pub dfa: dfa::Cache,
1281 pub dfa_reverse: dfa::Cache,
1282}
1283
1284impl ProgramCacheInner {
1285 fn new(ro: &ExecReadOnly) -> Self {
1286 ProgramCacheInner {
1287 pikevm: pikevm::Cache::new(&ro.nfa),
1288 backtrack: backtrack::Cache::new(&ro.nfa),
1289 dfa: dfa::Cache::new(&ro.dfa),
1290 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1291 }
1292 }
1293}
8faf50e0
XL
1294
1295#[cfg(test)]
1296mod test {
1297 #[test]
1298 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1299 use internal::ExecBuilder;
1300
1301 let backtrack_bytes_re = ExecBuilder::new("^S")
1302 .bounded_backtracking()
1303 .only_utf8(false)
1304 .build()
1305 .map(|exec| exec.into_byte_regex())
1306 .map_err(|err| format!("{}", err))
1307 .unwrap();
1308
1309 let default_bytes_re = ExecBuilder::new("^S")
1310 .only_utf8(false)
1311 .build()
1312 .map(|exec| exec.into_byte_regex())
1313 .map_err(|err| format!("{}", err))
1314 .unwrap();
1315
1316 let input = vec![83, 83];
1317
1318 let s1 = backtrack_bytes_re.split(&input);
1319 let s2 = default_bytes_re.split(&input);
1320 for (chunk1, chunk2) in s1.zip(s2) {
1321 assert_eq!(chunk1, chunk2);
1322 }
1323 }
1324
1325 #[test]
1326 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1327 use internal::ExecBuilder;
1328
1329 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1330 .bounded_backtracking()
1331 .bytes(true)
1332 .build()
1333 .map(|exec| exec.into_regex())
1334 .map_err(|err| format!("{}", err))
1335 .unwrap();
1336
1337 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1338 .bytes(true)
1339 .build()
1340 .map(|exec| exec.into_regex())
1341 .map_err(|err| format!("{}", err))
1342 .unwrap();
1343
1344 let input = "**";
1345
1346 let s1 = backtrack_bytes_re.split(input);
1347 let s2 = default_bytes_re.split(input);
1348 for (chunk1, chunk2) in s1.zip(s2) {
1349 assert_eq!(chunk1, chunk2);
1350 }
1351 }
1352}