]>
Commit | Line | Data |
---|---|---|
8bb4bdeb XL |
1 | // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use std::cell::RefCell; | |
12 | use std::collections::HashMap; | |
13 | use std::cmp; | |
14 | use std::sync::Arc; | |
15 | ||
16 | use thread_local::CachedThreadLocal; | |
17 | use syntax::{Expr, ExprBuilder, Literals}; | |
18 | ||
19 | use backtrack; | |
20 | use compile::Compiler; | |
21 | use dfa; | |
22 | use error::Error; | |
23 | use input::{ByteInput, CharInput}; | |
24 | use literals::LiteralSearcher; | |
25 | use pikevm; | |
26 | use prog::Program; | |
27 | use re_builder::RegexOptions; | |
28 | use re_bytes; | |
29 | use re_set; | |
30 | use re_trait::{RegularExpression, Slot, Locations, as_slots}; | |
31 | use re_unicode; | |
32 | use utf8::next_utf8; | |
33 | ||
ff7c6d11 | 34 | /// `Exec` manages the execution of a regular expression. |
8bb4bdeb XL |
35 | /// |
36 | /// In particular, this manages the various compiled forms of a single regular | |
37 | /// expression and the choice of which matching engine to use to execute a | |
38 | /// regular expression. | |
39 | pub struct Exec { | |
40 | /// All read only state. | |
41 | ro: Arc<ExecReadOnly>, | |
42 | /// Caches for the various matching engines. | |
43 | cache: CachedThreadLocal<ProgramCache>, | |
44 | } | |
45 | ||
ff7c6d11 | 46 | /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This |
8bb4bdeb XL |
47 | /// means it is no longer Sync, but we can now avoid the overhead of |
48 | /// synchronization to fetch the cache. | |
49 | #[derive(Debug)] | |
50 | pub struct ExecNoSync<'c> { | |
51 | /// All read only state. | |
52 | ro: &'c Arc<ExecReadOnly>, | |
53 | /// Caches for the various matching engines. | |
54 | cache: &'c ProgramCache, | |
55 | } | |
56 | ||
ff7c6d11 | 57 | /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. |
8bb4bdeb XL |
58 | pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); |
59 | ||
ff7c6d11 | 60 | /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such |
8bb4bdeb XL |
61 | /// state is determined at compile time and never changes during search. |
62 | #[derive(Debug)] | |
63 | struct ExecReadOnly { | |
64 | /// The original regular expressions given by the caller to compile. | |
65 | res: Vec<String>, | |
66 | /// A compiled program that is used in the NFA simulation and backtracking. | |
67 | /// It can be byte-based or Unicode codepoint based. | |
68 | /// | |
69 | /// N.B. It is not possibly to make this byte-based from the public API. | |
70 | /// It is only used for testing byte based programs in the NFA simulations. | |
71 | nfa: Program, | |
72 | /// A compiled byte based program for DFA execution. This is only used | |
73 | /// if a DFA can be executed. (Currently, only word boundary assertions are | |
74 | /// not supported.) Note that this program contains an embedded `.*?` | |
75 | /// preceding the first capture group, unless the regex is anchored at the | |
76 | /// beginning. | |
77 | dfa: Program, | |
78 | /// The same as above, except the program is reversed (and there is no | |
79 | /// preceding `.*?`). This is used by the DFA to find the starting location | |
80 | /// of matches. | |
81 | dfa_reverse: Program, | |
82 | /// A set of suffix literals extracted from the regex. | |
83 | /// | |
84 | /// Prefix literals are stored on the `Program`, since they are used inside | |
85 | /// the matching engines. | |
86 | suffixes: LiteralSearcher, | |
87 | /// match_type encodes as much upfront knowledge about how we're going to | |
88 | /// execute a search as possible. | |
89 | match_type: MatchType, | |
90 | } | |
91 | ||
92 | /// Facilitates the construction of an executor by exposing various knobs | |
93 | /// to control how a regex is executed and what kinds of resources it's | |
94 | /// permitted to use. | |
95 | pub struct ExecBuilder { | |
96 | options: RegexOptions, | |
97 | match_type: Option<MatchType>, | |
98 | bytes: bool, | |
99 | only_utf8: bool, | |
100 | } | |
101 | ||
102 | /// Parsed represents a set of parsed regular expressions and their detected | |
103 | /// literals. | |
104 | struct Parsed { | |
105 | exprs: Vec<Expr>, | |
106 | prefixes: Literals, | |
107 | suffixes: Literals, | |
108 | bytes: bool, | |
109 | } | |
110 | ||
111 | impl ExecBuilder { | |
112 | /// Create a regex execution builder. | |
113 | /// | |
114 | /// This uses default settings for everything except the regex itself, | |
115 | /// which must be provided. Further knobs can be set by calling methods, | |
116 | /// and then finally, `build` to actually create the executor. | |
117 | pub fn new(re: &str) -> Self { | |
118 | Self::new_many(&[re]) | |
119 | } | |
120 | ||
121 | /// Like new, but compiles the union of the given regular expressions. | |
122 | /// | |
123 | /// Note that when compiling 2 or more regular expressions, capture groups | |
124 | /// are completely unsupported. (This means both `find` and `captures` | |
125 | /// wont work.) | |
126 | pub fn new_many<I, S>(res: I) -> Self | |
127 | where S: AsRef<str>, I: IntoIterator<Item=S> { | |
128 | let mut opts = RegexOptions::default(); | |
129 | opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); | |
130 | Self::new_options(opts) | |
131 | } | |
132 | ||
133 | /// Create a regex execution builder. | |
134 | pub fn new_options(opts: RegexOptions) -> Self { | |
135 | ExecBuilder { | |
136 | options: opts, | |
137 | match_type: None, | |
138 | bytes: false, | |
139 | only_utf8: true, | |
140 | } | |
141 | } | |
142 | ||
143 | /// Set the matching engine to be automatically determined. | |
144 | /// | |
145 | /// This is the default state and will apply whatever optimizations are | |
146 | /// possible, such as running a DFA. | |
147 | /// | |
148 | /// This overrides whatever was previously set via the `nfa` or | |
149 | /// `bounded_backtracking` methods. | |
150 | pub fn automatic(mut self) -> Self { | |
151 | self.match_type = None; | |
152 | self | |
153 | } | |
154 | ||
155 | /// Sets the matching engine to use the NFA algorithm no matter what | |
156 | /// optimizations are possible. | |
157 | /// | |
158 | /// This overrides whatever was previously set via the `automatic` or | |
159 | /// `bounded_backtracking` methods. | |
160 | pub fn nfa(mut self) -> Self { | |
161 | self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); | |
162 | self | |
163 | } | |
164 | ||
165 | /// Sets the matching engine to use a bounded backtracking engine no | |
166 | /// matter what optimizations are possible. | |
167 | /// | |
168 | /// One must use this with care, since the bounded backtracking engine | |
169 | /// uses memory proportion to `len(regex) * len(text)`. | |
170 | /// | |
171 | /// This overrides whatever was previously set via the `automatic` or | |
172 | /// `nfa` methods. | |
173 | pub fn bounded_backtracking(mut self) -> Self { | |
174 | self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); | |
175 | self | |
176 | } | |
177 | ||
178 | /// Compiles byte based programs for use with the NFA matching engines. | |
179 | /// | |
180 | /// By default, the NFA engines match on Unicode scalar values. They can | |
181 | /// be made to use byte based programs instead. In general, the byte based | |
182 | /// programs are slower because of a less efficient encoding of character | |
183 | /// classes. | |
184 | /// | |
185 | /// Note that this does not impact DFA matching engines, which always | |
186 | /// execute on bytes. | |
187 | pub fn bytes(mut self, yes: bool) -> Self { | |
188 | self.bytes = yes; | |
189 | self | |
190 | } | |
191 | ||
192 | /// When disabled, the program compiled may match arbitrary bytes. | |
193 | /// | |
194 | /// When enabled (the default), all compiled programs exclusively match | |
195 | /// valid UTF-8 bytes. | |
196 | pub fn only_utf8(mut self, yes: bool) -> Self { | |
197 | self.only_utf8 = yes; | |
198 | self | |
199 | } | |
200 | ||
201 | /// Set the Unicode flag. | |
202 | pub fn unicode(mut self, yes: bool) -> Self { | |
203 | self.options.unicode = yes; | |
204 | self | |
205 | } | |
206 | ||
207 | /// Parse the current set of patterns into their AST and extract literals. | |
208 | fn parse(&self) -> Result<Parsed, Error> { | |
209 | let mut exprs = Vec::with_capacity(self.options.pats.len()); | |
210 | let mut prefixes = Some(Literals::empty()); | |
211 | let mut suffixes = Some(Literals::empty()); | |
212 | let mut bytes = false; | |
041b39d2 XL |
213 | let is_set = self.options.pats.len() > 1; |
214 | // If we're compiling a regex set and that set has any anchored | |
215 | // expressions, then disable all literal optimizations. | |
8bb4bdeb XL |
216 | for pat in &self.options.pats { |
217 | let parser = | |
218 | ExprBuilder::new() | |
219 | .case_insensitive(self.options.case_insensitive) | |
220 | .multi_line(self.options.multi_line) | |
221 | .dot_matches_new_line(self.options.dot_matches_new_line) | |
222 | .swap_greed(self.options.swap_greed) | |
223 | .ignore_whitespace(self.options.ignore_whitespace) | |
224 | .unicode(self.options.unicode) | |
225 | .allow_bytes(!self.only_utf8); | |
226 | let expr = try!(parser.parse(pat)); | |
227 | bytes = bytes || expr.has_bytes(); | |
228 | ||
229 | if !expr.is_anchored_start() && expr.has_anchored_start() { | |
230 | // Partial anchors unfortunately make it hard to use prefixes, | |
231 | // so disable them. | |
232 | prefixes = None; | |
041b39d2 XL |
233 | } else if is_set && expr.is_anchored_start() { |
234 | // Regex sets with anchors do not go well with literal | |
235 | // optimizations. | |
236 | prefixes = None; | |
8bb4bdeb XL |
237 | } |
238 | prefixes = prefixes.and_then(|mut prefixes| { | |
239 | if !prefixes.union_prefixes(&expr) { | |
240 | None | |
241 | } else { | |
242 | Some(prefixes) | |
243 | } | |
244 | }); | |
245 | ||
246 | if !expr.is_anchored_end() && expr.has_anchored_end() { | |
247 | // Partial anchors unfortunately make it hard to use suffixes, | |
248 | // so disable them. | |
249 | suffixes = None; | |
041b39d2 XL |
250 | } else if is_set && expr.is_anchored_end() { |
251 | // Regex sets with anchors do not go well with literal | |
252 | // optimizations. | |
253 | prefixes = None; | |
8bb4bdeb XL |
254 | } |
255 | suffixes = suffixes.and_then(|mut suffixes| { | |
256 | if !suffixes.union_suffixes(&expr) { | |
257 | None | |
258 | } else { | |
259 | Some(suffixes) | |
260 | } | |
261 | }); | |
262 | exprs.push(expr); | |
263 | } | |
264 | Ok(Parsed { | |
265 | exprs: exprs, | |
ff7c6d11 XL |
266 | prefixes: prefixes.unwrap_or_else(Literals::empty), |
267 | suffixes: suffixes.unwrap_or_else(Literals::empty), | |
8bb4bdeb XL |
268 | bytes: bytes, |
269 | }) | |
270 | } | |
271 | ||
272 | /// Build an executor that can run a regular expression. | |
273 | pub fn build(self) -> Result<Exec, Error> { | |
274 | // Special case when we have no patterns to compile. | |
275 | // This can happen when compiling a regex set. | |
276 | if self.options.pats.is_empty() { | |
277 | let ro = Arc::new(ExecReadOnly { | |
278 | res: vec![], | |
279 | nfa: Program::new(), | |
280 | dfa: Program::new(), | |
281 | dfa_reverse: Program::new(), | |
282 | suffixes: LiteralSearcher::empty(), | |
283 | match_type: MatchType::Nothing, | |
284 | }); | |
285 | return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }); | |
286 | } | |
287 | let parsed = try!(self.parse()); | |
288 | let mut nfa = try!( | |
289 | Compiler::new() | |
290 | .size_limit(self.options.size_limit) | |
291 | .bytes(self.bytes || parsed.bytes) | |
292 | .only_utf8(self.only_utf8) | |
293 | .compile(&parsed.exprs)); | |
294 | let mut dfa = try!( | |
295 | Compiler::new() | |
296 | .size_limit(self.options.size_limit) | |
297 | .dfa(true) | |
298 | .only_utf8(self.only_utf8) | |
299 | .compile(&parsed.exprs)); | |
300 | let mut dfa_reverse = try!( | |
301 | Compiler::new() | |
302 | .size_limit(self.options.size_limit) | |
303 | .dfa(true) | |
304 | .only_utf8(self.only_utf8) | |
305 | .reverse(true) | |
306 | .compile(&parsed.exprs)); | |
307 | ||
308 | let prefixes = parsed.prefixes.unambiguous_prefixes(); | |
309 | let suffixes = parsed.suffixes.unambiguous_suffixes(); | |
310 | nfa.prefixes = LiteralSearcher::prefixes(prefixes); | |
311 | dfa.prefixes = nfa.prefixes.clone(); | |
312 | dfa.dfa_size_limit = self.options.dfa_size_limit; | |
313 | dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; | |
314 | ||
315 | let mut ro = ExecReadOnly { | |
316 | res: self.options.pats, | |
317 | nfa: nfa, | |
318 | dfa: dfa, | |
319 | dfa_reverse: dfa_reverse, | |
320 | suffixes: LiteralSearcher::suffixes(suffixes), | |
321 | match_type: MatchType::Nothing, | |
322 | }; | |
323 | ro.match_type = ro.choose_match_type(self.match_type); | |
324 | ||
325 | let ro = Arc::new(ro); | |
326 | Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }) | |
327 | } | |
328 | } | |
329 | ||
330 | impl<'c> RegularExpression for ExecNoSyncStr<'c> { | |
331 | type Text = str; | |
332 | ||
333 | fn slots_len(&self) -> usize { self.0.slots_len() } | |
334 | ||
335 | fn next_after_empty(&self, text: &str, i: usize) -> usize { | |
336 | next_utf8(text.as_bytes(), i) | |
337 | } | |
338 | ||
339 | #[inline(always)] // reduces constant overhead | |
340 | fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { | |
341 | self.0.shortest_match_at(text.as_bytes(), start) | |
342 | } | |
343 | ||
344 | #[inline(always)] // reduces constant overhead | |
345 | fn is_match_at(&self, text: &str, start: usize) -> bool { | |
346 | self.0.is_match_at(text.as_bytes(), start) | |
347 | } | |
348 | ||
349 | #[inline(always)] // reduces constant overhead | |
350 | fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { | |
351 | self.0.find_at(text.as_bytes(), start) | |
352 | } | |
353 | ||
354 | #[inline(always)] // reduces constant overhead | |
355 | fn read_captures_at( | |
356 | &self, | |
357 | locs: &mut Locations, | |
358 | text: &str, | |
359 | start: usize, | |
360 | ) -> Option<(usize, usize)> { | |
361 | self.0.read_captures_at(locs, text.as_bytes(), start) | |
362 | } | |
363 | } | |
364 | ||
365 | impl<'c> RegularExpression for ExecNoSync<'c> { | |
366 | type Text = [u8]; | |
367 | ||
368 | /// Returns the number of capture slots in the regular expression. (There | |
369 | /// are two slots for every capture group, corresponding to possibly empty | |
370 | /// start and end locations of the capture.) | |
371 | fn slots_len(&self) -> usize { | |
372 | self.ro.nfa.captures.len() * 2 | |
373 | } | |
374 | ||
375 | fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { | |
376 | i + 1 | |
377 | } | |
378 | ||
379 | /// Returns the end of a match location, possibly occurring before the | |
380 | /// end location of the correct leftmost-first match. | |
381 | #[inline(always)] // reduces constant overhead | |
382 | fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { | |
383 | if !self.is_anchor_end_match(text) { | |
384 | return None; | |
385 | } | |
386 | match self.ro.match_type { | |
387 | MatchType::Literal(ty) => { | |
388 | self.find_literals(ty, text, start).map(|(_, e)| e) | |
389 | } | |
390 | MatchType::Dfa | MatchType::DfaMany => { | |
391 | match self.shortest_dfa(text, start) { | |
392 | dfa::Result::Match(end) => Some(end), | |
393 | dfa::Result::NoMatch(_) => None, | |
394 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
395 | } | |
396 | } | |
397 | MatchType::DfaAnchoredReverse => { | |
398 | match dfa::Fsm::reverse( | |
399 | &self.ro.dfa_reverse, | |
ff7c6d11 | 400 | self.cache, |
8bb4bdeb XL |
401 | true, |
402 | &text[start..], | |
403 | text.len(), | |
404 | ) { | |
405 | dfa::Result::Match(_) => Some(text.len()), | |
406 | dfa::Result::NoMatch(_) => None, | |
407 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
408 | } | |
409 | } | |
410 | MatchType::DfaSuffix => { | |
411 | match self.shortest_dfa_reverse_suffix(text, start) { | |
412 | dfa::Result::Match(e) => Some(e), | |
413 | dfa::Result::NoMatch(_) => None, | |
414 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
415 | } | |
416 | } | |
417 | MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), | |
418 | MatchType::Nothing => None, | |
419 | } | |
420 | } | |
421 | ||
422 | /// Returns true if and only if the regex matches text. | |
423 | /// | |
424 | /// For single regular expressions, this is equivalent to calling | |
425 | /// shortest_match(...).is_some(). | |
426 | #[inline(always)] // reduces constant overhead | |
427 | fn is_match_at(&self, text: &[u8], start: usize) -> bool { | |
428 | if !self.is_anchor_end_match(text) { | |
429 | return false; | |
430 | } | |
431 | // We need to do this dance because shortest_match relies on the NFA | |
432 | // filling in captures[1], but a RegexSet has no captures. In other | |
433 | // words, a RegexSet can't (currently) use shortest_match. ---AG | |
434 | match self.ro.match_type { | |
435 | MatchType::Literal(ty) => { | |
436 | self.find_literals(ty, text, start).is_some() | |
437 | } | |
438 | MatchType::Dfa | MatchType::DfaMany => { | |
439 | match self.shortest_dfa(text, start) { | |
440 | dfa::Result::Match(_) => true, | |
441 | dfa::Result::NoMatch(_) => false, | |
442 | dfa::Result::Quit => self.match_nfa(text, start), | |
443 | } | |
444 | } | |
445 | MatchType::DfaAnchoredReverse => { | |
446 | match dfa::Fsm::reverse( | |
447 | &self.ro.dfa_reverse, | |
ff7c6d11 | 448 | self.cache, |
8bb4bdeb XL |
449 | true, |
450 | &text[start..], | |
451 | text.len(), | |
452 | ) { | |
453 | dfa::Result::Match(_) => true, | |
454 | dfa::Result::NoMatch(_) => false, | |
455 | dfa::Result::Quit => self.match_nfa(text, start), | |
456 | } | |
457 | } | |
458 | MatchType::DfaSuffix => { | |
459 | match self.shortest_dfa_reverse_suffix(text, start) { | |
460 | dfa::Result::Match(_) => true, | |
461 | dfa::Result::NoMatch(_) => false, | |
462 | dfa::Result::Quit => self.match_nfa(text, start), | |
463 | } | |
464 | } | |
465 | MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), | |
466 | MatchType::Nothing => false, | |
467 | } | |
468 | } | |
469 | ||
470 | /// Finds the start and end location of the leftmost-first match, starting | |
471 | /// at the given location. | |
472 | #[inline(always)] // reduces constant overhead | |
473 | fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { | |
474 | if !self.is_anchor_end_match(text) { | |
475 | return None; | |
476 | } | |
477 | match self.ro.match_type { | |
478 | MatchType::Literal(ty) => { | |
479 | self.find_literals(ty, text, start) | |
480 | } | |
481 | MatchType::Dfa => { | |
482 | match self.find_dfa_forward(text, start) { | |
483 | dfa::Result::Match((s, e)) => Some((s, e)), | |
484 | dfa::Result::NoMatch(_) => None, | |
485 | dfa::Result::Quit => { | |
486 | self.find_nfa(MatchNfaType::Auto, text, start) | |
487 | } | |
488 | } | |
489 | } | |
490 | MatchType::DfaAnchoredReverse => { | |
491 | match self.find_dfa_anchored_reverse(text, start) { | |
492 | dfa::Result::Match((s, e)) => Some((s, e)), | |
493 | dfa::Result::NoMatch(_) => None, | |
494 | dfa::Result::Quit => { | |
495 | self.find_nfa(MatchNfaType::Auto, text, start) | |
496 | } | |
497 | } | |
498 | } | |
499 | MatchType::DfaSuffix => { | |
500 | match self.find_dfa_reverse_suffix(text, start) { | |
501 | dfa::Result::Match((s, e)) => Some((s, e)), | |
502 | dfa::Result::NoMatch(_) => None, | |
503 | dfa::Result::Quit => { | |
504 | self.find_nfa(MatchNfaType::Auto, text, start) | |
505 | } | |
506 | } | |
507 | } | |
508 | MatchType::Nfa(ty) => self.find_nfa(ty, text, start), | |
509 | MatchType::Nothing => None, | |
510 | MatchType::DfaMany => { | |
511 | unreachable!("BUG: RegexSet cannot be used with find") | |
512 | } | |
513 | } | |
514 | } | |
515 | ||
516 | /// Finds the start and end location of the leftmost-first match and also | |
517 | /// fills in all matching capture groups. | |
518 | /// | |
519 | /// The number of capture slots given should be equal to the total number | |
520 | /// of capture slots in the compiled program. | |
521 | /// | |
522 | /// Note that the first two slots always correspond to the start and end | |
523 | /// locations of the overall match. | |
524 | fn read_captures_at( | |
525 | &self, | |
526 | locs: &mut Locations, | |
527 | text: &[u8], | |
528 | start: usize, | |
529 | ) -> Option<(usize, usize)> { | |
530 | let slots = as_slots(locs); | |
531 | for slot in slots.iter_mut() { | |
532 | *slot = None; | |
533 | } | |
534 | // If the caller unnecessarily uses this, then we try to save them | |
535 | // from themselves. | |
536 | match slots.len() { | |
537 | 0 => return self.find_at(text, start), | |
538 | 2 => { | |
539 | return self.find_at(text, start).map(|(s, e)| { | |
540 | slots[0] = Some(s); | |
541 | slots[1] = Some(e); | |
542 | (s, e) | |
543 | }); | |
544 | } | |
545 | _ => {} // fallthrough | |
546 | } | |
547 | if !self.is_anchor_end_match(text) { | |
548 | return None; | |
549 | } | |
550 | match self.ro.match_type { | |
551 | MatchType::Literal(ty) => { | |
552 | self.find_literals(ty, text, start).and_then(|(s, e)| { | |
553 | self.captures_nfa_with_match(slots, text, s, e) | |
554 | }) | |
555 | } | |
556 | MatchType::Dfa => { | |
2c00a5a8 XL |
557 | if self.ro.nfa.is_anchored_start { |
558 | self.captures_nfa(slots, text, start) | |
559 | } else { | |
560 | match self.find_dfa_forward(text, start) { | |
561 | dfa::Result::Match((s, e)) => { | |
562 | self.captures_nfa_with_match(slots, text, s, e) | |
563 | } | |
564 | dfa::Result::NoMatch(_) => None, | |
565 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
8bb4bdeb | 566 | } |
8bb4bdeb XL |
567 | } |
568 | } | |
569 | MatchType::DfaAnchoredReverse => { | |
570 | match self.find_dfa_anchored_reverse(text, start) { | |
571 | dfa::Result::Match((s, e)) => { | |
572 | self.captures_nfa_with_match(slots, text, s, e) | |
573 | } | |
574 | dfa::Result::NoMatch(_) => None, | |
575 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
576 | } | |
577 | } | |
578 | MatchType::DfaSuffix => { | |
579 | match self.find_dfa_reverse_suffix(text, start) { | |
580 | dfa::Result::Match((s, e)) => { | |
581 | self.captures_nfa_with_match(slots, text, s, e) | |
582 | } | |
583 | dfa::Result::NoMatch(_) => None, | |
584 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
585 | } | |
586 | } | |
587 | MatchType::Nfa(ty) => { | |
588 | self.captures_nfa_type(ty, slots, text, start) | |
589 | } | |
590 | MatchType::Nothing => None, | |
591 | MatchType::DfaMany => { | |
592 | unreachable!("BUG: RegexSet cannot be used with captures") | |
593 | } | |
594 | } | |
595 | } | |
596 | } | |
597 | ||
598 | impl<'c> ExecNoSync<'c> { | |
599 | /// Finds the leftmost-first match using only literal search. | |
600 | #[inline(always)] // reduces constant overhead | |
601 | fn find_literals( | |
602 | &self, | |
603 | ty: MatchLiteralType, | |
604 | text: &[u8], | |
605 | start: usize, | |
606 | ) -> Option<(usize, usize)> { | |
607 | use self::MatchLiteralType::*; | |
608 | match ty { | |
609 | Unanchored => { | |
610 | let lits = &self.ro.nfa.prefixes; | |
611 | lits.find(&text[start..]) | |
612 | .map(|(s, e)| (start + s, start + e)) | |
613 | } | |
614 | AnchoredStart => { | |
615 | let lits = &self.ro.nfa.prefixes; | |
616 | lits.find_start(&text[start..]) | |
617 | .map(|(s, e)| (start + s, start + e)) | |
618 | } | |
619 | AnchoredEnd => { | |
620 | let lits = &self.ro.suffixes; | |
621 | lits.find_end(&text[start..]) | |
622 | .map(|(s, e)| (start + s, start + e)) | |
623 | } | |
624 | } | |
625 | } | |
626 | ||
627 | /// Finds the leftmost-first match (start and end) using only the DFA. | |
628 | /// | |
629 | /// If the result returned indicates that the DFA quit, then another | |
630 | /// matching engine should be used. | |
631 | #[inline(always)] // reduces constant overhead | |
632 | fn find_dfa_forward( | |
633 | &self, | |
634 | text: &[u8], | |
635 | start: usize, | |
636 | ) -> dfa::Result<(usize, usize)> { | |
637 | use dfa::Result::*; | |
638 | let end = match dfa::Fsm::forward( | |
639 | &self.ro.dfa, | |
ff7c6d11 | 640 | self.cache, |
8bb4bdeb XL |
641 | false, |
642 | text, | |
643 | start, | |
644 | ) { | |
645 | NoMatch(i) => return NoMatch(i), | |
646 | Quit => return Quit, | |
647 | Match(end) if start == end => return Match((start, start)), | |
648 | Match(end) => end, | |
649 | }; | |
650 | // Now run the DFA in reverse to find the start of the match. | |
651 | match dfa::Fsm::reverse( | |
652 | &self.ro.dfa_reverse, | |
ff7c6d11 | 653 | self.cache, |
8bb4bdeb XL |
654 | false, |
655 | &text[start..], | |
656 | end - start, | |
657 | ) { | |
658 | Match(s) => Match((start + s, end)), | |
659 | NoMatch(i) => NoMatch(i), | |
660 | Quit => Quit, | |
661 | } | |
662 | } | |
663 | ||
664 | /// Finds the leftmost-first match (start and end) using only the DFA, | |
665 | /// but assumes the regex is anchored at the end and therefore starts at | |
666 | /// the end of the regex and matches in reverse. | |
667 | /// | |
668 | /// If the result returned indicates that the DFA quit, then another | |
669 | /// matching engine should be used. | |
670 | #[inline(always)] // reduces constant overhead | |
671 | fn find_dfa_anchored_reverse( | |
672 | &self, | |
673 | text: &[u8], | |
674 | start: usize, | |
675 | ) -> dfa::Result<(usize, usize)> { | |
676 | use dfa::Result::*; | |
677 | match dfa::Fsm::reverse( | |
678 | &self.ro.dfa_reverse, | |
ff7c6d11 | 679 | self.cache, |
8bb4bdeb XL |
680 | false, |
681 | &text[start..], | |
682 | text.len() - start, | |
683 | ) { | |
684 | Match(s) => Match((start + s, text.len())), | |
685 | NoMatch(i) => NoMatch(i), | |
686 | Quit => Quit, | |
687 | } | |
688 | } | |
689 | ||
690 | /// Finds the end of the shortest match using only the DFA. | |
691 | #[inline(always)] // reduces constant overhead | |
692 | fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { | |
ff7c6d11 | 693 | dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start) |
8bb4bdeb XL |
694 | } |
695 | ||
696 | /// Finds the end of the shortest match using only the DFA by scanning for | |
697 | /// suffix literals. | |
698 | /// | |
699 | #[inline(always)] // reduces constant overhead | |
700 | fn shortest_dfa_reverse_suffix( | |
701 | &self, | |
702 | text: &[u8], | |
703 | start: usize, | |
704 | ) -> dfa::Result<usize> { | |
705 | match self.exec_dfa_reverse_suffix(text, start) { | |
706 | None => self.shortest_dfa(text, start), | |
707 | Some(r) => r.map(|(_, end)| end), | |
708 | } | |
709 | } | |
710 | ||
711 | /// Finds the end of the shortest match using only the DFA by scanning for | |
712 | /// suffix literals. It also reports the start of the match. | |
713 | /// | |
714 | /// Note that if None is returned, then the optimization gave up to avoid | |
715 | /// worst case quadratic behavior. A forward scanning DFA should be tried | |
716 | /// next. | |
717 | /// | |
718 | /// If a match is returned and the full leftmost-first match is desired, | |
719 | /// then a forward scan starting from the beginning of the match must be | |
720 | /// done. | |
721 | /// | |
722 | /// If the result returned indicates that the DFA quit, then another | |
723 | /// matching engine should be used. | |
724 | #[inline(always)] // reduces constant overhead | |
725 | fn exec_dfa_reverse_suffix( | |
726 | &self, | |
727 | text: &[u8], | |
728 | original_start: usize, | |
729 | ) -> Option<dfa::Result<(usize, usize)>> { | |
730 | use dfa::Result::*; | |
731 | ||
732 | let lcs = self.ro.suffixes.lcs(); | |
733 | debug_assert!(lcs.len() >= 1); | |
734 | let mut start = original_start; | |
735 | let mut end = start; | |
736 | while end <= text.len() { | |
737 | start = end; | |
ff7c6d11 | 738 | end += match lcs.find(&text[end..]) { |
8bb4bdeb XL |
739 | None => return Some(NoMatch(text.len())), |
740 | Some(start) => start + lcs.len(), | |
741 | }; | |
742 | match dfa::Fsm::reverse( | |
743 | &self.ro.dfa_reverse, | |
ff7c6d11 | 744 | self.cache, |
8bb4bdeb XL |
745 | false, |
746 | &text[start..end], | |
747 | end - start, | |
748 | ) { | |
749 | Match(0) | NoMatch(0) => return None, | |
750 | Match(s) => return Some(Match((s + start, end))), | |
751 | NoMatch(_) => continue, | |
752 | Quit => return Some(Quit), | |
753 | }; | |
754 | } | |
755 | Some(NoMatch(text.len())) | |
756 | } | |
757 | ||
758 | /// Finds the leftmost-first match (start and end) using only the DFA | |
759 | /// by scanning for suffix literals. | |
760 | /// | |
761 | /// If the result returned indicates that the DFA quit, then another | |
762 | /// matching engine should be used. | |
763 | #[inline(always)] // reduces constant overhead | |
764 | fn find_dfa_reverse_suffix( | |
765 | &self, | |
766 | text: &[u8], | |
767 | start: usize, | |
768 | ) -> dfa::Result<(usize, usize)> { | |
769 | use dfa::Result::*; | |
770 | ||
771 | let match_start = match self.exec_dfa_reverse_suffix(text, start) { | |
772 | None => return self.find_dfa_forward(text, start), | |
773 | Some(Match((start, _))) => start, | |
774 | Some(r) => return r, | |
775 | }; | |
776 | // At this point, we've found a match. The only way to quit now | |
777 | // without a match is if the DFA gives up (seems unlikely). | |
778 | // | |
779 | // Now run the DFA forwards to find the proper end of the match. | |
780 | // (The suffix literal match can only indicate the earliest | |
781 | // possible end location, which may appear before the end of the | |
782 | // leftmost-first match.) | |
783 | match dfa::Fsm::forward( | |
784 | &self.ro.dfa, | |
ff7c6d11 | 785 | self.cache, |
8bb4bdeb XL |
786 | false, |
787 | text, | |
788 | match_start, | |
789 | ) { | |
790 | NoMatch(_) => panic!("BUG: reverse match implies forward match"), | |
791 | Quit => Quit, | |
792 | Match(e) => Match((match_start, e)), | |
793 | } | |
794 | } | |
795 | ||
796 | /// Executes the NFA engine to return whether there is a match or not. | |
797 | /// | |
798 | /// Ideally, we could use shortest_nfa(...).is_some() and get the same | |
799 | /// performance characteristics, but regex sets don't have captures, which | |
800 | /// shortest_nfa depends on. | |
801 | fn match_nfa( | |
802 | &self, | |
803 | text: &[u8], | |
804 | start: usize, | |
805 | ) -> bool { | |
806 | self.match_nfa_type(MatchNfaType::Auto, text, start) | |
807 | } | |
808 | ||
809 | /// Like match_nfa, but allows specification of the type of NFA engine. | |
810 | fn match_nfa_type( | |
811 | &self, | |
812 | ty: MatchNfaType, | |
813 | text: &[u8], | |
814 | start: usize, | |
815 | ) -> bool { | |
816 | self.exec_nfa(ty, &mut [false], &mut [], true, text, start) | |
817 | } | |
818 | ||
819 | /// Finds the shortest match using an NFA. | |
820 | fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { | |
821 | self.shortest_nfa_type(MatchNfaType::Auto, text, start) | |
822 | } | |
823 | ||
824 | /// Like shortest_nfa, but allows specification of the type of NFA engine. | |
825 | fn shortest_nfa_type( | |
826 | &self, | |
827 | ty: MatchNfaType, | |
828 | text: &[u8], | |
829 | start: usize, | |
830 | ) -> Option<usize> { | |
831 | let mut slots = [None, None]; | |
832 | if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) { | |
833 | slots[1] | |
834 | } else { | |
835 | None | |
836 | } | |
837 | } | |
838 | ||
839 | /// Like find, but executes an NFA engine. | |
840 | fn find_nfa( | |
841 | &self, | |
842 | ty: MatchNfaType, | |
843 | text: &[u8], | |
844 | start: usize, | |
845 | ) -> Option<(usize, usize)> { | |
846 | let mut slots = [None, None]; | |
847 | if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) { | |
848 | match (slots[0], slots[1]) { | |
849 | (Some(s), Some(e)) => Some((s, e)), | |
850 | _ => None, | |
851 | } | |
852 | } else { | |
853 | None | |
854 | } | |
855 | } | |
856 | ||
857 | /// Like find_nfa, but fills in captures and restricts the search space | |
858 | /// using previously found match information. | |
859 | /// | |
860 | /// `slots` should have length equal to `2 * nfa.captures.len()`. | |
861 | fn captures_nfa_with_match( | |
862 | &self, | |
863 | slots: &mut [Slot], | |
864 | text: &[u8], | |
865 | match_start: usize, | |
866 | match_end: usize, | |
867 | ) -> Option<(usize, usize)> { | |
041b39d2 XL |
868 | // We can't use match_end directly, because we may need to examine one |
869 | // "character" after the end of a match for lookahead operators. We | |
870 | // need to move two characters beyond the end, since some look-around | |
871 | // operations may falsely assume a premature end of text otherwise. | |
872 | let e = cmp::min( | |
873 | next_utf8(text, next_utf8(text, match_end)), text.len()); | |
8bb4bdeb XL |
874 | self.captures_nfa(slots, &text[..e], match_start) |
875 | } | |
876 | ||
877 | /// Like find_nfa, but fills in captures. | |
878 | /// | |
879 | /// `slots` should have length equal to `2 * nfa.captures.len()`. | |
880 | fn captures_nfa( | |
881 | &self, | |
882 | slots: &mut [Slot], | |
883 | text: &[u8], | |
884 | start: usize, | |
885 | ) -> Option<(usize, usize)> { | |
886 | self.captures_nfa_type(MatchNfaType::Auto, slots, text, start) | |
887 | } | |
888 | ||
889 | /// Like captures_nfa, but allows specification of type of NFA engine. | |
890 | fn captures_nfa_type( | |
891 | &self, | |
892 | ty: MatchNfaType, | |
893 | slots: &mut [Slot], | |
894 | text: &[u8], | |
895 | start: usize, | |
896 | ) -> Option<(usize, usize)> { | |
897 | if self.exec_nfa(ty, &mut [false], slots, false, text, start) { | |
898 | match (slots[0], slots[1]) { | |
899 | (Some(s), Some(e)) => Some((s, e)), | |
900 | _ => None, | |
901 | } | |
902 | } else { | |
903 | None | |
904 | } | |
905 | } | |
906 | ||
907 | fn exec_nfa( | |
908 | &self, | |
909 | mut ty: MatchNfaType, | |
910 | matches: &mut [bool], | |
911 | slots: &mut [Slot], | |
912 | quit_after_match: bool, | |
913 | text: &[u8], | |
914 | start: usize, | |
915 | ) -> bool { | |
916 | use self::MatchNfaType::*; | |
917 | if let Auto = ty { | |
918 | if backtrack::should_exec(self.ro.nfa.len(), text.len()) { | |
919 | ty = Backtrack; | |
920 | } else { | |
921 | ty = PikeVM; | |
922 | } | |
923 | } | |
924 | match ty { | |
925 | Auto => unreachable!(), | |
926 | Backtrack => self.exec_backtrack(matches, slots, text, start), | |
927 | PikeVM => { | |
928 | self.exec_pikevm( | |
929 | matches, slots, quit_after_match, text, start) | |
930 | } | |
931 | } | |
932 | } | |
933 | ||
934 | /// Always run the NFA algorithm. | |
935 | fn exec_pikevm( | |
936 | &self, | |
937 | matches: &mut [bool], | |
938 | slots: &mut [Slot], | |
939 | quit_after_match: bool, | |
940 | text: &[u8], | |
941 | start: usize, | |
942 | ) -> bool { | |
943 | if self.ro.nfa.uses_bytes() { | |
944 | pikevm::Fsm::exec( | |
945 | &self.ro.nfa, | |
ff7c6d11 | 946 | self.cache, |
8bb4bdeb XL |
947 | matches, |
948 | slots, | |
949 | quit_after_match, | |
950 | ByteInput::new(text, self.ro.nfa.only_utf8), | |
951 | start) | |
952 | } else { | |
953 | pikevm::Fsm::exec( | |
954 | &self.ro.nfa, | |
ff7c6d11 | 955 | self.cache, |
8bb4bdeb XL |
956 | matches, |
957 | slots, | |
958 | quit_after_match, | |
959 | CharInput::new(text), | |
960 | start) | |
961 | } | |
962 | } | |
963 | ||
964 | /// Always runs the NFA using bounded backtracking. | |
965 | fn exec_backtrack( | |
966 | &self, | |
967 | matches: &mut [bool], | |
968 | slots: &mut [Slot], | |
969 | text: &[u8], | |
970 | start: usize, | |
971 | ) -> bool { | |
972 | if self.ro.nfa.uses_bytes() { | |
973 | backtrack::Bounded::exec( | |
974 | &self.ro.nfa, | |
ff7c6d11 | 975 | self.cache, |
8bb4bdeb XL |
976 | matches, |
977 | slots, | |
978 | ByteInput::new(text, self.ro.nfa.only_utf8), | |
979 | start) | |
980 | } else { | |
981 | backtrack::Bounded::exec( | |
982 | &self.ro.nfa, | |
ff7c6d11 | 983 | self.cache, |
8bb4bdeb XL |
984 | matches, |
985 | slots, | |
986 | CharInput::new(text), | |
987 | start) | |
988 | } | |
989 | } | |
990 | ||
991 | /// Finds which regular expressions match the given text. | |
992 | /// | |
993 | /// `matches` should have length equal to the number of regexes being | |
994 | /// searched. | |
995 | /// | |
996 | /// This is only useful when one wants to know which regexes in a set | |
997 | /// match some text. | |
998 | pub fn many_matches_at( | |
999 | &self, | |
1000 | matches: &mut [bool], | |
1001 | text: &[u8], | |
1002 | start: usize, | |
1003 | ) -> bool { | |
1004 | use self::MatchType::*; | |
1005 | if !self.is_anchor_end_match(text) { | |
1006 | return false; | |
1007 | } | |
1008 | match self.ro.match_type { | |
1009 | Literal(ty) => { | |
ff7c6d11 | 1010 | debug_assert_eq!(matches.len(), 1); |
8bb4bdeb XL |
1011 | matches[0] = self.find_literals(ty, text, start).is_some(); |
1012 | matches[0] | |
1013 | } | |
1014 | Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { | |
1015 | match dfa::Fsm::forward_many( | |
1016 | &self.ro.dfa, | |
ff7c6d11 | 1017 | self.cache, |
8bb4bdeb XL |
1018 | matches, |
1019 | text, | |
1020 | start, | |
1021 | ) { | |
1022 | dfa::Result::Match(_) => true, | |
1023 | dfa::Result::NoMatch(_) => false, | |
1024 | dfa::Result::Quit => { | |
1025 | self.exec_nfa( | |
1026 | MatchNfaType::Auto, | |
1027 | matches, | |
1028 | &mut [], | |
1029 | false, | |
1030 | text, | |
1031 | start) | |
1032 | } | |
1033 | } | |
1034 | } | |
1035 | Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), | |
1036 | Nothing => false, | |
1037 | } | |
1038 | } | |
1039 | ||
1040 | #[inline(always)] // reduces constant overhead | |
1041 | fn is_anchor_end_match(&self, text: &[u8]) -> bool { | |
1042 | // Only do this check if the haystack is big (>1MB). | |
1043 | if text.len() > (1<<20) && self.ro.nfa.is_anchored_end { | |
1044 | let lcs = self.ro.suffixes.lcs(); | |
1045 | if lcs.len() >= 1 && !lcs.is_suffix(text) { | |
1046 | return false; | |
1047 | } | |
1048 | } | |
1049 | true | |
1050 | } | |
1051 | ||
1052 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1053 | &self.ro.nfa.capture_name_idx | |
1054 | } | |
1055 | } | |
1056 | ||
1057 | impl<'c> ExecNoSyncStr<'c> { | |
1058 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1059 | self.0.capture_name_idx() | |
1060 | } | |
1061 | } | |
1062 | ||
1063 | impl Exec { | |
1064 | /// Get a searcher that isn't Sync. | |
1065 | #[inline(always)] // reduces constant overhead | |
1066 | pub fn searcher(&self) -> ExecNoSync { | |
1067 | let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro))); | |
1068 | ExecNoSync { | |
1069 | ro: &self.ro, // a clone is too expensive here! (and not needed) | |
1070 | cache: self.cache.get_or(create), | |
1071 | } | |
1072 | } | |
1073 | ||
1074 | /// Get a searcher that isn't Sync and can match on &str. | |
1075 | #[inline(always)] // reduces constant overhead | |
1076 | pub fn searcher_str(&self) -> ExecNoSyncStr { | |
1077 | ExecNoSyncStr(self.searcher()) | |
1078 | } | |
1079 | ||
1080 | /// Build a Regex from this executor. | |
1081 | pub fn into_regex(self) -> re_unicode::Regex { | |
1082 | re_unicode::Regex::from(self) | |
1083 | } | |
1084 | ||
1085 | /// Build a RegexSet from this executor. | |
1086 | pub fn into_regex_set(self) -> re_set::unicode::RegexSet { | |
1087 | re_set::unicode::RegexSet::from(self) | |
1088 | } | |
1089 | ||
1090 | /// Build a Regex from this executor that can match arbitrary bytes. | |
1091 | pub fn into_byte_regex(self) -> re_bytes::Regex { | |
1092 | re_bytes::Regex::from(self) | |
1093 | } | |
1094 | ||
1095 | /// Build a RegexSet from this executor that can match arbitrary bytes. | |
1096 | pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { | |
1097 | re_set::bytes::RegexSet::from(self) | |
1098 | } | |
1099 | ||
1100 | /// The original regular expressions given by the caller that were | |
1101 | /// compiled. | |
1102 | pub fn regex_strings(&self) -> &[String] { | |
1103 | &self.ro.res | |
1104 | } | |
1105 | ||
1106 | /// Return a slice of capture names. | |
1107 | /// | |
1108 | /// Any capture that isn't named is None. | |
1109 | pub fn capture_names(&self) -> &[Option<String>] { | |
1110 | &self.ro.nfa.captures | |
1111 | } | |
1112 | ||
1113 | /// Return a reference to named groups mapping (from group name to | |
1114 | /// group position). | |
1115 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1116 | &self.ro.nfa.capture_name_idx | |
1117 | } | |
1118 | } | |
1119 | ||
1120 | impl Clone for Exec { | |
1121 | fn clone(&self) -> Exec { | |
1122 | Exec { | |
1123 | ro: self.ro.clone(), | |
1124 | cache: CachedThreadLocal::new(), | |
1125 | } | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | impl ExecReadOnly { | |
1130 | fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { | |
1131 | use self::MatchType::*; | |
1132 | if let Some(Nfa(_)) = hint { | |
1133 | return hint.unwrap(); | |
1134 | } | |
1135 | // If the NFA is empty, then we'll never match anything. | |
1136 | if self.nfa.insts.is_empty() { | |
1137 | return Nothing; | |
1138 | } | |
1139 | // If our set of prefixes is complete, then we can use it to find | |
1140 | // a match in lieu of a regex engine. This doesn't quite work well in | |
1141 | // the presence of multiple regexes, so only do it when there's one. | |
1142 | // | |
1143 | // TODO(burntsushi): Also, don't try to match literals if the regex is | |
1144 | // partially anchored. We could technically do it, but we'd need to | |
1145 | // create two sets of literals: all of them and then the subset that | |
1146 | // aren't anchored. We would then only search for all of them when at | |
1147 | // the beginning of the input and use the subset in all other cases. | |
1148 | if self.res.len() == 1 { | |
1149 | if self.nfa.prefixes.complete() { | |
1150 | return if self.nfa.is_anchored_start { | |
1151 | Literal(MatchLiteralType::AnchoredStart) | |
1152 | } else { | |
1153 | Literal(MatchLiteralType::Unanchored) | |
1154 | }; | |
1155 | } | |
1156 | if self.suffixes.complete() { | |
1157 | return if self.nfa.is_anchored_end { | |
1158 | Literal(MatchLiteralType::AnchoredEnd) | |
1159 | } else { | |
1160 | // This case shouldn't happen. When the regex isn't | |
1161 | // anchored, then complete prefixes should imply complete | |
1162 | // suffixes. | |
1163 | Literal(MatchLiteralType::Unanchored) | |
1164 | }; | |
1165 | } | |
1166 | } | |
1167 | // If we can execute the DFA, then we totally should. | |
1168 | if dfa::can_exec(&self.dfa) { | |
1169 | // Regex sets require a slightly specialized path. | |
1170 | if self.res.len() >= 2 { | |
1171 | return DfaMany; | |
1172 | } | |
1173 | // If the regex is anchored at the end but not the start, then | |
1174 | // just match in reverse from the end of the haystack. | |
1175 | if !self.nfa.is_anchored_start && self.nfa.is_anchored_end { | |
1176 | return DfaAnchoredReverse; | |
1177 | } | |
1178 | // If there's a longish suffix literal, then it might be faster | |
1179 | // to look for that first. | |
1180 | if self.should_suffix_scan() { | |
1181 | return DfaSuffix; | |
1182 | } | |
1183 | // Fall back to your garden variety forward searching lazy DFA. | |
1184 | return Dfa; | |
1185 | } | |
1186 | // We're so totally hosed. | |
1187 | Nfa(MatchNfaType::Auto) | |
1188 | } | |
1189 | ||
1190 | /// Returns true if the program is amenable to suffix scanning. | |
1191 | /// | |
1192 | /// When this is true, as a heuristic, we assume it is OK to quickly scan | |
1193 | /// for suffix literals and then do a *reverse* DFA match from any matches | |
1194 | /// produced by the literal scan. (And then followed by a forward DFA | |
1195 | /// search, since the previously found suffix literal maybe not actually be | |
1196 | /// the end of a match.) | |
1197 | /// | |
1198 | /// This is a bit of a specialized optimization, but can result in pretty | |
1199 | /// big performance wins if 1) there are no prefix literals and 2) the | |
1200 | /// suffix literals are pretty rare in the text. (1) is obviously easy to | |
1201 | /// account for but (2) is harder. As a proxy, we assume that longer | |
1202 | /// strings are generally rarer, so we only enable this optimization when | |
1203 | /// we have a meaty suffix. | |
1204 | fn should_suffix_scan(&self) -> bool { | |
1205 | if self.suffixes.is_empty() { | |
1206 | return false; | |
1207 | } | |
1208 | let lcs_len = self.suffixes.lcs().char_len(); | |
1209 | lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() | |
1210 | } | |
1211 | } | |
1212 | ||
1213 | #[derive(Clone, Copy, Debug)] | |
1214 | enum MatchType { | |
1215 | /// A single or multiple literal search. This is only used when the regex | |
1216 | /// can be decomposed into unambiguous literal search. | |
1217 | Literal(MatchLiteralType), | |
1218 | /// A normal DFA search. | |
1219 | Dfa, | |
1220 | /// A reverse DFA search starting from the end of a haystack. | |
1221 | DfaAnchoredReverse, | |
1222 | /// A reverse DFA search with suffix literal scanning. | |
1223 | DfaSuffix, | |
1224 | /// Use the DFA on two or more regular expressions. | |
1225 | DfaMany, | |
1226 | /// An NFA variant. | |
1227 | Nfa(MatchNfaType), | |
1228 | /// No match is ever possible, so don't ever try to search. | |
1229 | Nothing, | |
1230 | } | |
1231 | ||
1232 | #[derive(Clone, Copy, Debug)] | |
1233 | enum MatchLiteralType { | |
1234 | /// Match literals anywhere in text. | |
1235 | Unanchored, | |
1236 | /// Match literals only at the start of text. | |
1237 | AnchoredStart, | |
1238 | /// Match literals only at the end of text. | |
1239 | AnchoredEnd, | |
1240 | } | |
1241 | ||
1242 | #[derive(Clone, Copy, Debug)] | |
1243 | enum MatchNfaType { | |
1244 | /// Choose between Backtrack and PikeVM. | |
1245 | Auto, | |
1246 | /// NFA bounded backtracking. | |
1247 | /// | |
1248 | /// (This is only set by tests, since it never makes sense to always want | |
1249 | /// backtracking.) | |
1250 | Backtrack, | |
1251 | /// The Pike VM. | |
1252 | /// | |
1253 | /// (This is only set by tests, since it never makes sense to always want | |
1254 | /// the Pike VM.) | |
1255 | PikeVM, | |
1256 | } | |
1257 | ||
ff7c6d11 | 1258 | /// `ProgramCache` maintains reusable allocations for each matching engine |
8bb4bdeb XL |
1259 | /// available to a particular program. |
1260 | pub type ProgramCache = RefCell<ProgramCacheInner>; | |
1261 | ||
1262 | #[derive(Clone, Debug)] | |
1263 | pub struct ProgramCacheInner { | |
1264 | pub pikevm: pikevm::Cache, | |
1265 | pub backtrack: backtrack::Cache, | |
1266 | pub dfa: dfa::Cache, | |
1267 | pub dfa_reverse: dfa::Cache, | |
1268 | } | |
1269 | ||
1270 | impl ProgramCacheInner { | |
1271 | fn new(ro: &ExecReadOnly) -> Self { | |
1272 | ProgramCacheInner { | |
1273 | pikevm: pikevm::Cache::new(&ro.nfa), | |
1274 | backtrack: backtrack::Cache::new(&ro.nfa), | |
1275 | dfa: dfa::Cache::new(&ro.dfa), | |
1276 | dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), | |
1277 | } | |
1278 | } | |
1279 | } |