]>
Commit | Line | Data |
---|---|---|
94b46f34 XL |
1 | // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use std::cell::RefCell; | |
12 | use std::collections::HashMap; | |
13 | use std::cmp; | |
14 | use std::sync::Arc; | |
15 | ||
16 | use thread_local::CachedThreadLocal; | |
17 | use syntax::ParserBuilder; | |
18 | use syntax::hir::Hir; | |
19 | use syntax::hir::literal::Literals; | |
20 | ||
21 | use backtrack; | |
22 | use compile::Compiler; | |
23 | use dfa; | |
24 | use error::Error; | |
25 | use input::{ByteInput, CharInput}; | |
26 | use literal::LiteralSearcher; | |
27 | use pikevm; | |
28 | use prog::Program; | |
29 | use re_builder::RegexOptions; | |
30 | use re_bytes; | |
31 | use re_set; | |
32 | use re_trait::{RegularExpression, Slot, Locations, as_slots}; | |
33 | use re_unicode; | |
34 | use utf8::next_utf8; | |
35 | ||
36 | /// `Exec` manages the execution of a regular expression. | |
37 | /// | |
38 | /// In particular, this manages the various compiled forms of a single regular | |
39 | /// expression and the choice of which matching engine to use to execute a | |
40 | /// regular expression. | |
41 | pub struct Exec { | |
42 | /// All read only state. | |
43 | ro: Arc<ExecReadOnly>, | |
44 | /// Caches for the various matching engines. | |
45 | cache: CachedThreadLocal<ProgramCache>, | |
46 | } | |
47 | ||
48 | /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This | |
49 | /// means it is no longer Sync, but we can now avoid the overhead of | |
50 | /// synchronization to fetch the cache. | |
51 | #[derive(Debug)] | |
52 | pub struct ExecNoSync<'c> { | |
53 | /// All read only state. | |
54 | ro: &'c Arc<ExecReadOnly>, | |
55 | /// Caches for the various matching engines. | |
56 | cache: &'c ProgramCache, | |
57 | } | |
58 | ||
59 | /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. | |
60 | pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); | |
61 | ||
62 | /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such | |
63 | /// state is determined at compile time and never changes during search. | |
64 | #[derive(Debug)] | |
65 | struct ExecReadOnly { | |
66 | /// The original regular expressions given by the caller to compile. | |
67 | res: Vec<String>, | |
68 | /// A compiled program that is used in the NFA simulation and backtracking. | |
69 | /// It can be byte-based or Unicode codepoint based. | |
70 | /// | |
71 | /// N.B. It is not possibly to make this byte-based from the public API. | |
72 | /// It is only used for testing byte based programs in the NFA simulations. | |
73 | nfa: Program, | |
74 | /// A compiled byte based program for DFA execution. This is only used | |
75 | /// if a DFA can be executed. (Currently, only word boundary assertions are | |
76 | /// not supported.) Note that this program contains an embedded `.*?` | |
77 | /// preceding the first capture group, unless the regex is anchored at the | |
78 | /// beginning. | |
79 | dfa: Program, | |
80 | /// The same as above, except the program is reversed (and there is no | |
81 | /// preceding `.*?`). This is used by the DFA to find the starting location | |
82 | /// of matches. | |
83 | dfa_reverse: Program, | |
84 | /// A set of suffix literals extracted from the regex. | |
85 | /// | |
86 | /// Prefix literals are stored on the `Program`, since they are used inside | |
87 | /// the matching engines. | |
88 | suffixes: LiteralSearcher, | |
89 | /// match_type encodes as much upfront knowledge about how we're going to | |
90 | /// execute a search as possible. | |
91 | match_type: MatchType, | |
92 | } | |
93 | ||
94 | /// Facilitates the construction of an executor by exposing various knobs | |
95 | /// to control how a regex is executed and what kinds of resources it's | |
96 | /// permitted to use. | |
97 | pub struct ExecBuilder { | |
98 | options: RegexOptions, | |
99 | match_type: Option<MatchType>, | |
100 | bytes: bool, | |
101 | only_utf8: bool, | |
102 | } | |
103 | ||
104 | /// Parsed represents a set of parsed regular expressions and their detected | |
105 | /// literals. | |
106 | struct Parsed { | |
107 | exprs: Vec<Hir>, | |
108 | prefixes: Literals, | |
109 | suffixes: Literals, | |
110 | bytes: bool, | |
111 | } | |
112 | ||
113 | impl ExecBuilder { | |
114 | /// Create a regex execution builder. | |
115 | /// | |
116 | /// This uses default settings for everything except the regex itself, | |
117 | /// which must be provided. Further knobs can be set by calling methods, | |
118 | /// and then finally, `build` to actually create the executor. | |
119 | pub fn new(re: &str) -> Self { | |
120 | Self::new_many(&[re]) | |
121 | } | |
122 | ||
123 | /// Like new, but compiles the union of the given regular expressions. | |
124 | /// | |
125 | /// Note that when compiling 2 or more regular expressions, capture groups | |
126 | /// are completely unsupported. (This means both `find` and `captures` | |
127 | /// wont work.) | |
128 | pub fn new_many<I, S>(res: I) -> Self | |
129 | where S: AsRef<str>, I: IntoIterator<Item=S> { | |
130 | let mut opts = RegexOptions::default(); | |
131 | opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); | |
132 | Self::new_options(opts) | |
133 | } | |
134 | ||
135 | /// Create a regex execution builder. | |
136 | pub fn new_options(opts: RegexOptions) -> Self { | |
137 | ExecBuilder { | |
138 | options: opts, | |
139 | match_type: None, | |
140 | bytes: false, | |
141 | only_utf8: true, | |
142 | } | |
143 | } | |
144 | ||
145 | /// Set the matching engine to be automatically determined. | |
146 | /// | |
147 | /// This is the default state and will apply whatever optimizations are | |
148 | /// possible, such as running a DFA. | |
149 | /// | |
150 | /// This overrides whatever was previously set via the `nfa` or | |
151 | /// `bounded_backtracking` methods. | |
152 | pub fn automatic(mut self) -> Self { | |
153 | self.match_type = None; | |
154 | self | |
155 | } | |
156 | ||
157 | /// Sets the matching engine to use the NFA algorithm no matter what | |
158 | /// optimizations are possible. | |
159 | /// | |
160 | /// This overrides whatever was previously set via the `automatic` or | |
161 | /// `bounded_backtracking` methods. | |
162 | pub fn nfa(mut self) -> Self { | |
163 | self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); | |
164 | self | |
165 | } | |
166 | ||
167 | /// Sets the matching engine to use a bounded backtracking engine no | |
168 | /// matter what optimizations are possible. | |
169 | /// | |
170 | /// One must use this with care, since the bounded backtracking engine | |
171 | /// uses memory proportion to `len(regex) * len(text)`. | |
172 | /// | |
173 | /// This overrides whatever was previously set via the `automatic` or | |
174 | /// `nfa` methods. | |
175 | pub fn bounded_backtracking(mut self) -> Self { | |
176 | self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); | |
177 | self | |
178 | } | |
179 | ||
180 | /// Compiles byte based programs for use with the NFA matching engines. | |
181 | /// | |
182 | /// By default, the NFA engines match on Unicode scalar values. They can | |
183 | /// be made to use byte based programs instead. In general, the byte based | |
184 | /// programs are slower because of a less efficient encoding of character | |
185 | /// classes. | |
186 | /// | |
187 | /// Note that this does not impact DFA matching engines, which always | |
188 | /// execute on bytes. | |
189 | pub fn bytes(mut self, yes: bool) -> Self { | |
190 | self.bytes = yes; | |
191 | self | |
192 | } | |
193 | ||
194 | /// When disabled, the program compiled may match arbitrary bytes. | |
195 | /// | |
196 | /// When enabled (the default), all compiled programs exclusively match | |
197 | /// valid UTF-8 bytes. | |
198 | pub fn only_utf8(mut self, yes: bool) -> Self { | |
199 | self.only_utf8 = yes; | |
200 | self | |
201 | } | |
202 | ||
203 | /// Set the Unicode flag. | |
204 | pub fn unicode(mut self, yes: bool) -> Self { | |
205 | self.options.unicode = yes; | |
206 | self | |
207 | } | |
208 | ||
209 | /// Parse the current set of patterns into their AST and extract literals. | |
210 | fn parse(&self) -> Result<Parsed, Error> { | |
211 | let mut exprs = Vec::with_capacity(self.options.pats.len()); | |
212 | let mut prefixes = Some(Literals::empty()); | |
213 | let mut suffixes = Some(Literals::empty()); | |
214 | let mut bytes = false; | |
215 | let is_set = self.options.pats.len() > 1; | |
216 | // If we're compiling a regex set and that set has any anchored | |
217 | // expressions, then disable all literal optimizations. | |
218 | for pat in &self.options.pats { | |
219 | let mut parser = | |
220 | ParserBuilder::new() | |
221 | // TODO(burntsushi): Disable octal in regex 1.0. Nobody | |
222 | // uses it, and we'll get better error messages when | |
223 | // someone tries to use a backreference. Provide a new | |
224 | // opt-in toggle for it though. | |
225 | .octal(true) | |
226 | .case_insensitive(self.options.case_insensitive) | |
227 | .multi_line(self.options.multi_line) | |
228 | .dot_matches_new_line(self.options.dot_matches_new_line) | |
229 | .swap_greed(self.options.swap_greed) | |
230 | .ignore_whitespace(self.options.ignore_whitespace) | |
231 | .unicode(self.options.unicode) | |
232 | .allow_invalid_utf8(!self.only_utf8) | |
233 | .nest_limit(self.options.nest_limit) | |
234 | .build(); | |
235 | let expr = try!(parser.parse(pat)); | |
236 | bytes = bytes || !expr.is_always_utf8(); | |
237 | ||
238 | if !expr.is_anchored_start() && expr.is_any_anchored_start() { | |
239 | // Partial anchors unfortunately make it hard to use prefixes, | |
240 | // so disable them. | |
241 | prefixes = None; | |
242 | } else if is_set && expr.is_anchored_start() { | |
243 | // Regex sets with anchors do not go well with literal | |
244 | // optimizations. | |
245 | prefixes = None; | |
246 | } | |
247 | prefixes = prefixes.and_then(|mut prefixes| { | |
248 | if !prefixes.union_prefixes(&expr) { | |
249 | None | |
250 | } else { | |
251 | Some(prefixes) | |
252 | } | |
253 | }); | |
254 | ||
255 | if !expr.is_anchored_end() && expr.is_any_anchored_end() { | |
256 | // Partial anchors unfortunately make it hard to use suffixes, | |
257 | // so disable them. | |
258 | suffixes = None; | |
259 | } else if is_set && expr.is_anchored_end() { | |
260 | // Regex sets with anchors do not go well with literal | |
261 | // optimizations. | |
262 | suffixes = None; | |
263 | } | |
264 | suffixes = suffixes.and_then(|mut suffixes| { | |
265 | if !suffixes.union_suffixes(&expr) { | |
266 | None | |
267 | } else { | |
268 | Some(suffixes) | |
269 | } | |
270 | }); | |
271 | exprs.push(expr); | |
272 | } | |
273 | Ok(Parsed { | |
274 | exprs: exprs, | |
275 | prefixes: prefixes.unwrap_or_else(Literals::empty), | |
276 | suffixes: suffixes.unwrap_or_else(Literals::empty), | |
277 | bytes: bytes, | |
278 | }) | |
279 | } | |
280 | ||
281 | /// Build an executor that can run a regular expression. | |
282 | pub fn build(self) -> Result<Exec, Error> { | |
283 | // Special case when we have no patterns to compile. | |
284 | // This can happen when compiling a regex set. | |
285 | if self.options.pats.is_empty() { | |
286 | let ro = Arc::new(ExecReadOnly { | |
287 | res: vec![], | |
288 | nfa: Program::new(), | |
289 | dfa: Program::new(), | |
290 | dfa_reverse: Program::new(), | |
291 | suffixes: LiteralSearcher::empty(), | |
292 | match_type: MatchType::Nothing, | |
293 | }); | |
294 | return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }); | |
295 | } | |
296 | let parsed = try!(self.parse()); | |
297 | let mut nfa = try!( | |
298 | Compiler::new() | |
299 | .size_limit(self.options.size_limit) | |
300 | .bytes(self.bytes || parsed.bytes) | |
301 | .only_utf8(self.only_utf8) | |
302 | .compile(&parsed.exprs)); | |
303 | let mut dfa = try!( | |
304 | Compiler::new() | |
305 | .size_limit(self.options.size_limit) | |
306 | .dfa(true) | |
307 | .only_utf8(self.only_utf8) | |
308 | .compile(&parsed.exprs)); | |
309 | let mut dfa_reverse = try!( | |
310 | Compiler::new() | |
311 | .size_limit(self.options.size_limit) | |
312 | .dfa(true) | |
313 | .only_utf8(self.only_utf8) | |
314 | .reverse(true) | |
315 | .compile(&parsed.exprs)); | |
316 | ||
317 | let prefixes = parsed.prefixes.unambiguous_prefixes(); | |
318 | let suffixes = parsed.suffixes.unambiguous_suffixes(); | |
319 | nfa.prefixes = LiteralSearcher::prefixes(prefixes); | |
320 | dfa.prefixes = nfa.prefixes.clone(); | |
321 | dfa.dfa_size_limit = self.options.dfa_size_limit; | |
322 | dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; | |
323 | ||
324 | let mut ro = ExecReadOnly { | |
325 | res: self.options.pats, | |
326 | nfa: nfa, | |
327 | dfa: dfa, | |
328 | dfa_reverse: dfa_reverse, | |
329 | suffixes: LiteralSearcher::suffixes(suffixes), | |
330 | match_type: MatchType::Nothing, | |
331 | }; | |
332 | ro.match_type = ro.choose_match_type(self.match_type); | |
333 | ||
334 | let ro = Arc::new(ro); | |
335 | Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }) | |
336 | } | |
337 | } | |
338 | ||
339 | impl<'c> RegularExpression for ExecNoSyncStr<'c> { | |
340 | type Text = str; | |
341 | ||
342 | fn slots_len(&self) -> usize { self.0.slots_len() } | |
343 | ||
344 | fn next_after_empty(&self, text: &str, i: usize) -> usize { | |
345 | next_utf8(text.as_bytes(), i) | |
346 | } | |
347 | ||
348 | #[inline(always)] // reduces constant overhead | |
349 | fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { | |
350 | self.0.shortest_match_at(text.as_bytes(), start) | |
351 | } | |
352 | ||
353 | #[inline(always)] // reduces constant overhead | |
354 | fn is_match_at(&self, text: &str, start: usize) -> bool { | |
355 | self.0.is_match_at(text.as_bytes(), start) | |
356 | } | |
357 | ||
358 | #[inline(always)] // reduces constant overhead | |
359 | fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { | |
360 | self.0.find_at(text.as_bytes(), start) | |
361 | } | |
362 | ||
363 | #[inline(always)] // reduces constant overhead | |
364 | fn read_captures_at( | |
365 | &self, | |
366 | locs: &mut Locations, | |
367 | text: &str, | |
368 | start: usize, | |
369 | ) -> Option<(usize, usize)> { | |
370 | self.0.read_captures_at(locs, text.as_bytes(), start) | |
371 | } | |
372 | } | |
373 | ||
374 | impl<'c> RegularExpression for ExecNoSync<'c> { | |
375 | type Text = [u8]; | |
376 | ||
377 | /// Returns the number of capture slots in the regular expression. (There | |
378 | /// are two slots for every capture group, corresponding to possibly empty | |
379 | /// start and end locations of the capture.) | |
380 | fn slots_len(&self) -> usize { | |
381 | self.ro.nfa.captures.len() * 2 | |
382 | } | |
383 | ||
384 | fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { | |
385 | i + 1 | |
386 | } | |
387 | ||
388 | /// Returns the end of a match location, possibly occurring before the | |
389 | /// end location of the correct leftmost-first match. | |
390 | #[inline(always)] // reduces constant overhead | |
391 | fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { | |
392 | if !self.is_anchor_end_match(text) { | |
393 | return None; | |
394 | } | |
395 | match self.ro.match_type { | |
396 | MatchType::Literal(ty) => { | |
397 | self.find_literals(ty, text, start).map(|(_, e)| e) | |
398 | } | |
399 | MatchType::Dfa | MatchType::DfaMany => { | |
400 | match self.shortest_dfa(text, start) { | |
401 | dfa::Result::Match(end) => Some(end), | |
402 | dfa::Result::NoMatch(_) => None, | |
403 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
404 | } | |
405 | } | |
406 | MatchType::DfaAnchoredReverse => { | |
407 | match dfa::Fsm::reverse( | |
408 | &self.ro.dfa_reverse, | |
409 | self.cache, | |
410 | true, | |
411 | &text[start..], | |
412 | text.len(), | |
413 | ) { | |
414 | dfa::Result::Match(_) => Some(text.len()), | |
415 | dfa::Result::NoMatch(_) => None, | |
416 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
417 | } | |
418 | } | |
419 | MatchType::DfaSuffix => { | |
420 | match self.shortest_dfa_reverse_suffix(text, start) { | |
421 | dfa::Result::Match(e) => Some(e), | |
422 | dfa::Result::NoMatch(_) => None, | |
423 | dfa::Result::Quit => self.shortest_nfa(text, start), | |
424 | } | |
425 | } | |
426 | MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), | |
427 | MatchType::Nothing => None, | |
428 | } | |
429 | } | |
430 | ||
431 | /// Returns true if and only if the regex matches text. | |
432 | /// | |
433 | /// For single regular expressions, this is equivalent to calling | |
434 | /// shortest_match(...).is_some(). | |
435 | #[inline(always)] // reduces constant overhead | |
436 | fn is_match_at(&self, text: &[u8], start: usize) -> bool { | |
437 | if !self.is_anchor_end_match(text) { | |
438 | return false; | |
439 | } | |
440 | // We need to do this dance because shortest_match relies on the NFA | |
441 | // filling in captures[1], but a RegexSet has no captures. In other | |
442 | // words, a RegexSet can't (currently) use shortest_match. ---AG | |
443 | match self.ro.match_type { | |
444 | MatchType::Literal(ty) => { | |
445 | self.find_literals(ty, text, start).is_some() | |
446 | } | |
447 | MatchType::Dfa | MatchType::DfaMany => { | |
448 | match self.shortest_dfa(text, start) { | |
449 | dfa::Result::Match(_) => true, | |
450 | dfa::Result::NoMatch(_) => false, | |
451 | dfa::Result::Quit => self.match_nfa(text, start), | |
452 | } | |
453 | } | |
454 | MatchType::DfaAnchoredReverse => { | |
455 | match dfa::Fsm::reverse( | |
456 | &self.ro.dfa_reverse, | |
457 | self.cache, | |
458 | true, | |
459 | &text[start..], | |
460 | text.len(), | |
461 | ) { | |
462 | dfa::Result::Match(_) => true, | |
463 | dfa::Result::NoMatch(_) => false, | |
464 | dfa::Result::Quit => self.match_nfa(text, start), | |
465 | } | |
466 | } | |
467 | MatchType::DfaSuffix => { | |
468 | match self.shortest_dfa_reverse_suffix(text, start) { | |
469 | dfa::Result::Match(_) => true, | |
470 | dfa::Result::NoMatch(_) => false, | |
471 | dfa::Result::Quit => self.match_nfa(text, start), | |
472 | } | |
473 | } | |
474 | MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), | |
475 | MatchType::Nothing => false, | |
476 | } | |
477 | } | |
478 | ||
479 | /// Finds the start and end location of the leftmost-first match, starting | |
480 | /// at the given location. | |
481 | #[inline(always)] // reduces constant overhead | |
482 | fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { | |
483 | if !self.is_anchor_end_match(text) { | |
484 | return None; | |
485 | } | |
486 | match self.ro.match_type { | |
487 | MatchType::Literal(ty) => { | |
488 | self.find_literals(ty, text, start) | |
489 | } | |
490 | MatchType::Dfa => { | |
491 | match self.find_dfa_forward(text, start) { | |
492 | dfa::Result::Match((s, e)) => Some((s, e)), | |
493 | dfa::Result::NoMatch(_) => None, | |
494 | dfa::Result::Quit => { | |
495 | self.find_nfa(MatchNfaType::Auto, text, start) | |
496 | } | |
497 | } | |
498 | } | |
499 | MatchType::DfaAnchoredReverse => { | |
500 | match self.find_dfa_anchored_reverse(text, start) { | |
501 | dfa::Result::Match((s, e)) => Some((s, e)), | |
502 | dfa::Result::NoMatch(_) => None, | |
503 | dfa::Result::Quit => { | |
504 | self.find_nfa(MatchNfaType::Auto, text, start) | |
505 | } | |
506 | } | |
507 | } | |
508 | MatchType::DfaSuffix => { | |
509 | match self.find_dfa_reverse_suffix(text, start) { | |
510 | dfa::Result::Match((s, e)) => Some((s, e)), | |
511 | dfa::Result::NoMatch(_) => None, | |
512 | dfa::Result::Quit => { | |
513 | self.find_nfa(MatchNfaType::Auto, text, start) | |
514 | } | |
515 | } | |
516 | } | |
517 | MatchType::Nfa(ty) => self.find_nfa(ty, text, start), | |
518 | MatchType::Nothing => None, | |
519 | MatchType::DfaMany => { | |
520 | unreachable!("BUG: RegexSet cannot be used with find") | |
521 | } | |
522 | } | |
523 | } | |
524 | ||
525 | /// Finds the start and end location of the leftmost-first match and also | |
526 | /// fills in all matching capture groups. | |
527 | /// | |
528 | /// The number of capture slots given should be equal to the total number | |
529 | /// of capture slots in the compiled program. | |
530 | /// | |
531 | /// Note that the first two slots always correspond to the start and end | |
532 | /// locations of the overall match. | |
533 | fn read_captures_at( | |
534 | &self, | |
535 | locs: &mut Locations, | |
536 | text: &[u8], | |
537 | start: usize, | |
538 | ) -> Option<(usize, usize)> { | |
539 | let slots = as_slots(locs); | |
540 | for slot in slots.iter_mut() { | |
541 | *slot = None; | |
542 | } | |
543 | // If the caller unnecessarily uses this, then we try to save them | |
544 | // from themselves. | |
545 | match slots.len() { | |
546 | 0 => return self.find_at(text, start), | |
547 | 2 => { | |
548 | return self.find_at(text, start).map(|(s, e)| { | |
549 | slots[0] = Some(s); | |
550 | slots[1] = Some(e); | |
551 | (s, e) | |
552 | }); | |
553 | } | |
554 | _ => {} // fallthrough | |
555 | } | |
556 | if !self.is_anchor_end_match(text) { | |
557 | return None; | |
558 | } | |
559 | match self.ro.match_type { | |
560 | MatchType::Literal(ty) => { | |
561 | self.find_literals(ty, text, start).and_then(|(s, e)| { | |
562 | self.captures_nfa_with_match(slots, text, s, e) | |
563 | }) | |
564 | } | |
565 | MatchType::Dfa => { | |
566 | if self.ro.nfa.is_anchored_start { | |
567 | self.captures_nfa(slots, text, start) | |
568 | } else { | |
569 | match self.find_dfa_forward(text, start) { | |
570 | dfa::Result::Match((s, e)) => { | |
571 | self.captures_nfa_with_match(slots, text, s, e) | |
572 | } | |
573 | dfa::Result::NoMatch(_) => None, | |
574 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
575 | } | |
576 | } | |
577 | } | |
578 | MatchType::DfaAnchoredReverse => { | |
579 | match self.find_dfa_anchored_reverse(text, start) { | |
580 | dfa::Result::Match((s, e)) => { | |
581 | self.captures_nfa_with_match(slots, text, s, e) | |
582 | } | |
583 | dfa::Result::NoMatch(_) => None, | |
584 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
585 | } | |
586 | } | |
587 | MatchType::DfaSuffix => { | |
588 | match self.find_dfa_reverse_suffix(text, start) { | |
589 | dfa::Result::Match((s, e)) => { | |
590 | self.captures_nfa_with_match(slots, text, s, e) | |
591 | } | |
592 | dfa::Result::NoMatch(_) => None, | |
593 | dfa::Result::Quit => self.captures_nfa(slots, text, start), | |
594 | } | |
595 | } | |
596 | MatchType::Nfa(ty) => { | |
597 | self.captures_nfa_type(ty, slots, text, start) | |
598 | } | |
599 | MatchType::Nothing => None, | |
600 | MatchType::DfaMany => { | |
601 | unreachable!("BUG: RegexSet cannot be used with captures") | |
602 | } | |
603 | } | |
604 | } | |
605 | } | |
606 | ||
607 | impl<'c> ExecNoSync<'c> { | |
608 | /// Finds the leftmost-first match using only literal search. | |
609 | #[inline(always)] // reduces constant overhead | |
610 | fn find_literals( | |
611 | &self, | |
612 | ty: MatchLiteralType, | |
613 | text: &[u8], | |
614 | start: usize, | |
615 | ) -> Option<(usize, usize)> { | |
616 | use self::MatchLiteralType::*; | |
617 | match ty { | |
618 | Unanchored => { | |
619 | let lits = &self.ro.nfa.prefixes; | |
620 | lits.find(&text[start..]) | |
621 | .map(|(s, e)| (start + s, start + e)) | |
622 | } | |
623 | AnchoredStart => { | |
624 | let lits = &self.ro.nfa.prefixes; | |
8faf50e0 XL |
625 | if !self.ro.nfa.is_anchored_start |
626 | || (self.ro.nfa.is_anchored_start && start == 0) { | |
627 | lits.find_start(&text[start..]) | |
628 | .map(|(s, e)| (start + s, start + e)) | |
629 | } else { | |
630 | None | |
631 | } | |
94b46f34 XL |
632 | } |
633 | AnchoredEnd => { | |
634 | let lits = &self.ro.suffixes; | |
635 | lits.find_end(&text[start..]) | |
636 | .map(|(s, e)| (start + s, start + e)) | |
637 | } | |
638 | } | |
639 | } | |
640 | ||
641 | /// Finds the leftmost-first match (start and end) using only the DFA. | |
642 | /// | |
643 | /// If the result returned indicates that the DFA quit, then another | |
644 | /// matching engine should be used. | |
645 | #[inline(always)] // reduces constant overhead | |
646 | fn find_dfa_forward( | |
647 | &self, | |
648 | text: &[u8], | |
649 | start: usize, | |
650 | ) -> dfa::Result<(usize, usize)> { | |
651 | use dfa::Result::*; | |
652 | let end = match dfa::Fsm::forward( | |
653 | &self.ro.dfa, | |
654 | self.cache, | |
655 | false, | |
656 | text, | |
657 | start, | |
658 | ) { | |
659 | NoMatch(i) => return NoMatch(i), | |
660 | Quit => return Quit, | |
661 | Match(end) if start == end => return Match((start, start)), | |
662 | Match(end) => end, | |
663 | }; | |
664 | // Now run the DFA in reverse to find the start of the match. | |
665 | match dfa::Fsm::reverse( | |
666 | &self.ro.dfa_reverse, | |
667 | self.cache, | |
668 | false, | |
669 | &text[start..], | |
670 | end - start, | |
671 | ) { | |
672 | Match(s) => Match((start + s, end)), | |
673 | NoMatch(i) => NoMatch(i), | |
674 | Quit => Quit, | |
675 | } | |
676 | } | |
677 | ||
678 | /// Finds the leftmost-first match (start and end) using only the DFA, | |
679 | /// but assumes the regex is anchored at the end and therefore starts at | |
680 | /// the end of the regex and matches in reverse. | |
681 | /// | |
682 | /// If the result returned indicates that the DFA quit, then another | |
683 | /// matching engine should be used. | |
684 | #[inline(always)] // reduces constant overhead | |
685 | fn find_dfa_anchored_reverse( | |
686 | &self, | |
687 | text: &[u8], | |
688 | start: usize, | |
689 | ) -> dfa::Result<(usize, usize)> { | |
690 | use dfa::Result::*; | |
691 | match dfa::Fsm::reverse( | |
692 | &self.ro.dfa_reverse, | |
693 | self.cache, | |
694 | false, | |
695 | &text[start..], | |
696 | text.len() - start, | |
697 | ) { | |
698 | Match(s) => Match((start + s, text.len())), | |
699 | NoMatch(i) => NoMatch(i), | |
700 | Quit => Quit, | |
701 | } | |
702 | } | |
703 | ||
704 | /// Finds the end of the shortest match using only the DFA. | |
705 | #[inline(always)] // reduces constant overhead | |
706 | fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { | |
707 | dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start) | |
708 | } | |
709 | ||
710 | /// Finds the end of the shortest match using only the DFA by scanning for | |
711 | /// suffix literals. | |
712 | /// | |
713 | #[inline(always)] // reduces constant overhead | |
714 | fn shortest_dfa_reverse_suffix( | |
715 | &self, | |
716 | text: &[u8], | |
717 | start: usize, | |
718 | ) -> dfa::Result<usize> { | |
719 | match self.exec_dfa_reverse_suffix(text, start) { | |
720 | None => self.shortest_dfa(text, start), | |
721 | Some(r) => r.map(|(_, end)| end), | |
722 | } | |
723 | } | |
724 | ||
725 | /// Finds the end of the shortest match using only the DFA by scanning for | |
726 | /// suffix literals. It also reports the start of the match. | |
727 | /// | |
728 | /// Note that if None is returned, then the optimization gave up to avoid | |
729 | /// worst case quadratic behavior. A forward scanning DFA should be tried | |
730 | /// next. | |
731 | /// | |
732 | /// If a match is returned and the full leftmost-first match is desired, | |
733 | /// then a forward scan starting from the beginning of the match must be | |
734 | /// done. | |
735 | /// | |
736 | /// If the result returned indicates that the DFA quit, then another | |
737 | /// matching engine should be used. | |
738 | #[inline(always)] // reduces constant overhead | |
739 | fn exec_dfa_reverse_suffix( | |
740 | &self, | |
741 | text: &[u8], | |
742 | original_start: usize, | |
743 | ) -> Option<dfa::Result<(usize, usize)>> { | |
744 | use dfa::Result::*; | |
745 | ||
746 | let lcs = self.ro.suffixes.lcs(); | |
747 | debug_assert!(lcs.len() >= 1); | |
748 | let mut start = original_start; | |
749 | let mut end = start; | |
750 | while end <= text.len() { | |
751 | start = end; | |
752 | end += match lcs.find(&text[end..]) { | |
753 | None => return Some(NoMatch(text.len())), | |
754 | Some(start) => start + lcs.len(), | |
755 | }; | |
756 | match dfa::Fsm::reverse( | |
757 | &self.ro.dfa_reverse, | |
758 | self.cache, | |
759 | false, | |
760 | &text[start..end], | |
761 | end - start, | |
762 | ) { | |
763 | Match(0) | NoMatch(0) => return None, | |
764 | Match(s) => return Some(Match((s + start, end))), | |
765 | NoMatch(_) => continue, | |
766 | Quit => return Some(Quit), | |
767 | }; | |
768 | } | |
769 | Some(NoMatch(text.len())) | |
770 | } | |
771 | ||
772 | /// Finds the leftmost-first match (start and end) using only the DFA | |
773 | /// by scanning for suffix literals. | |
774 | /// | |
775 | /// If the result returned indicates that the DFA quit, then another | |
776 | /// matching engine should be used. | |
777 | #[inline(always)] // reduces constant overhead | |
778 | fn find_dfa_reverse_suffix( | |
779 | &self, | |
780 | text: &[u8], | |
781 | start: usize, | |
782 | ) -> dfa::Result<(usize, usize)> { | |
783 | use dfa::Result::*; | |
784 | ||
785 | let match_start = match self.exec_dfa_reverse_suffix(text, start) { | |
786 | None => return self.find_dfa_forward(text, start), | |
787 | Some(Match((start, _))) => start, | |
788 | Some(r) => return r, | |
789 | }; | |
790 | // At this point, we've found a match. The only way to quit now | |
791 | // without a match is if the DFA gives up (seems unlikely). | |
792 | // | |
793 | // Now run the DFA forwards to find the proper end of the match. | |
794 | // (The suffix literal match can only indicate the earliest | |
795 | // possible end location, which may appear before the end of the | |
796 | // leftmost-first match.) | |
797 | match dfa::Fsm::forward( | |
798 | &self.ro.dfa, | |
799 | self.cache, | |
800 | false, | |
801 | text, | |
802 | match_start, | |
803 | ) { | |
804 | NoMatch(_) => panic!("BUG: reverse match implies forward match"), | |
805 | Quit => Quit, | |
806 | Match(e) => Match((match_start, e)), | |
807 | } | |
808 | } | |
809 | ||
810 | /// Executes the NFA engine to return whether there is a match or not. | |
811 | /// | |
812 | /// Ideally, we could use shortest_nfa(...).is_some() and get the same | |
813 | /// performance characteristics, but regex sets don't have captures, which | |
814 | /// shortest_nfa depends on. | |
815 | fn match_nfa( | |
816 | &self, | |
817 | text: &[u8], | |
818 | start: usize, | |
819 | ) -> bool { | |
820 | self.match_nfa_type(MatchNfaType::Auto, text, start) | |
821 | } | |
822 | ||
823 | /// Like match_nfa, but allows specification of the type of NFA engine. | |
824 | fn match_nfa_type( | |
825 | &self, | |
826 | ty: MatchNfaType, | |
827 | text: &[u8], | |
828 | start: usize, | |
829 | ) -> bool { | |
830 | self.exec_nfa(ty, &mut [false], &mut [], true, text, start) | |
831 | } | |
832 | ||
833 | /// Finds the shortest match using an NFA. | |
834 | fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { | |
835 | self.shortest_nfa_type(MatchNfaType::Auto, text, start) | |
836 | } | |
837 | ||
838 | /// Like shortest_nfa, but allows specification of the type of NFA engine. | |
839 | fn shortest_nfa_type( | |
840 | &self, | |
841 | ty: MatchNfaType, | |
842 | text: &[u8], | |
843 | start: usize, | |
844 | ) -> Option<usize> { | |
845 | let mut slots = [None, None]; | |
846 | if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) { | |
847 | slots[1] | |
848 | } else { | |
849 | None | |
850 | } | |
851 | } | |
852 | ||
853 | /// Like find, but executes an NFA engine. | |
854 | fn find_nfa( | |
855 | &self, | |
856 | ty: MatchNfaType, | |
857 | text: &[u8], | |
858 | start: usize, | |
859 | ) -> Option<(usize, usize)> { | |
860 | let mut slots = [None, None]; | |
861 | if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) { | |
862 | match (slots[0], slots[1]) { | |
863 | (Some(s), Some(e)) => Some((s, e)), | |
864 | _ => None, | |
865 | } | |
866 | } else { | |
867 | None | |
868 | } | |
869 | } | |
870 | ||
871 | /// Like find_nfa, but fills in captures and restricts the search space | |
872 | /// using previously found match information. | |
873 | /// | |
874 | /// `slots` should have length equal to `2 * nfa.captures.len()`. | |
875 | fn captures_nfa_with_match( | |
876 | &self, | |
877 | slots: &mut [Slot], | |
878 | text: &[u8], | |
879 | match_start: usize, | |
880 | match_end: usize, | |
881 | ) -> Option<(usize, usize)> { | |
882 | // We can't use match_end directly, because we may need to examine one | |
883 | // "character" after the end of a match for lookahead operators. We | |
884 | // need to move two characters beyond the end, since some look-around | |
885 | // operations may falsely assume a premature end of text otherwise. | |
886 | let e = cmp::min( | |
887 | next_utf8(text, next_utf8(text, match_end)), text.len()); | |
888 | self.captures_nfa(slots, &text[..e], match_start) | |
889 | } | |
890 | ||
891 | /// Like find_nfa, but fills in captures. | |
892 | /// | |
893 | /// `slots` should have length equal to `2 * nfa.captures.len()`. | |
894 | fn captures_nfa( | |
895 | &self, | |
896 | slots: &mut [Slot], | |
897 | text: &[u8], | |
898 | start: usize, | |
899 | ) -> Option<(usize, usize)> { | |
900 | self.captures_nfa_type(MatchNfaType::Auto, slots, text, start) | |
901 | } | |
902 | ||
903 | /// Like captures_nfa, but allows specification of type of NFA engine. | |
904 | fn captures_nfa_type( | |
905 | &self, | |
906 | ty: MatchNfaType, | |
907 | slots: &mut [Slot], | |
908 | text: &[u8], | |
909 | start: usize, | |
910 | ) -> Option<(usize, usize)> { | |
911 | if self.exec_nfa(ty, &mut [false], slots, false, text, start) { | |
912 | match (slots[0], slots[1]) { | |
913 | (Some(s), Some(e)) => Some((s, e)), | |
914 | _ => None, | |
915 | } | |
916 | } else { | |
917 | None | |
918 | } | |
919 | } | |
920 | ||
921 | fn exec_nfa( | |
922 | &self, | |
923 | mut ty: MatchNfaType, | |
924 | matches: &mut [bool], | |
925 | slots: &mut [Slot], | |
926 | quit_after_match: bool, | |
927 | text: &[u8], | |
928 | start: usize, | |
929 | ) -> bool { | |
930 | use self::MatchNfaType::*; | |
931 | if let Auto = ty { | |
932 | if backtrack::should_exec(self.ro.nfa.len(), text.len()) { | |
933 | ty = Backtrack; | |
934 | } else { | |
935 | ty = PikeVM; | |
936 | } | |
937 | } | |
938 | match ty { | |
939 | Auto => unreachable!(), | |
940 | Backtrack => self.exec_backtrack(matches, slots, text, start), | |
941 | PikeVM => { | |
942 | self.exec_pikevm( | |
943 | matches, slots, quit_after_match, text, start) | |
944 | } | |
945 | } | |
946 | } | |
947 | ||
948 | /// Always run the NFA algorithm. | |
949 | fn exec_pikevm( | |
950 | &self, | |
951 | matches: &mut [bool], | |
952 | slots: &mut [Slot], | |
953 | quit_after_match: bool, | |
954 | text: &[u8], | |
955 | start: usize, | |
956 | ) -> bool { | |
957 | if self.ro.nfa.uses_bytes() { | |
958 | pikevm::Fsm::exec( | |
959 | &self.ro.nfa, | |
960 | self.cache, | |
961 | matches, | |
962 | slots, | |
963 | quit_after_match, | |
964 | ByteInput::new(text, self.ro.nfa.only_utf8), | |
965 | start) | |
966 | } else { | |
967 | pikevm::Fsm::exec( | |
968 | &self.ro.nfa, | |
969 | self.cache, | |
970 | matches, | |
971 | slots, | |
972 | quit_after_match, | |
973 | CharInput::new(text), | |
974 | start) | |
975 | } | |
976 | } | |
977 | ||
978 | /// Always runs the NFA using bounded backtracking. | |
979 | fn exec_backtrack( | |
980 | &self, | |
981 | matches: &mut [bool], | |
982 | slots: &mut [Slot], | |
983 | text: &[u8], | |
984 | start: usize, | |
985 | ) -> bool { | |
986 | if self.ro.nfa.uses_bytes() { | |
987 | backtrack::Bounded::exec( | |
988 | &self.ro.nfa, | |
989 | self.cache, | |
990 | matches, | |
991 | slots, | |
992 | ByteInput::new(text, self.ro.nfa.only_utf8), | |
993 | start) | |
994 | } else { | |
995 | backtrack::Bounded::exec( | |
996 | &self.ro.nfa, | |
997 | self.cache, | |
998 | matches, | |
999 | slots, | |
1000 | CharInput::new(text), | |
1001 | start) | |
1002 | } | |
1003 | } | |
1004 | ||
1005 | /// Finds which regular expressions match the given text. | |
1006 | /// | |
1007 | /// `matches` should have length equal to the number of regexes being | |
1008 | /// searched. | |
1009 | /// | |
1010 | /// This is only useful when one wants to know which regexes in a set | |
1011 | /// match some text. | |
1012 | pub fn many_matches_at( | |
1013 | &self, | |
1014 | matches: &mut [bool], | |
1015 | text: &[u8], | |
1016 | start: usize, | |
1017 | ) -> bool { | |
1018 | use self::MatchType::*; | |
1019 | if !self.is_anchor_end_match(text) { | |
1020 | return false; | |
1021 | } | |
1022 | match self.ro.match_type { | |
1023 | Literal(ty) => { | |
1024 | debug_assert_eq!(matches.len(), 1); | |
1025 | matches[0] = self.find_literals(ty, text, start).is_some(); | |
1026 | matches[0] | |
1027 | } | |
1028 | Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { | |
1029 | match dfa::Fsm::forward_many( | |
1030 | &self.ro.dfa, | |
1031 | self.cache, | |
1032 | matches, | |
1033 | text, | |
1034 | start, | |
1035 | ) { | |
1036 | dfa::Result::Match(_) => true, | |
1037 | dfa::Result::NoMatch(_) => false, | |
1038 | dfa::Result::Quit => { | |
1039 | self.exec_nfa( | |
1040 | MatchNfaType::Auto, | |
1041 | matches, | |
1042 | &mut [], | |
1043 | false, | |
1044 | text, | |
1045 | start) | |
1046 | } | |
1047 | } | |
1048 | } | |
1049 | Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), | |
1050 | Nothing => false, | |
1051 | } | |
1052 | } | |
1053 | ||
1054 | #[inline(always)] // reduces constant overhead | |
1055 | fn is_anchor_end_match(&self, text: &[u8]) -> bool { | |
1056 | // Only do this check if the haystack is big (>1MB). | |
1057 | if text.len() > (1<<20) && self.ro.nfa.is_anchored_end { | |
1058 | let lcs = self.ro.suffixes.lcs(); | |
1059 | if lcs.len() >= 1 && !lcs.is_suffix(text) { | |
1060 | return false; | |
1061 | } | |
1062 | } | |
1063 | true | |
1064 | } | |
1065 | ||
1066 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1067 | &self.ro.nfa.capture_name_idx | |
1068 | } | |
1069 | } | |
1070 | ||
1071 | impl<'c> ExecNoSyncStr<'c> { | |
1072 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1073 | self.0.capture_name_idx() | |
1074 | } | |
1075 | } | |
1076 | ||
1077 | impl Exec { | |
1078 | /// Get a searcher that isn't Sync. | |
1079 | #[inline(always)] // reduces constant overhead | |
1080 | pub fn searcher(&self) -> ExecNoSync { | |
1081 | let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro))); | |
1082 | ExecNoSync { | |
1083 | ro: &self.ro, // a clone is too expensive here! (and not needed) | |
1084 | cache: self.cache.get_or(create), | |
1085 | } | |
1086 | } | |
1087 | ||
1088 | /// Get a searcher that isn't Sync and can match on &str. | |
1089 | #[inline(always)] // reduces constant overhead | |
1090 | pub fn searcher_str(&self) -> ExecNoSyncStr { | |
1091 | ExecNoSyncStr(self.searcher()) | |
1092 | } | |
1093 | ||
1094 | /// Build a Regex from this executor. | |
1095 | pub fn into_regex(self) -> re_unicode::Regex { | |
1096 | re_unicode::Regex::from(self) | |
1097 | } | |
1098 | ||
1099 | /// Build a RegexSet from this executor. | |
1100 | pub fn into_regex_set(self) -> re_set::unicode::RegexSet { | |
1101 | re_set::unicode::RegexSet::from(self) | |
1102 | } | |
1103 | ||
1104 | /// Build a Regex from this executor that can match arbitrary bytes. | |
1105 | pub fn into_byte_regex(self) -> re_bytes::Regex { | |
1106 | re_bytes::Regex::from(self) | |
1107 | } | |
1108 | ||
1109 | /// Build a RegexSet from this executor that can match arbitrary bytes. | |
1110 | pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { | |
1111 | re_set::bytes::RegexSet::from(self) | |
1112 | } | |
1113 | ||
1114 | /// The original regular expressions given by the caller that were | |
1115 | /// compiled. | |
1116 | pub fn regex_strings(&self) -> &[String] { | |
1117 | &self.ro.res | |
1118 | } | |
1119 | ||
1120 | /// Return a slice of capture names. | |
1121 | /// | |
1122 | /// Any capture that isn't named is None. | |
1123 | pub fn capture_names(&self) -> &[Option<String>] { | |
1124 | &self.ro.nfa.captures | |
1125 | } | |
1126 | ||
1127 | /// Return a reference to named groups mapping (from group name to | |
1128 | /// group position). | |
1129 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { | |
1130 | &self.ro.nfa.capture_name_idx | |
1131 | } | |
1132 | } | |
1133 | ||
1134 | impl Clone for Exec { | |
1135 | fn clone(&self) -> Exec { | |
1136 | Exec { | |
1137 | ro: self.ro.clone(), | |
1138 | cache: CachedThreadLocal::new(), | |
1139 | } | |
1140 | } | |
1141 | } | |
1142 | ||
1143 | impl ExecReadOnly { | |
1144 | fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { | |
1145 | use self::MatchType::*; | |
1146 | if let Some(Nfa(_)) = hint { | |
1147 | return hint.unwrap(); | |
1148 | } | |
1149 | // If the NFA is empty, then we'll never match anything. | |
1150 | if self.nfa.insts.is_empty() { | |
1151 | return Nothing; | |
1152 | } | |
1153 | // If our set of prefixes is complete, then we can use it to find | |
1154 | // a match in lieu of a regex engine. This doesn't quite work well in | |
1155 | // the presence of multiple regexes, so only do it when there's one. | |
1156 | // | |
1157 | // TODO(burntsushi): Also, don't try to match literals if the regex is | |
1158 | // partially anchored. We could technically do it, but we'd need to | |
1159 | // create two sets of literals: all of them and then the subset that | |
1160 | // aren't anchored. We would then only search for all of them when at | |
1161 | // the beginning of the input and use the subset in all other cases. | |
1162 | if self.res.len() == 1 { | |
1163 | if self.nfa.prefixes.complete() { | |
1164 | return if self.nfa.is_anchored_start { | |
1165 | Literal(MatchLiteralType::AnchoredStart) | |
1166 | } else { | |
1167 | Literal(MatchLiteralType::Unanchored) | |
1168 | }; | |
1169 | } | |
1170 | if self.suffixes.complete() { | |
1171 | return if self.nfa.is_anchored_end { | |
1172 | Literal(MatchLiteralType::AnchoredEnd) | |
1173 | } else { | |
1174 | // This case shouldn't happen. When the regex isn't | |
1175 | // anchored, then complete prefixes should imply complete | |
1176 | // suffixes. | |
1177 | Literal(MatchLiteralType::Unanchored) | |
1178 | }; | |
1179 | } | |
1180 | } | |
1181 | // If we can execute the DFA, then we totally should. | |
1182 | if dfa::can_exec(&self.dfa) { | |
1183 | // Regex sets require a slightly specialized path. | |
1184 | if self.res.len() >= 2 { | |
1185 | return DfaMany; | |
1186 | } | |
1187 | // If the regex is anchored at the end but not the start, then | |
1188 | // just match in reverse from the end of the haystack. | |
1189 | if !self.nfa.is_anchored_start && self.nfa.is_anchored_end { | |
1190 | return DfaAnchoredReverse; | |
1191 | } | |
1192 | // If there's a longish suffix literal, then it might be faster | |
1193 | // to look for that first. | |
1194 | if self.should_suffix_scan() { | |
1195 | return DfaSuffix; | |
1196 | } | |
1197 | // Fall back to your garden variety forward searching lazy DFA. | |
1198 | return Dfa; | |
1199 | } | |
1200 | // We're so totally hosed. | |
1201 | Nfa(MatchNfaType::Auto) | |
1202 | } | |
1203 | ||
1204 | /// Returns true if the program is amenable to suffix scanning. | |
1205 | /// | |
1206 | /// When this is true, as a heuristic, we assume it is OK to quickly scan | |
1207 | /// for suffix literals and then do a *reverse* DFA match from any matches | |
1208 | /// produced by the literal scan. (And then followed by a forward DFA | |
1209 | /// search, since the previously found suffix literal maybe not actually be | |
1210 | /// the end of a match.) | |
1211 | /// | |
1212 | /// This is a bit of a specialized optimization, but can result in pretty | |
1213 | /// big performance wins if 1) there are no prefix literals and 2) the | |
1214 | /// suffix literals are pretty rare in the text. (1) is obviously easy to | |
1215 | /// account for but (2) is harder. As a proxy, we assume that longer | |
1216 | /// strings are generally rarer, so we only enable this optimization when | |
1217 | /// we have a meaty suffix. | |
1218 | fn should_suffix_scan(&self) -> bool { | |
1219 | if self.suffixes.is_empty() { | |
1220 | return false; | |
1221 | } | |
1222 | let lcs_len = self.suffixes.lcs().char_len(); | |
1223 | lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() | |
1224 | } | |
1225 | } | |
1226 | ||
1227 | #[derive(Clone, Copy, Debug)] | |
1228 | enum MatchType { | |
1229 | /// A single or multiple literal search. This is only used when the regex | |
1230 | /// can be decomposed into unambiguous literal search. | |
1231 | Literal(MatchLiteralType), | |
1232 | /// A normal DFA search. | |
1233 | Dfa, | |
1234 | /// A reverse DFA search starting from the end of a haystack. | |
1235 | DfaAnchoredReverse, | |
1236 | /// A reverse DFA search with suffix literal scanning. | |
1237 | DfaSuffix, | |
1238 | /// Use the DFA on two or more regular expressions. | |
1239 | DfaMany, | |
1240 | /// An NFA variant. | |
1241 | Nfa(MatchNfaType), | |
1242 | /// No match is ever possible, so don't ever try to search. | |
1243 | Nothing, | |
1244 | } | |
1245 | ||
1246 | #[derive(Clone, Copy, Debug)] | |
1247 | enum MatchLiteralType { | |
1248 | /// Match literals anywhere in text. | |
1249 | Unanchored, | |
1250 | /// Match literals only at the start of text. | |
1251 | AnchoredStart, | |
1252 | /// Match literals only at the end of text. | |
1253 | AnchoredEnd, | |
1254 | } | |
1255 | ||
1256 | #[derive(Clone, Copy, Debug)] | |
1257 | enum MatchNfaType { | |
1258 | /// Choose between Backtrack and PikeVM. | |
1259 | Auto, | |
1260 | /// NFA bounded backtracking. | |
1261 | /// | |
1262 | /// (This is only set by tests, since it never makes sense to always want | |
1263 | /// backtracking.) | |
1264 | Backtrack, | |
1265 | /// The Pike VM. | |
1266 | /// | |
1267 | /// (This is only set by tests, since it never makes sense to always want | |
1268 | /// the Pike VM.) | |
1269 | PikeVM, | |
1270 | } | |
1271 | ||
1272 | /// `ProgramCache` maintains reusable allocations for each matching engine | |
1273 | /// available to a particular program. | |
1274 | pub type ProgramCache = RefCell<ProgramCacheInner>; | |
1275 | ||
1276 | #[derive(Clone, Debug)] | |
1277 | pub struct ProgramCacheInner { | |
1278 | pub pikevm: pikevm::Cache, | |
1279 | pub backtrack: backtrack::Cache, | |
1280 | pub dfa: dfa::Cache, | |
1281 | pub dfa_reverse: dfa::Cache, | |
1282 | } | |
1283 | ||
1284 | impl ProgramCacheInner { | |
1285 | fn new(ro: &ExecReadOnly) -> Self { | |
1286 | ProgramCacheInner { | |
1287 | pikevm: pikevm::Cache::new(&ro.nfa), | |
1288 | backtrack: backtrack::Cache::new(&ro.nfa), | |
1289 | dfa: dfa::Cache::new(&ro.dfa), | |
1290 | dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), | |
1291 | } | |
1292 | } | |
1293 | } | |
8faf50e0 XL |
1294 | |
1295 | #[cfg(test)] | |
1296 | mod test { | |
1297 | #[test] | |
1298 | fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { | |
1299 | use internal::ExecBuilder; | |
1300 | ||
1301 | let backtrack_bytes_re = ExecBuilder::new("^S") | |
1302 | .bounded_backtracking() | |
1303 | .only_utf8(false) | |
1304 | .build() | |
1305 | .map(|exec| exec.into_byte_regex()) | |
1306 | .map_err(|err| format!("{}", err)) | |
1307 | .unwrap(); | |
1308 | ||
1309 | let default_bytes_re = ExecBuilder::new("^S") | |
1310 | .only_utf8(false) | |
1311 | .build() | |
1312 | .map(|exec| exec.into_byte_regex()) | |
1313 | .map_err(|err| format!("{}", err)) | |
1314 | .unwrap(); | |
1315 | ||
1316 | let input = vec![83, 83]; | |
1317 | ||
1318 | let s1 = backtrack_bytes_re.split(&input); | |
1319 | let s2 = default_bytes_re.split(&input); | |
1320 | for (chunk1, chunk2) in s1.zip(s2) { | |
1321 | assert_eq!(chunk1, chunk2); | |
1322 | } | |
1323 | } | |
1324 | ||
1325 | #[test] | |
1326 | fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { | |
1327 | use internal::ExecBuilder; | |
1328 | ||
1329 | let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") | |
1330 | .bounded_backtracking() | |
1331 | .bytes(true) | |
1332 | .build() | |
1333 | .map(|exec| exec.into_regex()) | |
1334 | .map_err(|err| format!("{}", err)) | |
1335 | .unwrap(); | |
1336 | ||
1337 | let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") | |
1338 | .bytes(true) | |
1339 | .build() | |
1340 | .map(|exec| exec.into_regex()) | |
1341 | .map_err(|err| format!("{}", err)) | |
1342 | .unwrap(); | |
1343 | ||
1344 | let input = "**"; | |
1345 | ||
1346 | let s1 = backtrack_bytes_re.split(input); | |
1347 | let s2 = default_bytes_re.split(input); | |
1348 | for (chunk1, chunk2) in s1.zip(s2) { | |
1349 | assert_eq!(chunk1, chunk2); | |
1350 | } | |
1351 | } | |
1352 | } |