]>
Commit | Line | Data |
---|---|---|
781aab86 FG |
1 | use core::{ |
2 | fmt::Debug, | |
3 | panic::{RefUnwindSafe, UnwindSafe}, | |
4 | }; | |
5 | ||
6 | use alloc::sync::Arc; | |
7 | ||
8 | use regex_syntax::hir::{literal, Hir}; | |
9 | ||
10 | use crate::{ | |
11 | meta::{ | |
12 | error::{BuildError, RetryError, RetryFailError, RetryQuadraticError}, | |
13 | regex::{Cache, RegexInfo}, | |
14 | reverse_inner, wrappers, | |
15 | }, | |
16 | nfa::thompson::{self, WhichCaptures, NFA}, | |
17 | util::{ | |
18 | captures::{Captures, GroupInfo}, | |
19 | look::LookMatcher, | |
20 | prefilter::{self, Prefilter, PrefilterI}, | |
21 | primitives::{NonMaxUsize, PatternID}, | |
22 | search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet}, | |
23 | }, | |
24 | }; | |
25 | ||
/// A trait that represents a single meta strategy. Its main utility is in
/// providing a way to do dynamic dispatch over a few choices.
///
/// Why dynamic dispatch? I actually don't have a super compelling reason, and
/// importantly, I have not benchmarked it with the main alternative: an enum.
/// I went with dynamic dispatch initially because the regex engine search code
/// really can't be inlined into caller code in most cases because it's just
/// too big. In other words, it is already expected that every regex search
/// will entail at least the cost of a function call.
///
/// I do wonder whether using enums would result in better codegen overall
/// though. It's a worthwhile experiment to try. Probably the most interesting
/// benchmark to run in such a case would be one with a high match count. That
/// is, a benchmark to test the overall latency of a search call.
pub(super) trait Strategy:
    Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
{
    /// Returns the capture group info associated with this strategy.
    fn group_info(&self) -> &GroupInfo;

    /// Creates a fresh cache for use with this strategy's search routines.
    fn create_cache(&self) -> Cache;

    /// Resets the given cache so that it may be reused with this strategy.
    fn reset_cache(&self, cache: &mut Cache);

    /// Returns true when this strategy is considered "fast," e.g., when it
    /// is backed by an accelerated prefilter.
    fn is_accelerated(&self) -> bool;

    /// Returns the heap memory usage, in bytes, of this strategy.
    fn memory_usage(&self) -> usize;

    /// Executes a search and returns the overall match, if one exists.
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>;

    /// Executes a search and returns only the pattern and end offset of a
    /// match, if one exists.
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch>;

    /// Returns true if and only if the given input matches.
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;

    /// Executes a search, writes capture offsets into `slots` and returns
    /// the matching pattern ID, if a match exists.
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID>;

    /// Adds the IDs of all patterns that match the given input to `patset`,
    /// for overlapping searches.
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    );
}
77 | ||
78 | pub(super) fn new( | |
79 | info: &RegexInfo, | |
80 | hirs: &[&Hir], | |
81 | ) -> Result<Arc<dyn Strategy>, BuildError> { | |
82 | // At this point, we're committed to a regex engine of some kind. So pull | |
83 | // out a prefilter if we can, which will feed to each of the constituent | |
84 | // regex engines. | |
85 | let pre = if info.is_always_anchored_start() { | |
86 | // PERF: I'm not sure we necessarily want to do this... We may want to | |
87 | // run a prefilter for quickly rejecting in some cases. The problem | |
88 | // is that anchored searches overlap quite a bit with the use case | |
89 | // of "run a regex on every line to extract data." In that case, the | |
90 | // regex always matches, so running a prefilter doesn't really help us | |
91 | // there. The main place where a prefilter helps in an anchored search | |
92 | // is if the anchored search is not expected to match frequently. That | |
93 | // is, the prefilter gives us a way to possibly reject a haystack very | |
94 | // quickly. | |
95 | // | |
96 | // Maybe we should do use a prefilter, but only for longer haystacks? | |
97 | // Or maybe we should only use a prefilter when we think it's "fast"? | |
98 | // | |
99 | // Interestingly, I think we currently lack the infrastructure for | |
100 | // disabling a prefilter based on haystack length. That would probably | |
101 | // need to be a new 'Input' option. (Interestingly, an 'Input' used to | |
102 | // carry a 'Prefilter' with it, but I moved away from that.) | |
103 | debug!("skipping literal extraction since regex is anchored"); | |
104 | None | |
105 | } else if let Some(pre) = info.config().get_prefilter() { | |
106 | debug!( | |
107 | "skipping literal extraction since the caller provided a prefilter" | |
108 | ); | |
109 | Some(pre.clone()) | |
110 | } else if info.config().get_auto_prefilter() { | |
111 | let kind = info.config().get_match_kind(); | |
112 | let prefixes = crate::util::prefilter::prefixes(kind, hirs); | |
113 | // If we can build a full `Strategy` from just the extracted prefixes, | |
114 | // then we can short-circuit and avoid building a regex engine at all. | |
115 | if let Some(pre) = Pre::from_prefixes(info, &prefixes) { | |
116 | debug!( | |
117 | "found that the regex can be broken down to a literal \ | |
118 | search, avoiding the regex engine entirely", | |
119 | ); | |
120 | return Ok(pre); | |
121 | } | |
122 | // This now attempts another short-circuit of the regex engine: if we | |
123 | // have a huge alternation of just plain literals, then we can just use | |
124 | // Aho-Corasick for that and avoid the regex engine entirely. | |
125 | // | |
126 | // You might think this case would just be handled by | |
127 | // `Pre::from_prefixes`, but that technique relies on heuristic literal | |
128 | // extraction from the corresponding `Hir`. That works, but part of | |
129 | // heuristics limit the size and number of literals returned. This case | |
130 | // will specifically handle patterns with very large alternations. | |
131 | // | |
132 | // One wonders if we should just roll this our heuristic literal | |
133 | // extraction, and then I think this case could disappear entirely. | |
134 | if let Some(pre) = Pre::from_alternation_literals(info, hirs) { | |
135 | debug!( | |
136 | "found plain alternation of literals, \ | |
137 | avoiding regex engine entirely and using Aho-Corasick" | |
138 | ); | |
139 | return Ok(pre); | |
140 | } | |
141 | prefixes.literals().and_then(|strings| { | |
142 | debug!( | |
143 | "creating prefilter from {} literals: {:?}", | |
144 | strings.len(), | |
145 | strings, | |
146 | ); | |
147 | Prefilter::new(kind, strings) | |
148 | }) | |
149 | } else { | |
150 | debug!("skipping literal extraction since prefilters were disabled"); | |
151 | None | |
152 | }; | |
153 | let mut core = Core::new(info.clone(), pre.clone(), hirs)?; | |
154 | // Now that we have our core regex engines built, there are a few cases | |
155 | // where we can do a little bit better than just a normal "search forward | |
156 | // and maybe use a prefilter when in a start state." However, these cases | |
157 | // may not always work or otherwise build on top of the Core searcher. | |
158 | // For example, the reverse anchored optimization seems like it might | |
159 | // always work, but only the DFAs support reverse searching and the DFAs | |
160 | // might give up or quit for reasons. If we had, e.g., a PikeVM that | |
161 | // supported reverse searching, then we could avoid building a full Core | |
162 | // engine for this case. | |
163 | core = match ReverseAnchored::new(core) { | |
164 | Err(core) => core, | |
165 | Ok(ra) => { | |
166 | debug!("using reverse anchored strategy"); | |
167 | return Ok(Arc::new(ra)); | |
168 | } | |
169 | }; | |
170 | core = match ReverseSuffix::new(core, hirs) { | |
171 | Err(core) => core, | |
172 | Ok(rs) => { | |
173 | debug!("using reverse suffix strategy"); | |
174 | return Ok(Arc::new(rs)); | |
175 | } | |
176 | }; | |
177 | core = match ReverseInner::new(core, hirs) { | |
178 | Err(core) => core, | |
179 | Ok(ri) => { | |
180 | debug!("using reverse inner strategy"); | |
181 | return Ok(Arc::new(ri)); | |
182 | } | |
183 | }; | |
184 | debug!("using core strategy"); | |
185 | Ok(Arc::new(core)) | |
186 | } | |
187 | ||
/// A strategy that uses a prefilter directly as the complete matcher,
/// bypassing the regex engine entirely.
#[derive(Clone, Debug)]
struct Pre<P> {
    // The underlying prefilter implementation used to report matches.
    pre: P,
    // Group info describing a single pattern with only the implicit
    // capture group (the only thing this strategy supports).
    group_info: GroupInfo,
}
193 | ||
194 | impl<P: PrefilterI> Pre<P> { | |
195 | fn new(pre: P) -> Arc<dyn Strategy> { | |
196 | // The only thing we support when we use prefilters directly as a | |
197 | // strategy is the start and end of the overall match for a single | |
198 | // pattern. In other words, exactly one implicit capturing group. Which | |
199 | // is exactly what we use here for a GroupInfo. | |
200 | let group_info = GroupInfo::new([[None::<&str>]]).unwrap(); | |
201 | Arc::new(Pre { pre, group_info }) | |
202 | } | |
203 | } | |
204 | ||
205 | // This is a little weird, but we don't actually care about the type parameter | |
206 | // here because we're selecting which underlying prefilter to use. So we just | |
207 | // define it on an arbitrary type. | |
208 | impl Pre<()> { | |
209 | /// Given a sequence of prefixes, attempt to return a full `Strategy` using | |
210 | /// just the prefixes. | |
211 | /// | |
212 | /// Basically, this occurs when the prefixes given not just prefixes, | |
213 | /// but an enumeration of the entire language matched by the regular | |
214 | /// expression. | |
215 | /// | |
216 | /// A number of other conditions need to be true too. For example, there | |
217 | /// can be only one pattern, the number of explicit capture groups is 0, no | |
218 | /// look-around assertions and so on. | |
219 | /// | |
220 | /// Note that this ignores `Config::get_auto_prefilter` because if this | |
221 | /// returns something, then it isn't a prefilter but a matcher itself. | |
222 | /// Therefore, it shouldn't suffer from the problems typical to prefilters | |
223 | /// (such as a high false positive rate). | |
224 | fn from_prefixes( | |
225 | info: &RegexInfo, | |
226 | prefixes: &literal::Seq, | |
227 | ) -> Option<Arc<dyn Strategy>> { | |
228 | let kind = info.config().get_match_kind(); | |
229 | // Check to see if our prefixes are exact, which means we might be | |
230 | // able to bypass the regex engine entirely and just rely on literal | |
231 | // searches. | |
232 | if !prefixes.is_exact() { | |
233 | return None; | |
234 | } | |
235 | // We also require that we have a single regex pattern. Namely, | |
236 | // we reuse the prefilter infrastructure to implement search and | |
237 | // prefilters only report spans. Prefilters don't know about pattern | |
238 | // IDs. The multi-regex case isn't a lost cause, we might still use | |
239 | // Aho-Corasick and we might still just use a regular prefilter, but | |
240 | // that's done below. | |
241 | if info.pattern_len() != 1 { | |
242 | return None; | |
243 | } | |
244 | // We can't have any capture groups either. The literal engines don't | |
245 | // know how to deal with things like '(foo)(bar)'. In that case, a | |
246 | // prefilter will just be used and then the regex engine will resolve | |
247 | // the capture groups. | |
248 | if info.props()[0].explicit_captures_len() != 0 { | |
249 | return None; | |
250 | } | |
251 | // We also require that it has zero look-around assertions. Namely, | |
252 | // literal extraction treats look-around assertions as if they match | |
253 | // *every* empty string. But of course, that isn't true. So for | |
254 | // example, 'foo\bquux' never matches anything, but 'fooquux' is | |
255 | // extracted from that as an exact literal. Such cases should just run | |
256 | // the regex engine. 'fooquux' will be used as a normal prefilter, and | |
257 | // then the regex engine will try to look for an actual match. | |
258 | if !info.props()[0].look_set().is_empty() { | |
259 | return None; | |
260 | } | |
261 | // Finally, currently, our prefilters are all oriented around | |
262 | // leftmost-first match semantics, so don't try to use them if the | |
263 | // caller asked for anything else. | |
264 | if kind != MatchKind::LeftmostFirst { | |
265 | return None; | |
266 | } | |
267 | // The above seems like a lot of requirements to meet, but it applies | |
268 | // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet | |
269 | // the above criteria, for example. | |
270 | // | |
271 | // Note that this is effectively a latency optimization. If we didn't | |
272 | // do this, then the extracted literals would still get bundled into | |
273 | // a prefilter, and every regex engine capable of running unanchored | |
274 | // searches supports prefilters. So this optimization merely sidesteps | |
275 | // having to run the regex engine at all to confirm the match. Thus, it | |
276 | // decreases the latency of a match. | |
277 | ||
278 | // OK because we know the set is exact and thus finite. | |
279 | let prefixes = prefixes.literals().unwrap(); | |
280 | debug!( | |
281 | "trying to bypass regex engine by creating \ | |
282 | prefilter from {} literals: {:?}", | |
283 | prefixes.len(), | |
284 | prefixes, | |
285 | ); | |
286 | let choice = match prefilter::Choice::new(kind, prefixes) { | |
287 | Some(choice) => choice, | |
288 | None => { | |
289 | debug!( | |
290 | "regex bypass failed because no prefilter could be built" | |
291 | ); | |
292 | return None; | |
293 | } | |
294 | }; | |
295 | let strat: Arc<dyn Strategy> = match choice { | |
296 | prefilter::Choice::Memchr(pre) => Pre::new(pre), | |
297 | prefilter::Choice::Memchr2(pre) => Pre::new(pre), | |
298 | prefilter::Choice::Memchr3(pre) => Pre::new(pre), | |
299 | prefilter::Choice::Memmem(pre) => Pre::new(pre), | |
300 | prefilter::Choice::Teddy(pre) => Pre::new(pre), | |
301 | prefilter::Choice::ByteSet(pre) => Pre::new(pre), | |
302 | prefilter::Choice::AhoCorasick(pre) => Pre::new(pre), | |
303 | }; | |
304 | Some(strat) | |
305 | } | |
306 | ||
307 | /// Attempts to extract an alternation of literals, and if it's deemed | |
308 | /// worth doing, returns an Aho-Corasick prefilter as a strategy. | |
309 | /// | |
310 | /// And currently, this only returns something when 'hirs.len() == 1'. This | |
311 | /// could in theory do something if there are multiple HIRs where all of | |
312 | /// them are alternation of literals, but I haven't had the time to go down | |
313 | /// that path yet. | |
314 | fn from_alternation_literals( | |
315 | info: &RegexInfo, | |
316 | hirs: &[&Hir], | |
317 | ) -> Option<Arc<dyn Strategy>> { | |
318 | use crate::util::prefilter::AhoCorasick; | |
319 | ||
320 | let lits = crate::meta::literal::alternation_literals(info, hirs)?; | |
321 | let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?; | |
322 | Some(Pre::new(ac)) | |
323 | } | |
324 | } | |
325 | ||
326 | // This implements Strategy for anything that implements PrefilterI. | |
327 | // | |
328 | // Note that this must only be used for regexes of length 1. Multi-regexes | |
329 | // don't work here. The prefilter interface only provides the span of a match | |
330 | // and not the pattern ID. (I did consider making it more expressive, but I | |
331 | // couldn't figure out how to tie everything together elegantly.) Thus, so long | |
332 | // as the regex only contains one pattern, we can simply assume that a match | |
333 | // corresponds to PatternID::ZERO. And indeed, that's what we do here. | |
334 | // | |
335 | // In practice, since this impl is used to report matches directly and thus | |
336 | // completely bypasses the regex engine, we only wind up using this under the | |
337 | // following restrictions: | |
338 | // | |
339 | // * There must be only one pattern. As explained above. | |
340 | // * The literal sequence must be finite and only contain exact literals. | |
341 | // * There must not be any look-around assertions. If there are, the literals | |
342 | // extracted might be exact, but a match doesn't necessarily imply an overall | |
343 | // match. As a trivial example, 'foo\bbar' does not match 'foobar'. | |
344 | // * The pattern must not have any explicit capturing groups. If it does, the | |
345 | // caller might expect them to be resolved. e.g., 'foo(bar)'. | |
346 | // | |
347 | // So when all of those things are true, we use a prefilter directly as a | |
348 | // strategy. | |
349 | // | |
350 | // In the case where the number of patterns is more than 1, we don't use this | |
351 | // but do use a special Aho-Corasick strategy if all of the regexes are just | |
352 | // simple literals or alternations of literals. (We also use the Aho-Corasick | |
353 | // strategy when len(patterns)==1 if the number of literals is large. In that | |
354 | // case, literal extraction gives up and will return an infinite set.) | |
355 | impl<P: PrefilterI> Strategy for Pre<P> { | |
4b012472 | 356 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
357 | fn group_info(&self) -> &GroupInfo { |
358 | &self.group_info | |
359 | } | |
360 | ||
361 | fn create_cache(&self) -> Cache { | |
362 | Cache { | |
363 | capmatches: Captures::all(self.group_info().clone()), | |
364 | pikevm: wrappers::PikeVMCache::none(), | |
365 | backtrack: wrappers::BoundedBacktrackerCache::none(), | |
366 | onepass: wrappers::OnePassCache::none(), | |
367 | hybrid: wrappers::HybridCache::none(), | |
368 | revhybrid: wrappers::ReverseHybridCache::none(), | |
369 | } | |
370 | } | |
371 | ||
372 | fn reset_cache(&self, _cache: &mut Cache) {} | |
373 | ||
374 | fn is_accelerated(&self) -> bool { | |
375 | self.pre.is_fast() | |
376 | } | |
377 | ||
378 | fn memory_usage(&self) -> usize { | |
379 | self.pre.memory_usage() | |
380 | } | |
381 | ||
4b012472 | 382 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
383 | fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
384 | if input.is_done() { | |
385 | return None; | |
386 | } | |
387 | if input.get_anchored().is_anchored() { | |
388 | return self | |
389 | .pre | |
390 | .prefix(input.haystack(), input.get_span()) | |
391 | .map(|sp| Match::new(PatternID::ZERO, sp)); | |
392 | } | |
393 | self.pre | |
394 | .find(input.haystack(), input.get_span()) | |
395 | .map(|sp| Match::new(PatternID::ZERO, sp)) | |
396 | } | |
397 | ||
4b012472 | 398 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
399 | fn search_half( |
400 | &self, | |
401 | cache: &mut Cache, | |
402 | input: &Input<'_>, | |
403 | ) -> Option<HalfMatch> { | |
404 | self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) | |
405 | } | |
406 | ||
4b012472 | 407 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
408 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
409 | self.search(cache, input).is_some() | |
410 | } | |
411 | ||
4b012472 | 412 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
413 | fn search_slots( |
414 | &self, | |
415 | cache: &mut Cache, | |
416 | input: &Input<'_>, | |
417 | slots: &mut [Option<NonMaxUsize>], | |
418 | ) -> Option<PatternID> { | |
419 | let m = self.search(cache, input)?; | |
420 | if let Some(slot) = slots.get_mut(0) { | |
421 | *slot = NonMaxUsize::new(m.start()); | |
422 | } | |
423 | if let Some(slot) = slots.get_mut(1) { | |
424 | *slot = NonMaxUsize::new(m.end()); | |
425 | } | |
426 | Some(m.pattern()) | |
427 | } | |
428 | ||
4b012472 | 429 | #[cfg_attr(feature = "perf-inline", inline(always))] |
781aab86 FG |
430 | fn which_overlapping_matches( |
431 | &self, | |
432 | cache: &mut Cache, | |
433 | input: &Input<'_>, | |
434 | patset: &mut PatternSet, | |
435 | ) { | |
436 | if self.search(cache, input).is_some() { | |
437 | patset.insert(PatternID::ZERO); | |
438 | } | |
439 | } | |
440 | } | |
441 | ||
/// The "core" collection of regex engines: every engine that may be needed
/// to answer a search, bundled together with the regex metadata.
#[derive(Debug)]
struct Core {
    // Metadata about the regex (configuration, pattern properties).
    info: RegexInfo,
    // Optional prefilter handed to each constituent engine.
    pre: Option<Prefilter>,
    // The forward Thompson NFA, compiled with capture groups.
    nfa: NFA,
    // The reverse NFA; only built when the lazy or full DFA is enabled.
    nfarev: Option<NFA>,
    // The PikeVM: the engine of last resort, always available.
    pikevm: wrappers::PikeVM,
    // The bounded backtracker; may be unavailable for some inputs.
    backtrack: wrappers::BoundedBacktracker,
    // The one-pass DFA; an optimization that often fails to build.
    onepass: wrappers::OnePass,
    // The lazy DFA; skipped when a full DFA was built.
    hybrid: wrappers::Hybrid,
    // The fully compiled DFA; only built when explicitly enabled.
    dfa: wrappers::DFA,
}
454 | ||
impl Core {
    /// Builds the core collection of regex engines from the given HIRs.
    ///
    /// The forward NFA, PikeVM and bounded backtracker are always
    /// constructed (any of which may return an error). The one-pass, lazy
    /// and full DFA engines are built opportunistically, subject to the
    /// configuration.
    fn new(
        info: RegexInfo,
        pre: Option<Prefilter>,
        hirs: &[&Hir],
    ) -> Result<Core, BuildError> {
        // Thread the configured line terminator through to look-around
        // matching (e.g., for '(?m:^)' and '(?m:$)').
        let mut lookm = LookMatcher::new();
        lookm.set_line_terminator(info.config().get_line_terminator());
        let thompson_config = thompson::Config::new()
            .utf8(info.config().get_utf8_empty())
            .nfa_size_limit(info.config().get_nfa_size_limit())
            .shrink(false)
            .which_captures(info.config().get_which_captures())
            .look_matcher(lookm);
        let nfa = thompson::Compiler::new()
            .configure(thompson_config.clone())
            .build_many_from_hir(hirs)
            .map_err(BuildError::nfa)?;
        // It's possible for the PikeVM or the BB to fail to build, even though
        // at this point, we already have a full NFA in hand. They can fail
        // when a Unicode word boundary is used but where Unicode word boundary
        // support is disabled at compile time, thus making it impossible to
        // match. (Construction can also fail if the NFA was compiled without
        // captures, but we always enable that above.)
        let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
        let backtrack =
            wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
        // The onepass engine can of course fail to build, but we expect it to
        // fail in many cases because it is an optimization that doesn't apply
        // to all regexes. The 'OnePass' wrapper encapsulates this failure (and
        // logs a message if it occurs).
        let onepass = wrappers::OnePass::new(&info, &nfa);
        // We try to encapsulate whether a particular regex engine should be
        // used within each respective wrapper, but the DFAs need a reverse NFA
        // to build itself, and we really do not want to build a reverse NFA if
        // we know we aren't going to use the lazy DFA. So we do a config check
        // up front, which is in practice the only way we won't try to use the
        // DFA.
        let (nfarev, hybrid, dfa) =
            if !info.config().get_hybrid() && !info.config().get_dfa() {
                (None, wrappers::Hybrid::none(), wrappers::DFA::none())
            } else {
                // FIXME: Technically, we don't quite yet KNOW that we need
                // a reverse NFA. It's possible for the DFAs below to both
                // fail to build just based on the forward NFA. In which case,
                // building the reverse NFA was totally wasted work. But...
                // fixing this requires breaking DFA construction apart into
                // two pieces: one for the forward part and another for the
                // reverse part. Quite annoying. Making it worse, when building
                // both DFAs fails, it's quite likely that the NFA is large and
                // that it will take quite some time to build the reverse NFA
                // too. So... it's really probably worth it to do this!
                let nfarev = thompson::Compiler::new()
                    // Currently, reverse NFAs don't support capturing groups,
                    // so we MUST disable them. But even if we didn't have to,
                    // we would, because nothing in this crate does anything
                    // useful with capturing groups in reverse. And of course,
                    // the lazy DFA ignores capturing groups in all cases.
                    .configure(
                        thompson_config
                            .clone()
                            .which_captures(WhichCaptures::None)
                            .reverse(true),
                    )
                    .build_many_from_hir(hirs)
                    .map_err(BuildError::nfa)?;
                let dfa = if !info.config().get_dfa() {
                    wrappers::DFA::none()
                } else {
                    wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
                };
                // Prefer the full DFA over the lazy DFA when both could be
                // built; only one of the two is ever used.
                let hybrid = if !info.config().get_hybrid() {
                    wrappers::Hybrid::none()
                } else if dfa.is_some() {
                    debug!("skipping lazy DFA because we have a full DFA");
                    wrappers::Hybrid::none()
                } else {
                    wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
                };
                (Some(nfarev), hybrid, dfa)
            };
        Ok(Core {
            info,
            pre,
            nfa,
            nfarev,
            pikevm,
            backtrack,
            onepass,
            hybrid,
            dfa,
        })
    }

    /// Attempts a search using the full DFA or, failing that, the lazy DFA.
    ///
    /// Returns `None` when neither DFA engine is available for this input.
    /// The inner `Err` signals that the DFA-based search failed and that the
    /// caller should retry with an infallible (NFA-based) engine.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_mayfail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Result<Option<Match>, RetryFailError>> {
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for search at {:?}", input.get_span());
            Some(e.try_search(input))
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for search at {:?}", input.get_span());
            Some(e.try_search(&mut cache.hybrid, input))
        } else {
            None
        }
    }

    /// Runs a search using only the engines that cannot fail at search time:
    /// the one-pass DFA, the bounded backtracker or, as a last resort, the
    /// PikeVM.
    fn search_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Match> {
        let caps = &mut cache.capmatches;
        caps.set_pattern(None);
        // We manually inline 'try_search_slots_nofail' here because we need to
        // borrow from 'cache.capmatches' in this method, but if we do, then
        // we can't pass 'cache' wholesale to 'try_slots_no_hybrid'. It's a
        // classic example of how the borrow checker inhibits decomposition.
        // There are of course work-arounds (more types and/or interior
        // mutability), but that's more annoying than this IMO.
        let pid = if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, caps.slots_mut())
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
        } else {
            trace!("using PikeVM for search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
        };
        caps.set_pattern(pid);
        caps.get_match()
    }

    /// Like `search_nofail`, but only reports the pattern and end offset.
    fn search_half_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // Only the lazy/full DFA returns half-matches, since the DFA requires
        // a reverse scan to find the start position. These fallback regex
        // engines can find the start and end in a single pass, so we just do
        // that and throw away the start offset to conform to the API.
        let m = self.search_nofail(cache, input)?;
        Some(HalfMatch::new(m.pattern(), m.end()))
    }

    /// Runs a capture search using only the infallible engines, writing
    /// capture offsets into `slots` and returning the matching pattern ID.
    fn search_slots_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.onepass, input, slots)
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, slots)
        } else {
            trace!(
                "using PikeVM for capture search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, slots)
        }
    }

    /// Reports whether the input matches, using only the infallible engines.
    fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for is-match search at {:?}",
                input.get_span()
            );
            // The one-pass wrapper exposes no dedicated is_match, so run a
            // slots search with an empty slot buffer.
            e.search_slots(&mut cache.onepass, input, &mut []).is_some()
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for is-match search at {:?}",
                input.get_span()
            );
            e.is_match(&mut cache.backtrack, input)
        } else {
            trace!(
                "using PikeVM for is-match search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.is_match(&mut cache.pikevm, input)
        }
    }

    /// Returns true when `slots_len` asks for more than the implicit slots,
    /// i.e., when explicit capture groups actually need to be resolved.
    fn is_capture_search_needed(&self, slots_len: usize) -> bool {
        slots_len > self.nfa.group_info().implicit_slot_len()
    }
}
665 | ||
666 | impl Strategy for Core { | |
667 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
668 | fn group_info(&self) -> &GroupInfo { | |
669 | self.nfa.group_info() | |
670 | } | |
671 | ||
672 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
673 | fn create_cache(&self) -> Cache { | |
674 | Cache { | |
675 | capmatches: Captures::all(self.group_info().clone()), | |
676 | pikevm: self.pikevm.create_cache(), | |
677 | backtrack: self.backtrack.create_cache(), | |
678 | onepass: self.onepass.create_cache(), | |
679 | hybrid: self.hybrid.create_cache(), | |
680 | revhybrid: wrappers::ReverseHybridCache::none(), | |
681 | } | |
682 | } | |
683 | ||
684 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
685 | fn reset_cache(&self, cache: &mut Cache) { | |
686 | cache.pikevm.reset(&self.pikevm); | |
687 | cache.backtrack.reset(&self.backtrack); | |
688 | cache.onepass.reset(&self.onepass); | |
689 | cache.hybrid.reset(&self.hybrid); | |
690 | } | |
691 | ||
692 | fn is_accelerated(&self) -> bool { | |
693 | self.pre.as_ref().map_or(false, |pre| pre.is_fast()) | |
694 | } | |
695 | ||
696 | fn memory_usage(&self) -> usize { | |
697 | self.info.memory_usage() | |
698 | + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) | |
699 | + self.nfa.memory_usage() | |
700 | + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage()) | |
701 | + self.onepass.memory_usage() | |
702 | + self.dfa.memory_usage() | |
703 | } | |
704 | ||
705 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
706 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { | |
707 | // We manually inline try_search_mayfail here because letting the | |
708 | // compiler do it seems to produce pretty crappy codegen. | |
709 | return if let Some(e) = self.dfa.get(input) { | |
710 | trace!("using full DFA for full search at {:?}", input.get_span()); | |
711 | match e.try_search(input) { | |
712 | Ok(x) => x, | |
713 | Err(_err) => { | |
714 | trace!("full DFA search failed: {}", _err); | |
715 | self.search_nofail(cache, input) | |
716 | } | |
717 | } | |
718 | } else if let Some(e) = self.hybrid.get(input) { | |
719 | trace!("using lazy DFA for full search at {:?}", input.get_span()); | |
720 | match e.try_search(&mut cache.hybrid, input) { | |
721 | Ok(x) => x, | |
722 | Err(_err) => { | |
723 | trace!("lazy DFA search failed: {}", _err); | |
724 | self.search_nofail(cache, input) | |
725 | } | |
726 | } | |
727 | } else { | |
728 | self.search_nofail(cache, input) | |
729 | }; | |
730 | } | |
731 | ||
    /// Like 'search', but only resolves the end of a match (a "half" match).
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // The main difference with 'search' is that if we're using a DFA, we
        // can use a single forward scan without needing to run the reverse
        // DFA.
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    // Fall back to an infallible engine on DFA failure.
                    self.search_half_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else {
            self.search_half_nofail(cache, input)
        }
    }
763 | ||
    /// Reports whether a match exists, using a forward half search on the
    /// fastest available DFA and falling back to an infallible engine on
    /// error.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else {
            self.is_match_nofail(cache, input)
        }
    }
794 | ||
    /// Executes a capture-resolving search: first find the overall match
    /// bounds with a fast engine, then re-run a slower engine over just the
    /// match span to fill in the capture group slots.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        // Even if the regex has explicit capture groups, if the caller didn't
        // provide any explicit slots, then it doesn't make sense to try and do
        // extra work to get offsets for those slots. Ideally the caller should
        // realize this and not call this routine in the first place, but alas,
        // we try to save the caller from themselves if they do.
        if !self.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        // If the onepass DFA is available for this search (which only happens
        // when it's anchored), then skip running a fallible DFA. The onepass
        // DFA isn't as fast as a full or lazy DFA, but it is typically quite
        // a bit faster than the backtracker or the PikeVM. So it isn't as
        // advantageous to try and do a full/lazy DFA scan first.
        //
        // We still theorize that it's better to do a full/lazy DFA scan, even
        // when it's anchored, because it's usually much faster and permits us
        // to say "no match" much more quickly. This does hurt the case of,
        // say, parsing each line in a log file into capture groups, because
        // in that case, the line always matches. So the lazy DFA scan is
        // usually just wasted work. But, the lazy DFA is usually quite fast
        // and doesn't cost too much here.
        if self.onepass.get(&input).is_some() {
            return self.search_slots_nofail(cache, &input, slots);
        }
        let m = match self.try_search_mayfail(cache, input) {
            Some(Ok(Some(m))) => m,
            Some(Ok(None)) => return None,
            Some(Err(_err)) => {
                trace!("fast capture search failed: {}", _err);
                return self.search_slots_nofail(cache, input, slots);
            }
            None => {
                return self.search_slots_nofail(cache, input, slots);
            }
        };
        // At this point, now that we've found the bounds of the
        // match, we need to re-run something that can resolve
        // capturing groups. But we only need to run on it on the
        // match bounds and not the entire haystack.
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            m.start(),
            m.end(),
        );
        // Restrict the second pass to the bounds of the match we already
        // found and pin it to the matching pattern.
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        Some(
            self.search_slots_nofail(cache, &input, slots)
                .expect("should find a match"),
        )
    }
859 | ||
    /// Records which patterns match anywhere in the given input, preferring
    /// a DFA-based overlapping search and falling back to the PikeVM when
    /// no DFA is available or the DFA search fails.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(input, patset) {
                Ok(()) => return,
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(
                &mut cache.hybrid,
                input,
                patset,
            ) {
                Ok(()) => {
                    return;
                }
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        }
        // Either there is no DFA, or the DFA search above failed: run the
        // PikeVM, which cannot fail.
        trace!(
            "using PikeVM for overlapping search at {:?}",
            input.get_span()
        );
        let e = self.pikevm.get();
        e.which_overlapping_matches(&mut cache.pikevm, input, patset)
    }
901 | } | |
902 | ||
/// A strategy for regexes that are always anchored at the end: a single
/// reverse DFA scan from the end of the haystack finds the start of a match
/// directly (see 'try_search_half_anchored_rev' below).
#[derive(Debug)]
struct ReverseAnchored {
    /// The underlying core strategy. It supplies the DFAs used for the
    /// reverse scan and serves as the fallback for anchored searches and
    /// for searches where the fast path fails.
    core: Core,
}
907 | ||
impl ReverseAnchored {
    /// Attempts to build a reverse anchored strategy around the given core
    /// strategy. Returns the core strategy unchanged (as the error value)
    /// when any applicability condition fails.
    fn new(core: Core) -> Result<ReverseAnchored, Core> {
        // The whole point of this strategy is that matches can only end at
        // the end of the haystack, so it requires end anchoring.
        if !core.info.is_always_anchored_end() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is not always anchored at the end"
            );
            return Err(core);
        }
        // Note that the caller can still request an anchored search even when
        // the regex isn't anchored at the start. We detect that case in the
        // search routines below and just fallback to the core engine. This
        // is fine because both searches are anchored. It's just a matter of
        // picking one. Falling back to the core engine is a little simpler,
        // since if we used the reverse anchored approach, we'd have to add an
        // extra check to ensure the match reported starts at the place where
        // the caller requested the search to start.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is also anchored at the start"
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse anchored optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        Ok(ReverseAnchored { core })
    }

    /// Runs an anchored reverse half search with whichever DFA is available.
    /// The half match produced carries the *start* offset of the match.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_anchored_rev(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        // We of course always want an anchored search. In theory, the
        // underlying regex engines should automatically enable anchored
        // searches since the regex is itself anchored, but this more clearly
        // expresses intent and is always correct.
        let input = input.clone().anchored(Anchored::Yes);
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&mut cache.hybrid, &input)
        } else {
            // 'new' rejects construction when neither DFA is available.
            unreachable!("ReverseAnchored always has a DFA")
        }
    }
}
973 | ||
974 | // Note that in this impl, we don't check that 'input.end() == | |
975 | // input.haystack().len()'. In particular, when that condition is false, a | |
976 | // match is always impossible because we know that the regex is always anchored | |
977 | // at the end (or else 'ReverseAnchored' won't be built). We don't check that | |
978 | // here because the 'Regex' wrapper actually does that for us in all cases. | |
979 | // Thus, in this impl, we can actually assume that the end position in 'input' | |
980 | // is equivalent to the length of the haystack. | |
impl Strategy for ReverseAnchored {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        // Since this is anchored at the end, a reverse anchored search is
        // almost certainly guaranteed to result in a much faster search than
        // a standard forward search.
        true
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
    }

    /// Full search: a reverse half search yields the match start, and since
    /// the regex is always anchored at the end, the match necessarily ends
    /// at 'input.end()'.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        // Caller-requested anchored searches go through the core engine; see
        // the note in 'ReverseAnchored::new' for why.
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                Some(Match::new(hm.pattern(), hm.offset()..input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                // Careful here! 'try_search_half' is a *forward* search that
                // only cares about the *end* position of a match. But
                // 'hm.offset()' is actually the start of the match. So we
                // actually just throw that away here and, since we know we
                // have a match, return the only possible position at which a
                // match can occur: input.end().
                Some(HalfMatch::new(hm.pattern(), input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    /// Capture search: find the match bounds via the reverse scan, then run
    /// a capture-resolving engine over just those bounds.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_slots_nofail(cache, input, slots)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                // If only the implicit (overall) group is needed, the bounds
                // we already have are enough; skip the second pass entirely.
                if !self.core.is_capture_search_needed(slots.len()) {
                    trace!("asked for slots unnecessarily, skipping captures");
                    let m = Match::new(hm.pattern(), hm.offset()..input.end());
                    copy_match_to_slots(m, slots);
                    return Some(m.pattern());
                }
                let start = hm.offset();
                let input = input
                    .clone()
                    .span(start..input.end())
                    .anchored(Anchored::Pattern(hm.pattern()));
                self.core.search_slots_nofail(cache, &input, slots)
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        // It seems like this could probably benefit from a reverse anchored
        // optimization, perhaps by doing an overlapping reverse search (which
        // the DFAs do support). I haven't given it much thought though, and
        // I'm currently focus more on the single pattern case.
        self.core.which_overlapping_matches(cache, input, patset)
    }
}
1114 | ||
/// A strategy built around a suffix literal prefilter: scan forward for the
/// longest common suffix of the regex, then search in reverse from a suffix
/// hit to find the start of a match (and forward again to find its end).
#[derive(Debug)]
struct ReverseSuffix {
    /// The underlying core strategy, used for its DFAs and as the fallback
    /// when the optimization fails or doesn't apply.
    core: Core,
    /// The prefilter built from the longest common suffix in
    /// 'ReverseSuffix::new'.
    pre: Prefilter,
}
1120 | ||
impl ReverseSuffix {
    /// Attempts to build a reverse suffix strategy around the given core
    /// strategy. Returns the core strategy unchanged (as the error value)
    /// when any applicability condition fails or when no "fast" suffix
    /// prefilter can be built.
    fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
        if !core.info.config().get_auto_prefilter() {
            debug!(
                "skipping reverse suffix optimization because \
                 automatic prefilters are disabled"
            );
            return Err(core);
        }
        // Like the reverse inner optimization, we don't do this for regexes
        // that are always anchored. It could lead to scanning too much, but
        // could say "no match" much more quickly than running the regex
        // engine if the initial literal scan doesn't match. With that said,
        // the reverse suffix optimization has lower overhead, since it only
        // requires a reverse scan after a literal match to confirm or reject
        // the match. (Although, in the case of confirmation, it then needs to
        // do another forward scan to find the end position.)
        //
        // Note that the caller can still request an anchored search even
        // when the regex isn't anchored. We detect that case in the search
        // routines below and just fallback to the core engine. Currently this
        // optimization assumes all searches are unanchored, so if we do want
        // to enable this optimization for anchored searches, it will need a
        // little work to support it.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse suffix optimization because \
                 the regex is always anchored at the start",
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse suffix optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
            debug!(
                "skipping reverse suffix optimization because \
                 we already have a prefilter that we think is fast"
            );
            return Err(core);
        }
        // Extract the suffix literals and require a non-empty longest common
        // suffix; an empty LCS would match everywhere and filter nothing.
        let kind = core.info.config().get_match_kind();
        let suffixes = crate::util::prefilter::suffixes(kind, hirs);
        let lcs = match suffixes.longest_common_suffix() {
            None => {
                debug!(
                    "skipping reverse suffix optimization because \
                     a longest common suffix could not be found",
                );
                return Err(core);
            }
            Some(lcs) if lcs.is_empty() => {
                debug!(
                    "skipping reverse suffix optimization because \
                     the longest common suffix is the empty string",
                );
                return Err(core);
            }
            Some(lcs) => lcs,
        };
        let pre = match Prefilter::new(kind, &[lcs]) {
            Some(pre) => pre,
            None => {
                debug!(
                    "skipping reverse suffix optimization because \
                     a prefilter could not be constructed from the \
                     longest common suffix",
                );
                return Err(core);
            }
        };
        if !pre.is_fast() {
            debug!(
                "skipping reverse suffix optimization because \
                 while we have a suffix prefilter, it is not \
                 believed to be 'fast'"
            );
            return Err(core);
        }
        Ok(ReverseSuffix { core, pre })
    }

    /// Scans forward for suffix candidates and, for each hit, runs a bounded
    /// anchored reverse search to find the corresponding match start. The
    /// 'min_start' lower bound prevents re-scanning the same region in
    /// reverse (the quadratic-behavior guard; see 'try_search_half_rev_limited').
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_start(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryError> {
        let mut span = input.get_span();
        let mut min_start = 0;
        loop {
            let litmatch = match self.pre.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            trace!("reverse suffix scan found suffix match at {:?}", litmatch);
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.end);
            match self
                .try_search_half_rev_limited(cache, &revinput, min_start)?
            {
                None => {
                    // No match ends at this suffix hit; resume the literal
                    // scan just past the start of this candidate.
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm) => return Ok(Some(hm)),
            }
            min_start = litmatch.end;
        }
        Ok(None)
    }

    /// Runs a forward half search (to find the end of a match) using
    /// whichever DFA is available.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_fwd(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&mut cache.hybrid, &input)
        } else {
            // 'new' rejects construction when neither DFA is available.
            unreachable!("ReverseSuffix always has a DFA")
        }
    }

    /// Runs a reverse half search bounded below by 'min_start', using
    /// whichever DFA is available. The bound stops the reverse scan to
    /// avoid quadratic behavior across repeated suffix candidates.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_rev_limited(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        min_start: usize,
    ) -> Result<Option<HalfMatch>, RetryError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&input, min_start)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start)
        } else {
            // 'new' rejects construction when neither DFA is available.
            unreachable!("ReverseSuffix always has a DFA")
        }
    }
}
1295 | ||
impl Strategy for ReverseSuffix {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage() + self.pre.memory_usage()
    }

    /// Full search: find the match start via the suffix scan plus reverse
    /// search, then run a forward search from that start to find the end.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        // This optimization assumes unanchored searches; anchored requests
        // fall back to the core engine (see 'ReverseSuffix::new').
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix optimization failed: {}", _err);
                // The quadratic guard tripped: retry with the (fallible)
                // core search rather than the no-fail path.
                self.core.search(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse suffix reverse fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(Match::new(
                        hm_start.pattern(),
                        hm_start.offset()..hm_end.offset(),
                    )),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                // This is a bit subtle. It is tempting to just stop searching
                // at this point and return a half-match with an offset
                // corresponding to where the suffix was found. But the suffix
                // match does not necessarily correspond to the end of the
                // proper leftmost-first match. Consider /[a-z]+ing/ against
                // 'tingling'. The first suffix match is the first 'ing', and
                // the /[a-z]+/ matches the 't'. So if we stopped here, then
                // we'd report 'ting' as the match. But 'tingling' is the
                // correct match because of greediness.
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_half_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(hm_end),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    /// Capture search: find the match start via the suffix scan, then run a
    /// capture-resolving engine from that start to the end of the input.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        // If only the implicit (overall) group is needed, the plain search
        // already produces everything required.
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let hm_start = match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!(
                    "reverse suffix captures optimization failed: {}",
                    _err
                );
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast captures search failed: {}",
                    _err
                );
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(hm_start)) => hm_start,
        };
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            hm_start.offset(),
            input.end(),
        );
        let start = hm_start.offset();
        let input = input
            .clone()
            .span(start..input.end())
            .anchored(Anchored::Pattern(hm_start.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        // No suffix-based acceleration here; delegate to the core engine.
        self.core.which_overlapping_matches(cache, input, patset)
    }
}
1499 | ||
/// A strategy built around an "inner" literal prefilter: scan for a literal
/// extracted from the middle of the regex, then search in reverse for the
/// prefix that precedes it.
#[derive(Debug)]
struct ReverseInner {
    /// The underlying core strategy, used as the fallback.
    core: Core,
    /// The prefilter for the inner literal (extracted via
    /// 'reverse_inner::extract' in 'new' below).
    preinner: Prefilter,
    /// A reverse NFA compiled from the portion of the regex preceding the
    /// inner literal (see "building reverse NFA for prefix before inner
    /// literal" in 'new').
    nfarev: NFA,
    /// A lazy reverse DFA — presumably built from 'nfarev'; its construction
    /// is not visible in this portion of the file.
    hybrid: wrappers::ReverseHybrid,
    /// A fully compiled reverse DFA — presumably built from 'nfarev'; its
    /// construction is not visible in this portion of the file.
    dfa: wrappers::ReverseDFA,
}
1508 | ||
1509 | impl ReverseInner { | |
1510 | fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> { | |
1511 | if !core.info.config().get_auto_prefilter() { | |
1512 | debug!( | |
1513 | "skipping reverse inner optimization because \ | |
1514 | automatic prefilters are disabled" | |
1515 | ); | |
1516 | return Err(core); | |
1517 | } | |
1518 | // Currently we hard-code the assumption of leftmost-first match | |
1519 | // semantics. This isn't a huge deal because 'all' semantics tend to | |
1520 | // only be used for forward overlapping searches with multiple regexes, | |
1521 | // and this optimization only supports a single pattern at the moment. | |
1522 | if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { | |
1523 | debug!( | |
1524 | "skipping reverse inner optimization because \ | |
1525 | match kind is {:?} but this only supports leftmost-first", | |
1526 | core.info.config().get_match_kind(), | |
1527 | ); | |
1528 | return Err(core); | |
1529 | } | |
1530 | // It's likely that a reverse inner scan has too much overhead for it | |
1531 | // to be worth it when the regex is anchored at the start. It is | |
1532 | // possible for it to be quite a bit faster if the initial literal | |
1533 | // scan fails to detect a match, in which case, we can say "no match" | |
1534 | // very quickly. But this could be undesirable, e.g., scanning too far | |
1535 | // or when the literal scan matches. If it matches, then confirming the | |
1536 | // match requires a reverse scan followed by a forward scan to confirm | |
1537 | // or reject, which is a fair bit of work. | |
1538 | // | |
1539 | // Note that the caller can still request an anchored search even | |
1540 | // when the regex isn't anchored. We detect that case in the search | |
1541 | // routines below and just fallback to the core engine. Currently this | |
1542 | // optimization assumes all searches are unanchored, so if we do want | |
1543 | // to enable this optimization for anchored searches, it will need a | |
1544 | // little work to support it. | |
1545 | if core.info.is_always_anchored_start() { | |
1546 | debug!( | |
1547 | "skipping reverse inner optimization because \ | |
1548 | the regex is always anchored at the start", | |
1549 | ); | |
1550 | return Err(core); | |
1551 | } | |
1552 | // Only DFAs can do reverse searches (currently), so we need one of | |
1553 | // them in order to do this optimization. It's possible (although | |
1554 | // pretty unlikely) that we have neither and need to give up. | |
1555 | if !core.hybrid.is_some() && !core.dfa.is_some() { | |
1556 | debug!( | |
1557 | "skipping reverse inner optimization because \ | |
1558 | we don't have a lazy DFA or a full DFA" | |
1559 | ); | |
1560 | return Err(core); | |
1561 | } | |
1562 | if core.pre.as_ref().map_or(false, |p| p.is_fast()) { | |
1563 | debug!( | |
1564 | "skipping reverse inner optimization because \ | |
1565 | we already have a prefilter that we think is fast" | |
1566 | ); | |
1567 | return Err(core); | |
1568 | } else if core.pre.is_some() { | |
1569 | debug!( | |
1570 | "core engine has a prefix prefilter, but it is \ | |
1571 | probably not fast, so continuing with attempt to \ | |
1572 | use reverse inner prefilter" | |
1573 | ); | |
1574 | } | |
1575 | let (concat_prefix, preinner) = match reverse_inner::extract(hirs) { | |
1576 | Some(x) => x, | |
1577 | // N.B. the 'extract' function emits debug messages explaining | |
1578 | // why we bailed out here. | |
1579 | None => return Err(core), | |
1580 | }; | |
1581 | debug!("building reverse NFA for prefix before inner literal"); | |
1582 | let mut lookm = LookMatcher::new(); | |
1583 | lookm.set_line_terminator(core.info.config().get_line_terminator()); | |
1584 | let thompson_config = thompson::Config::new() | |
1585 | .reverse(true) | |
1586 | .utf8(core.info.config().get_utf8_empty()) | |
1587 | .nfa_size_limit(core.info.config().get_nfa_size_limit()) | |
1588 | .shrink(false) | |
1589 | .which_captures(WhichCaptures::None) | |
1590 | .look_matcher(lookm); | |
1591 | let result = thompson::Compiler::new() | |
1592 | .configure(thompson_config) | |
1593 | .build_from_hir(&concat_prefix); | |
1594 | let nfarev = match result { | |
1595 | Ok(nfarev) => nfarev, | |
1596 | Err(_err) => { | |
1597 | debug!( | |
1598 | "skipping reverse inner optimization because the \ | |
1599 | reverse NFA failed to build: {}", | |
1600 | _err, | |
1601 | ); | |
1602 | return Err(core); | |
1603 | } | |
1604 | }; | |
1605 | debug!("building reverse DFA for prefix before inner literal"); | |
1606 | let dfa = if !core.info.config().get_dfa() { | |
1607 | wrappers::ReverseDFA::none() | |
1608 | } else { | |
1609 | wrappers::ReverseDFA::new(&core.info, &nfarev) | |
1610 | }; | |
1611 | let hybrid = if !core.info.config().get_hybrid() { | |
1612 | wrappers::ReverseHybrid::none() | |
1613 | } else if dfa.is_some() { | |
1614 | debug!( | |
1615 | "skipping lazy DFA for reverse inner optimization \ | |
1616 | because we have a full DFA" | |
1617 | ); | |
1618 | wrappers::ReverseHybrid::none() | |
1619 | } else { | |
1620 | wrappers::ReverseHybrid::new(&core.info, &nfarev) | |
1621 | }; | |
1622 | Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa }) | |
1623 | } | |
1624 | ||
    /// The workhorse of the reverse inner optimization: alternate an inner
    /// literal prefilter scan with a bounded reverse search (to discover the
    /// match start) and an anchored forward search (to discover the match
    /// end).
    ///
    /// Returns `Ok(None)` when no match exists in `input`, and `Err(..)` when
    /// this optimization must be abandoned: either quadratic behavior was
    /// detected, or an underlying engine failed and the caller should retry
    /// with a different strategy.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_full(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<Match>, RetryError> {
        let mut span = input.get_span();
        // The earliest position at which a reverse search is permitted to
        // stop. It is bumped to the end of each literal candidate below,
        // which guarantees forward progress across iterations.
        let mut min_match_start = 0;
        // The earliest position at which a prefilter candidate is acceptable.
        // A forward scan that stops early raises this, so that a later
        // candidate starting before it is treated as quadratic re-scanning.
        let mut min_pre_start = 0;
        loop {
            let litmatch = match self.preinner.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            if litmatch.start < min_pre_start {
                trace!(
                    "found inner prefilter match at {:?}, which starts \
                    before the end of the last forward scan at {}, \
                    quitting to avoid quadratic behavior",
                    litmatch,
                    min_pre_start,
                );
                return Err(RetryError::Quadratic(RetryQuadraticError::new()));
            }
            trace!("reverse inner scan found inner match at {:?}", litmatch);
            // Search backwards from the candidate, anchored so that any
            // prefix match must end exactly where the inner literal begins.
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.start);
            // Note that in addition to the literal search above scanning past
            // our minimum start point, this routine can also return an error
            // as a result of detecting possible quadratic behavior if the
            // reverse scan goes past the minimum start point. That is, the
            // literal search might not, but the reverse regex search for the
            // prefix might!
            match self.try_search_half_rev_limited(
                cache,
                &revinput,
                min_match_start,
            )? {
                None => {
                    // No match could start at this candidate. Resume the
                    // literal scan just past it, unless the span is already
                    // exhausted.
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm_start) => {
                    // We found where the match starts, so run an anchored
                    // forward search from there to find where it ends.
                    let fwdinput = input
                        .clone()
                        .anchored(Anchored::Pattern(hm_start.pattern()))
                        .span(hm_start.offset()..input.end());
                    match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
                        Err(stopat) => {
                            // The forward scan reached 'stopat' without
                            // confirming a match. Remember it so that a later
                            // prefilter candidate before this point triggers
                            // the quadratic bailout above.
                            min_pre_start = stopat;
                            span.start =
                                litmatch.start.checked_add(1).unwrap();
                        }
                        Ok(hm_end) => {
                            return Ok(Some(Match::new(
                                hm_start.pattern(),
                                hm_start.offset()..hm_end.offset(),
                            )))
                        }
                    }
                }
            }
            min_match_start = litmatch.end;
        }
        Ok(None)
    }
1695 | ||
1696 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
1697 | fn try_search_half_fwd_stopat( | |
1698 | &self, | |
1699 | cache: &mut Cache, | |
1700 | input: &Input<'_>, | |
1701 | ) -> Result<Result<HalfMatch, usize>, RetryFailError> { | |
1702 | if let Some(e) = self.core.dfa.get(&input) { | |
1703 | trace!( | |
1704 | "using full DFA for forward reverse inner search at {:?}", | |
1705 | input.get_span() | |
1706 | ); | |
1707 | e.try_search_half_fwd_stopat(&input) | |
1708 | } else if let Some(e) = self.core.hybrid.get(&input) { | |
1709 | trace!( | |
1710 | "using lazy DFA for forward reverse inner search at {:?}", | |
1711 | input.get_span() | |
1712 | ); | |
1713 | e.try_search_half_fwd_stopat(&mut cache.hybrid, &input) | |
1714 | } else { | |
1715 | unreachable!("ReverseInner always has a DFA") | |
1716 | } | |
1717 | } | |
1718 | ||
1719 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
1720 | fn try_search_half_rev_limited( | |
1721 | &self, | |
1722 | cache: &mut Cache, | |
1723 | input: &Input<'_>, | |
1724 | min_start: usize, | |
1725 | ) -> Result<Option<HalfMatch>, RetryError> { | |
1726 | if let Some(e) = self.dfa.get(&input) { | |
1727 | trace!( | |
1728 | "using full DFA for reverse inner search at {:?}, \ | |
1729 | but will be stopped at {} to avoid quadratic behavior", | |
1730 | input.get_span(), | |
1731 | min_start, | |
1732 | ); | |
1733 | e.try_search_half_rev_limited(&input, min_start) | |
1734 | } else if let Some(e) = self.hybrid.get(&input) { | |
1735 | trace!( | |
1736 | "using lazy DFA for reverse inner search at {:?}, \ | |
1737 | but will be stopped at {} to avoid quadratic behavior", | |
1738 | input.get_span(), | |
1739 | min_start, | |
1740 | ); | |
1741 | e.try_search_half_rev_limited( | |
1742 | &mut cache.revhybrid, | |
1743 | &input, | |
1744 | min_start, | |
1745 | ) | |
1746 | } else { | |
1747 | unreachable!("ReverseInner always has a DFA") | |
1748 | } | |
1749 | } | |
1750 | } | |
1751 | ||
impl Strategy for ReverseInner {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        // Capture group info comes from the core strategy; the reverse
        // inner machinery itself never resolves capture groups.
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        // Start from the core cache and attach a cache for the reverse
        // lazy DFA used by this strategy.
        let mut cache = self.core.create_cache();
        cache.revhybrid = self.hybrid.create_cache();
        cache
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
        cache.revhybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        // This strategy counts as accelerated exactly when its inner
        // literal prefilter is believed to be fast.
        self.preinner.is_fast()
    }

    fn memory_usage(&self) -> usize {
        // NOTE(review): the reverse lazy DFA ('self.hybrid') is not counted
        // here — presumably its transition table lives in the cache rather
        // than in 'self'; confirm before relying on this total.
        self.core.memory_usage()
            + self.preinner.memory_usage()
            + self.nfarev.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        // This optimization only handles unanchored searches; anchored
        // requests go straight to the core strategy.
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_full(cache, input) {
            // Quadratic behavior detected: retry via the core strategy's
            // normal (still fallible) search path.
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner optimization failed: {}", _err);
                self.core.search(cache, input)
            }
            // An engine failed outright: retry via the core strategy's
            // no-fail path so the search cannot fail a second time.
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(matornot) => matornot,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        // This strategy computes a full match either way, so a half match is
        // simply derived from the full match's end position.
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast half search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        // NOTE(review): both error arms fall back to the no-fail path here
        // (unlike 'search', where quadratic retries the fallible path), and
        // the trace messages say "half" — they look copied from
        // 'search_half'. Confirm the asymmetry and wording are intentional.
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner half optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast half search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        // When the provided slots don't require a real capture engine (per
        // 'is_capture_search_needed'), a plain search suffices and its
        // offsets are copied into the slots directly.
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let m = match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner captures optimization failed: {}", _err);
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast captures search failed: {}", _err);
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(m)) => m,
        };
        trace!(
            "match found at {}..{} in capture search, \
            using another engine to find captures",
            m.start(),
            m.end(),
        );
        // Resolve capture groups by re-running a slots-capable core engine
        // over just the span of the match, anchored to the matching pattern.
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        // Overlapping pattern searches are not handled by the reverse inner
        // optimization, so always delegate to the core strategy.
        self.core.which_overlapping_matches(cache, input, patset)
    }
}
1893 | ||
1894 | /// Copies the offsets in the given match to the corresponding positions in | |
1895 | /// `slots`. | |
1896 | /// | |
1897 | /// In effect, this sets the slots corresponding to the implicit group for the | |
1898 | /// pattern in the given match. If the indices for the corresponding slots do | |
1899 | /// not exist, then no slots are set. | |
1900 | /// | |
1901 | /// This is useful when the caller provides slots (or captures), but you use a | |
1902 | /// regex engine that doesn't operate on slots (like a lazy DFA). This function | |
1903 | /// lets you map the match you get back to the slots provided by the caller. | |
1904 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
1905 | fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) { | |
1906 | let slot_start = m.pattern().as_usize() * 2; | |
1907 | let slot_end = slot_start + 1; | |
1908 | if let Some(slot) = slots.get_mut(slot_start) { | |
1909 | *slot = NonMaxUsize::new(m.start()); | |
1910 | } | |
1911 | if let Some(slot) = slots.get_mut(slot_end) { | |
1912 | *slot = NonMaxUsize::new(m.end()); | |
1913 | } | |
1914 | } |