]> git.proxmox.com Git - cargo.git/blob - vendor/aho-corasick/src/automaton.rs
New upstream version 0.47.0
[cargo.git] / vendor / aho-corasick / src / automaton.rs
1 use ahocorasick::MatchKind;
2 use prefilter::{self, Candidate, Prefilter, PrefilterState};
3 use state_id::{dead_id, fail_id, StateID};
4 use Match;
5
6 // NOTE: This trait essentially started as a copy of the same trait from from
7 // regex-automata, with some wording changed since we use this trait for
8 // NFAs in addition to DFAs in this crate. Additionally, we do not export
9 // this trait. It's only used internally to reduce code duplication. The
10 // regex-automata crate needs to expose it because its Regex type is generic
11 // over implementations of this trait. In this crate, we encapsulate everything
12 // behind the AhoCorasick type.
13 //
14 // This trait is a bit of a mess, but it's not quite clear how to fix it.
15 // Basically, there are several competing concerns:
16 //
17 // * We need performance, so everything effectively needs to get monomorphized.
18 // * There are several variations on searching Aho-Corasick automatons:
19 // overlapping, standard and leftmost. Overlapping and standard are somewhat
20 // combined together below, but there is no real way to combine standard with
21 // leftmost. Namely, leftmost requires continuing a search even after a match
22 // is found, in order to correctly disambiguate a match.
23 // * On top of that, *sometimes* callers want to know which state the automaton
24 // is in after searching. This is principally useful for overlapping and
25 // stream searches. However, when callers don't care about this, we really
26 // do not want to be forced to compute it, since it sometimes requires extra
27 // work. Thus, there are effectively two copies of leftmost searching: one
28 // for tracking the state ID and one that doesn't. We should ideally do the
29 // same for standard searching, but my sanity stopped me.
30
31 // SAFETY RATIONALE: Previously, the code below went to some length to remove
32 // all bounds checks. This generally produced tighter assembly and lead to
33 // 20-50% improvements in micro-benchmarks on corpora made up of random
34 // characters. This somewhat makes sense, since the branch predictor is going
35 // to be at its worse on random text.
36 //
37 // However, using the aho-corasick-debug tool and manually benchmarking
38 // different inputs, the code *with* bounds checks actually wound up being
39 // slightly faster:
40 //
41 // $ cat input
42 // Sherlock Holmes
43 // John Watson
44 // Professor Moriarty
45 // Irene Adler
46 // Mary Watson
47 //
48 // $ aho-corasick-debug-safe \
49 // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
50 // pattern read time: 32.824µs
51 // automaton build time: 444.687µs
52 // automaton heap usage: 72392 bytes
53 // match count: 639
54 // count time: 1.809961702s
55 //
56 // $ aho-corasick-debug-master \
57 // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
58 // pattern read time: 31.425µs
59 // automaton build time: 317.434µs
60 // automaton heap usage: 72392 bytes
61 // match count: 639
62 // count time: 2.059157705s
63 //
64 // I was able to reproduce this result on two different machines (an i5 and
65 // an i7). Therefore, we go the route of safe code for now.
66
67 /// A trait describing the interface of an Aho-Corasick finite state machine.
68 ///
69 /// Every automaton has exactly one fail state, one dead state and exactly one
70 /// start state. Generally, these correspond to the first, second and third
71 /// states, respectively. The failure state is always treated as a sentinel.
72 /// That is, no correct Aho-Corasick automaton will ever transition into the
73 /// fail state. The dead state, however, can be transitioned into, but only
74 /// when leftmost-first or leftmost-longest match semantics are enabled and
75 /// only when at least one match has been observed.
76 ///
77 /// Every automaton also has one or more match states, such that
78 /// `Automaton::is_match_state(id)` returns `true` if and only if `id`
79 /// corresponds to a match state.
80 pub trait Automaton {
81 /// The representation used for state identifiers in this automaton.
82 ///
83 /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
84 type ID: StateID;
85
86 /// The type of matching that should be done.
87 fn match_kind(&self) -> &MatchKind;
88
89 /// Returns true if and only if this automaton uses anchored searches.
90 fn anchored(&self) -> bool;
91
92 /// An optional prefilter for quickly skipping to the next candidate match.
93 /// A prefilter must report at least every match, although it may report
94 /// positions that do not correspond to a match. That is, it must not allow
95 /// false negatives, but can allow false positives.
96 ///
97 /// Currently, a prefilter only runs when the automaton is in the start
98 /// state. That is, the position reported by a prefilter should always
99 /// correspond to the start of a potential match.
100 fn prefilter(&self) -> Option<&dyn Prefilter>;
101
102 /// Return the identifier of this automaton's start state.
103 fn start_state(&self) -> Self::ID;
104
105 /// Returns true if and only if the given state identifier refers to a
106 /// valid state.
107 fn is_valid(&self, id: Self::ID) -> bool;
108
109 /// Returns true if and only if the given identifier corresponds to a match
110 /// state.
111 ///
112 /// The state ID given must be valid, or else implementors may panic.
113 fn is_match_state(&self, id: Self::ID) -> bool;
114
115 /// Returns true if and only if the given identifier corresponds to a state
116 /// that is either the dead state or a match state.
117 ///
118 /// Depending on the implementation of the automaton, this routine can
119 /// be used to save a branch in the core matching loop. Nevertheless,
120 /// `is_match_state(id) || id == dead_id()` is always a valid
121 /// implementation. Indeed, this is the default implementation.
122 ///
123 /// The state ID given must be valid, or else implementors may panic.
124 fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
125 id == dead_id() || self.is_match_state(id)
126 }
127
128 /// If the given state is a match state, return the match corresponding
129 /// to the given match index. `end` must be the ending position of the
130 /// detected match. If no match exists or if `match_index` exceeds the
131 /// number of matches in this state, then `None` is returned.
132 ///
133 /// The state ID given must be valid, or else implementors may panic.
134 ///
135 /// If the given state ID is correct and if the `match_index` is less than
136 /// the number of matches for that state, then this is guaranteed to return
137 /// a match.
138 fn get_match(
139 &self,
140 id: Self::ID,
141 match_index: usize,
142 end: usize,
143 ) -> Option<Match>;
144
145 /// Returns the number of matches for the given state. If the given state
146 /// is not a match state, then this returns 0.
147 ///
148 /// The state ID given must be valid, or else implementors must panic.
149 fn match_count(&self, id: Self::ID) -> usize;
150
151 /// Given the current state that this automaton is in and the next input
152 /// byte, this method returns the identifier of the next state. The
153 /// identifier returned must always be valid and may never correspond to
154 /// the fail state. The returned identifier may, however, point to the
155 /// dead state.
156 ///
157 /// This is not safe so that implementors may look up the next state
158 /// without memory safety checks such as bounds checks. As such, callers
159 /// must ensure that the given identifier corresponds to a valid automaton
160 /// state. Implementors must, in turn, ensure that this routine is safe for
161 /// all valid state identifiers and for all possible `u8` values.
162 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
163
164 /// Like next_state, but debug_asserts that the underlying
165 /// implementation never returns a `fail_id()` for the next state.
166 fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
167 let next = self.next_state(current, input);
168 // We should never see a transition to the failure state.
169 debug_assert!(
170 next != fail_id(),
171 "automaton should never return fail_id for next state"
172 );
173 next
174 }
175
176 /// Execute a search using standard match semantics.
177 ///
178 /// This can be used even when the automaton was constructed with leftmost
179 /// match semantics when you want to find the earliest possible match. This
180 /// can also be used as part of an overlapping search implementation.
181 ///
182 /// N.B. This does not report a match if `state_id` is given as a matching
183 /// state. As such, this should not be used directly.
184 #[inline(always)]
185 fn standard_find_at(
186 &self,
187 prestate: &mut PrefilterState,
188 haystack: &[u8],
189 at: usize,
190 state_id: &mut Self::ID,
191 ) -> Option<Match> {
192 if let Some(pre) = self.prefilter() {
193 self.standard_find_at_imp(
194 prestate,
195 Some(pre),
196 haystack,
197 at,
198 state_id,
199 )
200 } else {
201 self.standard_find_at_imp(prestate, None, haystack, at, state_id)
202 }
203 }
204
205 // It's important for this to always be inlined. Namely, its only caller
206 // is standard_find_at, and the inlining should remove the case analysis
207 // for prefilter scanning when there is no prefilter available.
208 #[inline(always)]
209 fn standard_find_at_imp(
210 &self,
211 prestate: &mut PrefilterState,
212 prefilter: Option<&dyn Prefilter>,
213 haystack: &[u8],
214 mut at: usize,
215 state_id: &mut Self::ID,
216 ) -> Option<Match> {
217 while at < haystack.len() {
218 if let Some(pre) = prefilter {
219 if prestate.is_effective(at) && *state_id == self.start_state()
220 {
221 let c = prefilter::next(prestate, pre, haystack, at)
222 .into_option();
223 match c {
224 None => return None,
225 Some(i) => {
226 at = i;
227 }
228 }
229 }
230 }
231 // CORRECTNESS: next_state is correct for all possible u8 values,
232 // so the only thing we're concerned about is the validity of
233 // `state_id`. `state_id` either comes from the caller (in which
234 // case, we assume it is correct), or it comes from the return
235 // value of next_state, which is guaranteed to be correct.
236 *state_id = self.next_state_no_fail(*state_id, haystack[at]);
237 at += 1;
238 // This routine always quits immediately after seeing a
239 // match, and since dead states can only come after seeing
240 // a match, seeing a dead state here is impossible. (Unless
241 // we have an anchored automaton, in which case, dead states
242 // are used to stop a search.)
243 debug_assert!(
244 *state_id != dead_id() || self.anchored(),
245 "standard find should never see a dead state"
246 );
247
248 if self.is_match_or_dead_state(*state_id) {
249 return if *state_id == dead_id() {
250 None
251 } else {
252 self.get_match(*state_id, 0, at)
253 };
254 }
255 }
256 None
257 }
258
259 /// Execute a search using leftmost (either first or longest) match
260 /// semantics.
261 ///
262 /// The principle difference between searching with standard semantics and
263 /// searching with leftmost semantics is that leftmost searching will
264 /// continue searching even after a match has been found. Once a match
265 /// is found, the search does not stop until either the haystack has been
266 /// exhausted or a dead state is observed in the automaton. (Dead states
267 /// only exist in automatons constructed with leftmost semantics.) That is,
268 /// we rely on the construction of the automaton to tell us when to quit.
269 #[inline(never)]
270 fn leftmost_find_at(
271 &self,
272 prestate: &mut PrefilterState,
273 haystack: &[u8],
274 at: usize,
275 state_id: &mut Self::ID,
276 ) -> Option<Match> {
277 if let Some(pre) = self.prefilter() {
278 self.leftmost_find_at_imp(
279 prestate,
280 Some(pre),
281 haystack,
282 at,
283 state_id,
284 )
285 } else {
286 self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
287 }
288 }
289
290 // It's important for this to always be inlined. Namely, its only caller
291 // is leftmost_find_at, and the inlining should remove the case analysis
292 // for prefilter scanning when there is no prefilter available.
293 #[inline(always)]
294 fn leftmost_find_at_imp(
295 &self,
296 prestate: &mut PrefilterState,
297 prefilter: Option<&dyn Prefilter>,
298 haystack: &[u8],
299 mut at: usize,
300 state_id: &mut Self::ID,
301 ) -> Option<Match> {
302 debug_assert!(self.match_kind().is_leftmost());
303 if self.anchored() && at > 0 && *state_id == self.start_state() {
304 return None;
305 }
306 let mut last_match = self.get_match(*state_id, 0, at);
307 while at < haystack.len() {
308 if let Some(pre) = prefilter {
309 if prestate.is_effective(at) && *state_id == self.start_state()
310 {
311 let c = prefilter::next(prestate, pre, haystack, at)
312 .into_option();
313 match c {
314 None => return None,
315 Some(i) => {
316 at = i;
317 }
318 }
319 }
320 }
321 // CORRECTNESS: next_state is correct for all possible u8 values,
322 // so the only thing we're concerned about is the validity of
323 // `state_id`. `state_id` either comes from the caller (in which
324 // case, we assume it is correct), or it comes from the return
325 // value of next_state, which is guaranteed to be correct.
326 *state_id = self.next_state_no_fail(*state_id, haystack[at]);
327 at += 1;
328 if self.is_match_or_dead_state(*state_id) {
329 if *state_id == dead_id() {
330 // The only way to enter into a dead state is if a match
331 // has been found, so we assert as much. This is different
332 // from normal automata, where you might enter a dead state
333 // if you know a subsequent match will never be found
334 // (regardless of whether a match has already been found).
335 // For Aho-Corasick, it is built so that we can match at
336 // any position, so the possibility of a match always
337 // exists.
338 //
339 // (Unless we have an anchored automaton, in which case,
340 // dead states are used to stop a search.)
341 debug_assert!(
342 last_match.is_some() || self.anchored(),
343 "failure state should only be seen after match"
344 );
345 return last_match;
346 }
347 last_match = self.get_match(*state_id, 0, at);
348 }
349 }
350 last_match
351 }
352
353 /// This is like leftmost_find_at, but does not need to track a caller
354 /// provided state id. In other words, the only output of this routine is a
355 /// match, if one exists.
356 ///
357 /// It is regrettable that we need to effectively copy a chunk of
358 /// implementation twice, but when we don't need to track the state ID, we
359 /// can allow the prefilter to report matches immediately without having
360 /// to re-confirm them with the automaton. The re-confirmation step is
361 /// necessary in leftmost_find_at because tracing through the automaton is
362 /// the only way to correctly set the state ID. (Perhaps an alternative
363 /// would be to keep a map from pattern ID to matching state ID, but that
364 /// complicates the code and still doesn't permit us to defer to the
365 /// prefilter entirely when possible.)
366 ///
367 /// I did try a few things to avoid the code duplication here, but nothing
368 /// optimized as well as this approach. (In microbenchmarks, there was
369 /// about a 25% difference.)
370 #[inline(never)]
371 fn leftmost_find_at_no_state(
372 &self,
373 prestate: &mut PrefilterState,
374 haystack: &[u8],
375 at: usize,
376 ) -> Option<Match> {
377 if let Some(pre) = self.prefilter() {
378 self.leftmost_find_at_no_state_imp(
379 prestate,
380 Some(pre),
381 haystack,
382 at,
383 )
384 } else {
385 self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
386 }
387 }
388
389 // It's important for this to always be inlined. Namely, its only caller
390 // is leftmost_find_at_no_state, and the inlining should remove the case
391 // analysis for prefilter scanning when there is no prefilter available.
392 #[inline(always)]
393 fn leftmost_find_at_no_state_imp(
394 &self,
395 prestate: &mut PrefilterState,
396 prefilter: Option<&dyn Prefilter>,
397 haystack: &[u8],
398 mut at: usize,
399 ) -> Option<Match> {
400 debug_assert!(self.match_kind().is_leftmost());
401 if self.anchored() && at > 0 {
402 return None;
403 }
404 // If our prefilter handles confirmation of matches 100% of the
405 // time, and since we don't need to track state IDs, we can avoid
406 // Aho-Corasick completely.
407 if let Some(pre) = prefilter {
408 // We should never have a prefilter during an anchored search.
409 debug_assert!(!self.anchored());
410 if !pre.reports_false_positives() {
411 return match pre.next_candidate(prestate, haystack, at) {
412 Candidate::None => None,
413 Candidate::Match(m) => Some(m),
414 Candidate::PossibleStartOfMatch(_) => unreachable!(),
415 };
416 }
417 }
418
419 let mut state_id = self.start_state();
420 let mut last_match = self.get_match(state_id, 0, at);
421 while at < haystack.len() {
422 if let Some(pre) = prefilter {
423 if prestate.is_effective(at) && state_id == self.start_state()
424 {
425 match prefilter::next(prestate, pre, haystack, at) {
426 Candidate::None => return None,
427 // Since we aren't tracking a state ID, we can
428 // quit early once we know we have a match.
429 Candidate::Match(m) => return Some(m),
430 Candidate::PossibleStartOfMatch(i) => {
431 at = i;
432 }
433 }
434 }
435 }
436 // CORRECTNESS: next_state is correct for all possible u8 values,
437 // so the only thing we're concerned about is the validity of
438 // `state_id`. `state_id` either comes from the caller (in which
439 // case, we assume it is correct), or it comes from the return
440 // value of next_state, which is guaranteed to be correct.
441 state_id = self.next_state_no_fail(state_id, haystack[at]);
442 at += 1;
443 if self.is_match_or_dead_state(state_id) {
444 if state_id == dead_id() {
445 // The only way to enter into a dead state is if a
446 // match has been found, so we assert as much. This
447 // is different from normal automata, where you might
448 // enter a dead state if you know a subsequent match
449 // will never be found (regardless of whether a match
450 // has already been found). For Aho-Corasick, it is
451 // built so that we can match at any position, so the
452 // possibility of a match always exists.
453 //
454 // (Unless we have an anchored automaton, in which
455 // case, dead states are used to stop a search.)
456 debug_assert!(
457 last_match.is_some() || self.anchored(),
458 "failure state should only be seen after match"
459 );
460 return last_match;
461 }
462 last_match = self.get_match(state_id, 0, at);
463 }
464 }
465 last_match
466 }
467
468 /// Execute an overlapping search.
469 ///
470 /// When executing an overlapping match, the previous state ID in addition
471 /// to the previous match index should be given. If there are more matches
472 /// at the given state, then the match is reported and the given index is
473 /// incremented.
474 #[inline(always)]
475 fn overlapping_find_at(
476 &self,
477 prestate: &mut PrefilterState,
478 haystack: &[u8],
479 at: usize,
480 state_id: &mut Self::ID,
481 match_index: &mut usize,
482 ) -> Option<Match> {
483 if self.anchored() && at > 0 && *state_id == self.start_state() {
484 return None;
485 }
486
487 let match_count = self.match_count(*state_id);
488 if *match_index < match_count {
489 // This is guaranteed to return a match since
490 // match_index < match_count.
491 let result = self.get_match(*state_id, *match_index, at);
492 debug_assert!(result.is_some(), "must be a match");
493 *match_index += 1;
494 return result;
495 }
496
497 *match_index = 0;
498 match self.standard_find_at(prestate, haystack, at, state_id) {
499 None => None,
500 Some(m) => {
501 *match_index = 1;
502 Some(m)
503 }
504 }
505 }
506
507 /// Return the earliest match found. This returns as soon as we know that
508 /// we have a match. As such, this does not necessarily correspond to the
509 /// leftmost starting match, but rather, the leftmost position at which a
510 /// match ends.
511 #[inline(always)]
512 fn earliest_find_at(
513 &self,
514 prestate: &mut PrefilterState,
515 haystack: &[u8],
516 at: usize,
517 state_id: &mut Self::ID,
518 ) -> Option<Match> {
519 if *state_id == self.start_state() {
520 if self.anchored() && at > 0 {
521 return None;
522 }
523 if let Some(m) = self.get_match(*state_id, 0, at) {
524 return Some(m);
525 }
526 }
527 self.standard_find_at(prestate, haystack, at, state_id)
528 }
529
530 /// A convenience function for finding the next match according to the
531 /// match semantics of this automaton. For standard match semantics, this
532 /// finds the earliest match. Otherwise, the leftmost match is found.
533 #[inline(always)]
534 fn find_at(
535 &self,
536 prestate: &mut PrefilterState,
537 haystack: &[u8],
538 at: usize,
539 state_id: &mut Self::ID,
540 ) -> Option<Match> {
541 match *self.match_kind() {
542 MatchKind::Standard => {
543 self.earliest_find_at(prestate, haystack, at, state_id)
544 }
545 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
546 self.leftmost_find_at(prestate, haystack, at, state_id)
547 }
548 MatchKind::__Nonexhaustive => unreachable!(),
549 }
550 }
551
552 /// Like find_at, but does not track state identifiers. This permits some
553 /// optimizations when a prefilter that confirms its own matches is
554 /// present.
555 #[inline(always)]
556 fn find_at_no_state(
557 &self,
558 prestate: &mut PrefilterState,
559 haystack: &[u8],
560 at: usize,
561 ) -> Option<Match> {
562 match *self.match_kind() {
563 MatchKind::Standard => {
564 let mut state = self.start_state();
565 self.earliest_find_at(prestate, haystack, at, &mut state)
566 }
567 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
568 self.leftmost_find_at_no_state(prestate, haystack, at)
569 }
570 MatchKind::__Nonexhaustive => unreachable!(),
571 }
572 }
573 }