]> git.proxmox.com Git - rustc.git/blob - vendor/aho-corasick/src/ahocorasick.rs
New upstream version 1.45.0+dfsg1
[rustc.git] / vendor / aho-corasick / src / ahocorasick.rs
1 use std::io;
2
3 use automaton::Automaton;
4 use buffer::Buffer;
5 use dfa::{self, DFA};
6 use error::Result;
7 use nfa::{self, NFA};
8 use packed;
9 use prefilter::PrefilterState;
10 use state_id::StateID;
11 use Match;
12
13 /// An automaton for searching multiple strings in linear time.
14 ///
15 /// The `AhoCorasick` type supports a few basic ways of constructing an
16 /// automaton, including
17 /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
18 /// and
19 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
20 /// However, there are a fair number of configurable options that can be set
21 /// by using
22 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
23 /// instead. Such options include, but are not limited to, how matches are
24 /// determined, simple case insensitivity, whether to use a DFA or not and
25 /// various knobs for controlling the space-vs-time trade offs taken when
26 /// building the automaton.
27 ///
28 /// If you aren't sure where to start, try beginning with
29 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
30 ///
31 /// # Resource usage
32 ///
33 /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
34 /// is the combined length of all patterns being searched. With that said,
35 /// building an automaton can be fairly costly because of high constant
36 /// factors, particularly when enabling the
37 /// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
38 /// option (which is disabled by default). For this reason, it's generally a
39 /// good idea to build an automaton once and reuse it as much as possible.
40 ///
41 /// Aho-Corasick automatons can also use a fair bit of memory. To get a
42 /// concrete idea of how much memory is being used, try using the
43 /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
44 /// method.
45 ///
46 /// # Examples
47 ///
48 /// This example shows how to search for occurrences of multiple patterns
49 /// simultaneously in a case insensitive fashion. Each match includes the
50 /// pattern that matched along with the byte offsets of the match.
51 ///
52 /// ```
53 /// use aho_corasick::AhoCorasickBuilder;
54 ///
55 /// let patterns = &["apple", "maple", "snapple"];
56 /// let haystack = "Nobody likes maple in their apple flavored Snapple.";
57 ///
58 /// let ac = AhoCorasickBuilder::new()
59 /// .ascii_case_insensitive(true)
60 /// .build(patterns);
61 /// let mut matches = vec![];
62 /// for mat in ac.find_iter(haystack) {
63 /// matches.push((mat.pattern(), mat.start(), mat.end()));
64 /// }
65 /// assert_eq!(matches, vec![
66 /// (1, 13, 18),
67 /// (0, 28, 33),
68 /// (2, 43, 50),
69 /// ]);
70 /// ```
71 ///
72 /// This example shows how to replace matches with some other string:
73 ///
74 /// ```
75 /// use aho_corasick::AhoCorasick;
76 ///
77 /// let patterns = &["fox", "brown", "quick"];
78 /// let haystack = "The quick brown fox.";
79 /// let replace_with = &["sloth", "grey", "slow"];
80 ///
81 /// let ac = AhoCorasick::new(patterns);
82 /// let result = ac.replace_all(haystack, replace_with);
83 /// assert_eq!(result, "The slow grey sloth.");
84 /// ```
85 #[derive(Clone, Debug)]
86 pub struct AhoCorasick<S: StateID = usize> {
87 imp: Imp<S>,
88 match_kind: MatchKind,
89 }
90
91 impl AhoCorasick {
92 /// Create a new Aho-Corasick automaton using the default configuration.
93 ///
94 /// The default configuration optimizes for less space usage, but at the
95 /// expense of longer search times. To change the configuration, use
96 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
97 /// for fine-grained control, or
98 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
99 /// for automatic configuration if you aren't sure which settings to pick.
100 ///
101 /// This uses the default
102 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
103 /// match semantics, which reports a match as soon as it is found. This
104 /// corresponds to the standard match semantics supported by textbook
105 /// descriptions of the Aho-Corasick algorithm.
106 ///
107 /// # Examples
108 ///
109 /// Basic usage:
110 ///
111 /// ```
112 /// use aho_corasick::AhoCorasick;
113 ///
114 /// let ac = AhoCorasick::new(&[
115 /// "foo", "bar", "baz",
116 /// ]);
117 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
118 /// ```
119 pub fn new<I, P>(patterns: I) -> AhoCorasick
120 where
121 I: IntoIterator<Item = P>,
122 P: AsRef<[u8]>,
123 {
124 AhoCorasickBuilder::new().build(patterns)
125 }
126
127 /// Build an Aho-Corasick automaton with an automatically determined
128 /// configuration.
129 ///
130 /// Specifically, this requires a slice of patterns instead of an iterator
131 /// since the configuration is determined by looking at the patterns before
132 /// constructing the automaton. The idea here is to balance space and time
133 /// automatically. That is, when searching a small number of patterns, this
134 /// will attempt to use the fastest possible configuration since the total
135 /// space required will be small anyway. As the number of patterns grows,
136 /// this will fall back to slower configurations that use less space.
137 ///
138 /// If you want auto configuration but with match semantics different from
139 /// the default `MatchKind::Standard`, then use
140 /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
141 ///
142 /// # Examples
143 ///
144 /// Basic usage is just like `new`, except you must provide a slice:
145 ///
146 /// ```
147 /// use aho_corasick::AhoCorasick;
148 ///
149 /// let ac = AhoCorasick::new_auto_configured(&[
150 /// "foo", "bar", "baz",
151 /// ]);
152 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
153 /// ```
154 pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
155 where
156 B: AsRef<[u8]>,
157 {
158 AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
159 }
160 }
161
162 impl<S: StateID> AhoCorasick<S> {
163 /// Returns true if and only if this automaton matches the haystack at any
164 /// position.
165 ///
166 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
167 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
168 /// `&[u8]` itself.
169 ///
170 /// # Examples
171 ///
172 /// Basic usage:
173 ///
174 /// ```
175 /// use aho_corasick::AhoCorasick;
176 ///
177 /// let ac = AhoCorasick::new(&[
178 /// "foo", "bar", "quux", "baz",
179 /// ]);
180 /// assert!(ac.is_match("xxx bar xxx"));
181 /// assert!(!ac.is_match("xxx qux xxx"));
182 /// ```
183 pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
184 self.earliest_find(haystack).is_some()
185 }
186
187 /// Returns the location of the first detected match in `haystack`.
188 ///
189 /// This method has the same behavior regardless of the
190 /// [`MatchKind`](enum.MatchKind.html)
191 /// of this automaton.
192 ///
193 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
194 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
195 /// `&[u8]` itself.
196 ///
197 /// # Examples
198 ///
199 /// Basic usage:
200 ///
201 /// ```
202 /// use aho_corasick::AhoCorasick;
203 ///
204 /// let ac = AhoCorasick::new(&[
205 /// "abc", "b",
206 /// ]);
207 /// let mat = ac.earliest_find("abcd").expect("should have match");
208 /// assert_eq!(1, mat.pattern());
209 /// assert_eq!((1, 2), (mat.start(), mat.end()));
210 /// ```
211 pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
212 let mut prestate = PrefilterState::new(self.max_pattern_len());
213 let mut start = self.imp.start_state();
214 self.imp.earliest_find_at(
215 &mut prestate,
216 haystack.as_ref(),
217 0,
218 &mut start,
219 )
220 }
221
222 /// Returns the location of the first match according to the match
223 /// semantics that this automaton was constructed with.
224 ///
225 /// When using `MatchKind::Standard`, this corresponds precisely to the
226 /// same behavior as
227 /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
228 /// Otherwise, match semantics correspond to either
229 /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
230 /// or
231 /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
232 ///
233 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
234 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
235 /// `&[u8]` itself.
236 ///
237 /// # Examples
238 ///
239 /// Basic usage, with standard semantics:
240 ///
241 /// ```
242 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
243 ///
244 /// let patterns = &["b", "abc", "abcd"];
245 /// let haystack = "abcd";
246 ///
247 /// let ac = AhoCorasickBuilder::new()
248 /// .match_kind(MatchKind::Standard) // default, not necessary
249 /// .build(patterns);
250 /// let mat = ac.find(haystack).expect("should have a match");
251 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
252 /// ```
253 ///
254 /// Now with leftmost-first semantics:
255 ///
256 /// ```
257 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
258 ///
259 /// let patterns = &["b", "abc", "abcd"];
260 /// let haystack = "abcd";
261 ///
262 /// let ac = AhoCorasickBuilder::new()
263 /// .match_kind(MatchKind::LeftmostFirst)
264 /// .build(patterns);
265 /// let mat = ac.find(haystack).expect("should have a match");
266 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
267 /// ```
268 ///
269 /// And finally, leftmost-longest semantics:
270 ///
271 /// ```
272 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
273 ///
274 /// let patterns = &["b", "abc", "abcd"];
275 /// let haystack = "abcd";
276 ///
277 /// let ac = AhoCorasickBuilder::new()
278 /// .match_kind(MatchKind::LeftmostLongest)
279 /// .build(patterns);
280 /// let mat = ac.find(haystack).expect("should have a match");
281 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
282 /// ```
283 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
284 let mut prestate = PrefilterState::new(self.max_pattern_len());
285 self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
286 }
287
288 /// Returns an iterator of non-overlapping matches, using the match
289 /// semantics that this automaton was constructed with.
290 ///
291 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
292 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
293 /// `&[u8]` itself.
294 ///
295 /// # Examples
296 ///
297 /// Basic usage, with standard semantics:
298 ///
299 /// ```
300 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
301 ///
302 /// let patterns = &["append", "appendage", "app"];
303 /// let haystack = "append the app to the appendage";
304 ///
305 /// let ac = AhoCorasickBuilder::new()
306 /// .match_kind(MatchKind::Standard) // default, not necessary
307 /// .build(patterns);
308 /// let matches: Vec<usize> = ac
309 /// .find_iter(haystack)
310 /// .map(|mat| mat.pattern())
311 /// .collect();
312 /// assert_eq!(vec![2, 2, 2], matches);
313 /// ```
314 ///
315 /// Now with leftmost-first semantics:
316 ///
317 /// ```
318 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
319 ///
320 /// let patterns = &["append", "appendage", "app"];
321 /// let haystack = "append the app to the appendage";
322 ///
323 /// let ac = AhoCorasickBuilder::new()
324 /// .match_kind(MatchKind::LeftmostFirst)
325 /// .build(patterns);
326 /// let matches: Vec<usize> = ac
327 /// .find_iter(haystack)
328 /// .map(|mat| mat.pattern())
329 /// .collect();
330 /// assert_eq!(vec![0, 2, 0], matches);
331 /// ```
332 ///
333 /// And finally, leftmost-longest semantics:
334 ///
335 /// ```
336 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
337 ///
338 /// let patterns = &["append", "appendage", "app"];
339 /// let haystack = "append the app to the appendage";
340 ///
341 /// let ac = AhoCorasickBuilder::new()
342 /// .match_kind(MatchKind::LeftmostLongest)
343 /// .build(patterns);
344 /// let matches: Vec<usize> = ac
345 /// .find_iter(haystack)
346 /// .map(|mat| mat.pattern())
347 /// .collect();
348 /// assert_eq!(vec![0, 2, 1], matches);
349 /// ```
350 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
351 &'a self,
352 haystack: &'b B,
353 ) -> FindIter<'a, 'b, S> {
354 FindIter::new(self, haystack.as_ref())
355 }
356
357 /// Returns an iterator of overlapping matches in the given `haystack`.
358 ///
359 /// Overlapping matches can _only_ be detected using
360 /// `MatchKind::Standard` semantics. If this automaton was constructed with
361 /// leftmost semantics, then this method will panic. To determine whether
362 /// this will panic at runtime, use the
363 /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
364 /// method.
365 ///
366 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
367 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
368 /// `&[u8]` itself.
369 ///
370 /// # Panics
371 ///
372 /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
373 /// That is, this panics when this automaton's match semantics are not
374 /// `MatchKind::Standard`.
375 ///
376 /// # Examples
377 ///
378 /// Basic usage, with standard semantics:
379 ///
380 /// ```
381 /// use aho_corasick::AhoCorasick;
382 ///
383 /// let patterns = &["append", "appendage", "app"];
384 /// let haystack = "append the app to the appendage";
385 ///
386 /// let ac = AhoCorasick::new(patterns);
387 /// let matches: Vec<usize> = ac
388 /// .find_overlapping_iter(haystack)
389 /// .map(|mat| mat.pattern())
390 /// .collect();
391 /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
392 /// ```
393 pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
394 &'a self,
395 haystack: &'b B,
396 ) -> FindOverlappingIter<'a, 'b, S> {
397 FindOverlappingIter::new(self, haystack.as_ref())
398 }
399
400 /// Replace all matches with a corresponding value in the `replace_with`
401 /// slice given. Matches correspond to the same matches as reported by
402 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
403 ///
404 /// Replacements are determined by the index of the matching pattern.
405 /// For example, if the pattern with index `2` is found, then it is
406 /// replaced by `replace_with[2]`.
407 ///
408 /// # Panics
409 ///
410 /// This panics when `replace_with.len()` does not equal the total number
411 /// of patterns that are matched by this automaton.
412 ///
413 /// # Examples
414 ///
415 /// Basic usage:
416 ///
417 /// ```
418 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
419 ///
420 /// let patterns = &["append", "appendage", "app"];
421 /// let haystack = "append the app to the appendage";
422 ///
423 /// let ac = AhoCorasickBuilder::new()
424 /// .match_kind(MatchKind::LeftmostFirst)
425 /// .build(patterns);
426 /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
427 /// assert_eq!("x the z to the xage", result);
428 /// ```
429 pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
430 where
431 B: AsRef<str>,
432 {
433 assert_eq!(
434 replace_with.len(),
435 self.pattern_count(),
436 "replace_all requires a replacement for every pattern \
437 in the automaton"
438 );
439 let mut dst = String::with_capacity(haystack.len());
440 self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
441 dst.push_str(replace_with[mat.pattern()].as_ref());
442 true
443 });
444 dst
445 }
446
447 /// Replace all matches using raw bytes with a corresponding value in the
448 /// `replace_with` slice given. Matches correspond to the same matches as
449 /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
450 ///
451 /// Replacements are determined by the index of the matching pattern.
452 /// For example, if the pattern with index `2` is found, then it is
453 /// replaced by `replace_with[2]`.
454 ///
455 /// # Panics
456 ///
457 /// This panics when `replace_with.len()` does not equal the total number
458 /// of patterns that are matched by this automaton.
459 ///
460 /// # Examples
461 ///
462 /// Basic usage:
463 ///
464 /// ```
465 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
466 ///
467 /// let patterns = &["append", "appendage", "app"];
468 /// let haystack = b"append the app to the appendage";
469 ///
470 /// let ac = AhoCorasickBuilder::new()
471 /// .match_kind(MatchKind::LeftmostFirst)
472 /// .build(patterns);
473 /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
474 /// assert_eq!(b"x the z to the xage".to_vec(), result);
475 /// ```
476 pub fn replace_all_bytes<B>(
477 &self,
478 haystack: &[u8],
479 replace_with: &[B],
480 ) -> Vec<u8>
481 where
482 B: AsRef<[u8]>,
483 {
484 assert_eq!(
485 replace_with.len(),
486 self.pattern_count(),
487 "replace_all_bytes requires a replacement for every pattern \
488 in the automaton"
489 );
490 let mut dst = Vec::with_capacity(haystack.len());
491 self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
492 dst.extend(replace_with[mat.pattern()].as_ref());
493 true
494 });
495 dst
496 }
497
498 /// Replace all matches using a closure called on each match.
499 /// Matches correspond to the same matches as reported by
500 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
501 ///
502 /// The closure accepts three parameters: the match found, the text of
503 /// the match and a string buffer with which to write the replaced text
504 /// (if any). If the closure returns `true`, then it continues to the next
505 /// match. If the closure returns false, then searching is stopped.
506 ///
507 /// # Examples
508 ///
509 /// Basic usage:
510 ///
511 /// ```
512 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
513 ///
514 /// let patterns = &["append", "appendage", "app"];
515 /// let haystack = "append the app to the appendage";
516 ///
517 /// let ac = AhoCorasickBuilder::new()
518 /// .match_kind(MatchKind::LeftmostFirst)
519 /// .build(patterns);
520 /// let mut result = String::new();
521 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
522 /// dst.push_str(&mat.pattern().to_string());
523 /// true
524 /// });
525 /// assert_eq!("0 the 2 to the 0age", result);
526 /// ```
527 pub fn replace_all_with<F>(
528 &self,
529 haystack: &str,
530 dst: &mut String,
531 mut replace_with: F,
532 ) where
533 F: FnMut(&Match, &str, &mut String) -> bool,
534 {
535 let mut last_match = 0;
536 for mat in self.find_iter(haystack) {
537 dst.push_str(&haystack[last_match..mat.start()]);
538 last_match = mat.end();
539 replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
540 }
541 dst.push_str(&haystack[last_match..]);
542 }
543
544 /// Replace all matches using raw bytes with a closure called on each
545 /// match. Matches correspond to the same matches as reported by
546 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
547 ///
548 /// The closure accepts three parameters: the match found, the text of
549 /// the match and a byte buffer with which to write the replaced text
550 /// (if any). If the closure returns `true`, then it continues to the next
551 /// match. If the closure returns false, then searching is stopped.
552 ///
553 /// # Examples
554 ///
555 /// Basic usage:
556 ///
557 /// ```
558 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
559 ///
560 /// let patterns = &["append", "appendage", "app"];
561 /// let haystack = b"append the app to the appendage";
562 ///
563 /// let ac = AhoCorasickBuilder::new()
564 /// .match_kind(MatchKind::LeftmostFirst)
565 /// .build(patterns);
566 /// let mut result = vec![];
567 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
568 /// dst.extend(mat.pattern().to_string().bytes());
569 /// true
570 /// });
571 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
572 /// ```
573 pub fn replace_all_with_bytes<F>(
574 &self,
575 haystack: &[u8],
576 dst: &mut Vec<u8>,
577 mut replace_with: F,
578 ) where
579 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
580 {
581 let mut last_match = 0;
582 for mat in self.find_iter(haystack) {
583 dst.extend(&haystack[last_match..mat.start()]);
584 last_match = mat.end();
585 replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
586 }
587 dst.extend(&haystack[last_match..]);
588 }
589
590 /// Returns an iterator of non-overlapping matches in the given
591 /// stream. Matches correspond to the same matches as reported by
592 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
593 ///
594 /// The matches yielded by this iterator use absolute position offsets in
595 /// the stream given, where the first byte has index `0`. Matches are
596 /// yieled until the stream is exhausted.
597 ///
598 /// Each item yielded by the iterator is an `io::Result<Match>`, where an
599 /// error is yielded if there was a problem reading from the reader given.
600 ///
601 /// When searching a stream, an internal buffer is used. Therefore, callers
602 /// should avoiding providing a buffered reader, if possible.
603 ///
604 /// Searching a stream requires that the automaton was built with
605 /// `MatchKind::Standard` semantics. If this automaton was constructed
606 /// with leftmost semantics, then this method will panic. To determine
607 /// whether this will panic at runtime, use the
608 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
609 /// method.
610 ///
611 /// # Memory usage
612 ///
613 /// In general, searching streams will use a constant amount of memory for
614 /// its internal buffer. The one requirement is that the internal buffer
615 /// must be at least the size of the longest possible match. In most use
616 /// cases, the default buffer size will be much larger than any individual
617 /// match.
618 ///
619 /// # Panics
620 ///
621 /// This panics when `AhoCorasick::supports_stream` returns `false`.
622 /// That is, this panics when this automaton's match semantics are not
623 /// `MatchKind::Standard`. This restriction may be lifted in the future.
624 ///
625 /// # Examples
626 ///
627 /// Basic usage:
628 ///
629 /// ```
630 /// use aho_corasick::AhoCorasick;
631 ///
632 /// # fn example() -> Result<(), ::std::io::Error> {
633 /// let patterns = &["append", "appendage", "app"];
634 /// let haystack = "append the app to the appendage";
635 ///
636 /// let ac = AhoCorasick::new(patterns);
637 /// let mut matches = vec![];
638 /// for result in ac.stream_find_iter(haystack.as_bytes()) {
639 /// let mat = result?;
640 /// matches.push(mat.pattern());
641 /// }
642 /// assert_eq!(vec![2, 2, 2], matches);
643 /// # Ok(()) }; example().unwrap()
644 /// ```
645 pub fn stream_find_iter<'a, R: io::Read>(
646 &'a self,
647 rdr: R,
648 ) -> StreamFindIter<'a, R, S> {
649 StreamFindIter::new(self, rdr)
650 }
651
652 /// Search for and replace all matches of this automaton in
653 /// the given reader, and write the replacements to the given
654 /// writer. Matches correspond to the same matches as reported by
655 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
656 ///
657 /// Replacements are determined by the index of the matching pattern.
658 /// For example, if the pattern with index `2` is found, then it is
659 /// replaced by `replace_with[2]`.
660 ///
661 /// After all matches are replaced, the writer is _not_ flushed.
662 ///
663 /// If there was a problem reading from the given reader or writing to the
664 /// given writer, then the corresponding `io::Error` is returned and all
665 /// replacement is stopped.
666 ///
667 /// When searching a stream, an internal buffer is used. Therefore, callers
668 /// should avoiding providing a buffered reader, if possible. However,
669 /// callers may want to provide a buffered writer.
670 ///
671 /// Searching a stream requires that the automaton was built with
672 /// `MatchKind::Standard` semantics. If this automaton was constructed
673 /// with leftmost semantics, then this method will panic. To determine
674 /// whether this will panic at runtime, use the
675 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
676 /// method.
677 ///
678 /// # Memory usage
679 ///
680 /// In general, searching streams will use a constant amount of memory for
681 /// its internal buffer. The one requirement is that the internal buffer
682 /// must be at least the size of the longest possible match. In most use
683 /// cases, the default buffer size will be much larger than any individual
684 /// match.
685 ///
686 /// # Panics
687 ///
688 /// This panics when `AhoCorasick::supports_stream` returns `false`.
689 /// That is, this panics when this automaton's match semantics are not
690 /// `MatchKind::Standard`. This restriction may be lifted in the future.
691 ///
692 /// # Examples
693 ///
694 /// Basic usage:
695 ///
696 /// ```
697 /// use aho_corasick::AhoCorasick;
698 ///
699 /// # fn example() -> Result<(), ::std::io::Error> {
700 /// let patterns = &["fox", "brown", "quick"];
701 /// let haystack = "The quick brown fox.";
702 /// let replace_with = &["sloth", "grey", "slow"];
703 ///
704 /// let ac = AhoCorasick::new(patterns);
705 /// let mut result = vec![];
706 /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
707 /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
708 /// # Ok(()) }; example().unwrap()
709 /// ```
710 pub fn stream_replace_all<R, W, B>(
711 &self,
712 rdr: R,
713 wtr: W,
714 replace_with: &[B],
715 ) -> io::Result<()>
716 where
717 R: io::Read,
718 W: io::Write,
719 B: AsRef<[u8]>,
720 {
721 assert_eq!(
722 replace_with.len(),
723 self.pattern_count(),
724 "stream_replace_all requires a replacement for every pattern \
725 in the automaton"
726 );
727 self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
728 wtr.write_all(replace_with[mat.pattern()].as_ref())
729 })
730 }
731
732 /// Search the given reader and replace all matches of this automaton
733 /// using the given closure. The result is written to the given
734 /// writer. Matches correspond to the same matches as reported by
735 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
736 ///
737 /// The closure accepts three parameters: the match found, the text of
738 /// the match and the writer with which to write the replaced text
739 /// (if any). If the closure returns `true`, then it continues to the next
740 /// match. If the closure returns false, then searching is stopped.
741 ///
742 /// After all matches are replaced, the writer is _not_ flushed.
743 ///
744 /// If there was a problem reading from the given reader or writing to the
745 /// given writer, then the corresponding `io::Error` is returned and all
746 /// replacement is stopped.
747 ///
748 /// When searching a stream, an internal buffer is used. Therefore, callers
749 /// should avoiding providing a buffered reader, if possible. However,
750 /// callers may want to provide a buffered writer.
751 ///
752 /// Searching a stream requires that the automaton was built with
753 /// `MatchKind::Standard` semantics. If this automaton was constructed
754 /// with leftmost semantics, then this method will panic. To determine
755 /// whether this will panic at runtime, use the
756 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
757 /// method.
758 ///
759 /// # Memory usage
760 ///
761 /// In general, searching streams will use a constant amount of memory for
762 /// its internal buffer. The one requirement is that the internal buffer
763 /// must be at least the size of the longest possible match. In most use
764 /// cases, the default buffer size will be much larger than any individual
765 /// match.
766 ///
767 /// # Panics
768 ///
769 /// This panics when `AhoCorasick::supports_stream` returns `false`.
770 /// That is, this panics when this automaton's match semantics are not
771 /// `MatchKind::Standard`. This restriction may be lifted in the future.
772 ///
773 /// # Examples
774 ///
775 /// Basic usage:
776 ///
777 /// ```
778 /// use std::io::Write;
779 /// use aho_corasick::AhoCorasick;
780 ///
781 /// # fn example() -> Result<(), ::std::io::Error> {
782 /// let patterns = &["fox", "brown", "quick"];
783 /// let haystack = "The quick brown fox.";
784 ///
785 /// let ac = AhoCorasick::new(patterns);
786 /// let mut result = vec![];
787 /// ac.stream_replace_all_with(
788 /// haystack.as_bytes(),
789 /// &mut result,
790 /// |mat, _, wtr| {
791 /// wtr.write_all(mat.pattern().to_string().as_bytes())
792 /// },
793 /// )?;
794 /// assert_eq!(b"The 2 1 0.".to_vec(), result);
795 /// # Ok(()) }; example().unwrap()
796 /// ```
797 pub fn stream_replace_all_with<R, W, F>(
798 &self,
799 rdr: R,
800 mut wtr: W,
801 mut replace_with: F,
802 ) -> io::Result<()>
803 where
804 R: io::Read,
805 W: io::Write,
806 F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
807 {
808 let mut it = StreamChunkIter::new(self, rdr);
809 while let Some(result) = it.next() {
810 let chunk = result?;
811 match chunk {
812 StreamChunk::NonMatch { bytes, .. } => {
813 wtr.write_all(bytes)?;
814 }
815 StreamChunk::Match { bytes, mat } => {
816 replace_with(&mat, bytes, &mut wtr)?;
817 }
818 }
819 }
820 Ok(())
821 }
822
823 /// Returns the match kind used by this automaton.
824 ///
825 /// # Examples
826 ///
827 /// Basic usage:
828 ///
829 /// ```
830 /// use aho_corasick::{AhoCorasick, MatchKind};
831 ///
832 /// let ac = AhoCorasick::new(&[
833 /// "foo", "bar", "quux", "baz",
834 /// ]);
835 /// assert_eq!(&MatchKind::Standard, ac.match_kind());
836 /// ```
837 pub fn match_kind(&self) -> &MatchKind {
838 self.imp.match_kind()
839 }
840
841 /// Returns the length of the longest pattern matched by this automaton.
842 ///
843 /// # Examples
844 ///
845 /// Basic usage:
846 ///
847 /// ```
848 /// use aho_corasick::AhoCorasick;
849 ///
850 /// let ac = AhoCorasick::new(&[
851 /// "foo", "bar", "quux", "baz",
852 /// ]);
853 /// assert_eq!(4, ac.max_pattern_len());
854 /// ```
855 pub fn max_pattern_len(&self) -> usize {
856 self.imp.max_pattern_len()
857 }
858
859 /// Return the total number of patterns matched by this automaton.
860 ///
861 /// This includes patterns that may never participate in a match. For
862 /// example, if
863 /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
864 /// match semantics are used, and the patterns `Sam` and `Samwise` were
865 /// used to build the automaton, then `Samwise` can never participate in a
866 /// match because `Sam` will always take priority.
867 ///
868 /// # Examples
869 ///
870 /// Basic usage:
871 ///
872 /// ```
873 /// use aho_corasick::AhoCorasick;
874 ///
875 /// let ac = AhoCorasick::new(&[
876 /// "foo", "bar", "baz",
877 /// ]);
878 /// assert_eq!(3, ac.pattern_count());
879 /// ```
880 pub fn pattern_count(&self) -> usize {
881 self.imp.pattern_count()
882 }
883
884 /// Returns true if and only if this automaton supports reporting
885 /// overlapping matches.
886 ///
887 /// If this returns false and overlapping matches are requested, then it
888 /// will result in a panic.
889 ///
890 /// Since leftmost matching is inherently incompatible with overlapping
891 /// matches, only
892 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
893 /// supports overlapping matches. This is unlikely to change in the future.
894 ///
895 /// # Examples
896 ///
897 /// Basic usage:
898 ///
899 /// ```
900 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
901 ///
902 /// let ac = AhoCorasickBuilder::new()
903 /// .match_kind(MatchKind::Standard)
904 /// .build(&["foo", "bar", "baz"]);
905 /// assert!(ac.supports_overlapping());
906 ///
907 /// let ac = AhoCorasickBuilder::new()
908 /// .match_kind(MatchKind::LeftmostFirst)
909 /// .build(&["foo", "bar", "baz"]);
910 /// assert!(!ac.supports_overlapping());
911 /// ```
912 pub fn supports_overlapping(&self) -> bool {
913 self.match_kind.supports_overlapping()
914 }
915
916 /// Returns true if and only if this automaton supports stream searching.
917 ///
918 /// If this returns false and stream searching (or replacing) is attempted,
919 /// then it will result in a panic.
920 ///
921 /// Currently, only
922 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
923 /// supports streaming. This may be expanded in the future.
924 ///
925 /// # Examples
926 ///
927 /// Basic usage:
928 ///
929 /// ```
930 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
931 ///
932 /// let ac = AhoCorasickBuilder::new()
933 /// .match_kind(MatchKind::Standard)
934 /// .build(&["foo", "bar", "baz"]);
935 /// assert!(ac.supports_stream());
936 ///
937 /// let ac = AhoCorasickBuilder::new()
938 /// .match_kind(MatchKind::LeftmostFirst)
939 /// .build(&["foo", "bar", "baz"]);
940 /// assert!(!ac.supports_stream());
941 /// ```
942 pub fn supports_stream(&self) -> bool {
943 self.match_kind.supports_stream()
944 }
945
946 /// Returns the approximate total amount of heap used by this automaton, in
947 /// units of bytes.
948 ///
949 /// # Examples
950 ///
951 /// This example shows the difference in heap usage between a few
952 /// configurations:
953 ///
954 /// ```ignore
955 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
956 ///
957 /// let ac = AhoCorasickBuilder::new()
958 /// .dfa(false) // default
959 /// .build(&["foo", "bar", "baz"]);
960 /// assert_eq!(10_336, ac.heap_bytes());
961 ///
962 /// let ac = AhoCorasickBuilder::new()
963 /// .dfa(false) // default
964 /// .ascii_case_insensitive(true)
965 /// .build(&["foo", "bar", "baz"]);
966 /// assert_eq!(10_384, ac.heap_bytes());
967 ///
968 /// let ac = AhoCorasickBuilder::new()
969 /// .dfa(true)
970 /// .byte_classes(false)
971 /// .build(&["foo", "bar", "baz"]);
972 /// assert_eq!(20_768, ac.heap_bytes());
973 ///
974 /// let ac = AhoCorasickBuilder::new()
975 /// .dfa(true)
976 /// .byte_classes(true) // default
977 /// .build(&["foo", "bar", "baz"]);
978 /// assert_eq!(1_248, ac.heap_bytes());
979 ///
980 /// let ac = AhoCorasickBuilder::new()
981 /// .dfa(true)
982 /// .ascii_case_insensitive(true)
983 /// .build(&["foo", "bar", "baz"]);
984 /// assert_eq!(1_248, ac.heap_bytes());
985 /// ```
986 pub fn heap_bytes(&self) -> usize {
987 match self.imp {
988 Imp::NFA(ref nfa) => nfa.heap_bytes(),
989 Imp::DFA(ref dfa) => dfa.heap_bytes(),
990 }
991 }
992 }
993
994 /// The internal implementation of Aho-Corasick, which is either an NFA or
995 /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
996 /// more memory.
997 #[derive(Clone, Debug)]
998 enum Imp<S: StateID> {
999 NFA(NFA<S>),
1000 DFA(DFA<S>),
1001 }
1002
1003 impl<S: StateID> Imp<S> {
1004 /// Returns the type of match semantics implemented by this automaton.
1005 fn match_kind(&self) -> &MatchKind {
1006 match *self {
1007 Imp::NFA(ref nfa) => nfa.match_kind(),
1008 Imp::DFA(ref dfa) => dfa.match_kind(),
1009 }
1010 }
1011
1012 /// Returns the identifier of the start state.
1013 fn start_state(&self) -> S {
1014 match *self {
1015 Imp::NFA(ref nfa) => nfa.start_state(),
1016 Imp::DFA(ref dfa) => dfa.start_state(),
1017 }
1018 }
1019
1020 /// The length, in bytes, of the longest pattern in this automaton. This
1021 /// information is useful for maintaining correct buffer sizes when
1022 /// searching on streams.
1023 fn max_pattern_len(&self) -> usize {
1024 match *self {
1025 Imp::NFA(ref nfa) => nfa.max_pattern_len(),
1026 Imp::DFA(ref dfa) => dfa.max_pattern_len(),
1027 }
1028 }
1029
1030 /// The total number of patterns added to this automaton. This includes
1031 /// patterns that may never match. The maximum matching pattern that can be
1032 /// reported is exactly one less than this number.
1033 fn pattern_count(&self) -> usize {
1034 match *self {
1035 Imp::NFA(ref nfa) => nfa.pattern_count(),
1036 Imp::DFA(ref dfa) => dfa.pattern_count(),
1037 }
1038 }
1039
1040 #[inline(always)]
1041 fn overlapping_find_at(
1042 &self,
1043 prestate: &mut PrefilterState,
1044 haystack: &[u8],
1045 at: usize,
1046 state_id: &mut S,
1047 match_index: &mut usize,
1048 ) -> Option<Match> {
1049 match *self {
1050 Imp::NFA(ref nfa) => nfa.overlapping_find_at(
1051 prestate,
1052 haystack,
1053 at,
1054 state_id,
1055 match_index,
1056 ),
1057 Imp::DFA(ref dfa) => dfa.overlapping_find_at(
1058 prestate,
1059 haystack,
1060 at,
1061 state_id,
1062 match_index,
1063 ),
1064 }
1065 }
1066
1067 #[inline(always)]
1068 fn earliest_find_at(
1069 &self,
1070 prestate: &mut PrefilterState,
1071 haystack: &[u8],
1072 at: usize,
1073 state_id: &mut S,
1074 ) -> Option<Match> {
1075 match *self {
1076 Imp::NFA(ref nfa) => {
1077 nfa.earliest_find_at(prestate, haystack, at, state_id)
1078 }
1079 Imp::DFA(ref dfa) => {
1080 dfa.earliest_find_at(prestate, haystack, at, state_id)
1081 }
1082 }
1083 }
1084
1085 #[inline(always)]
1086 fn find_at_no_state(
1087 &self,
1088 prestate: &mut PrefilterState,
1089 haystack: &[u8],
1090 at: usize,
1091 ) -> Option<Match> {
1092 match *self {
1093 Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
1094 Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
1095 }
1096 }
1097 }
1098
1099 /// An iterator of non-overlapping matches in a particular haystack.
1100 ///
1101 /// This iterator yields matches according to the
1102 /// [`MatchKind`](enum.MatchKind.html)
1103 /// used by this automaton.
1104 ///
1105 /// This iterator is constructed via the
1106 /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
1107 /// method.
1108 ///
1109 /// The type variable `S` refers to the representation used for state
1110 /// identifiers. (By default, this is `usize`.)
1111 ///
1112 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1113 ///
1114 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1115 #[derive(Debug)]
1116 pub struct FindIter<'a, 'b, S: 'a + StateID> {
1117 fsm: &'a Imp<S>,
1118 prestate: PrefilterState,
1119 haystack: &'b [u8],
1120 pos: usize,
1121 }
1122
1123 impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
1124 fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
1125 let prestate = PrefilterState::new(ac.max_pattern_len());
1126 FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
1127 }
1128 }
1129
1130 impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
1131 type Item = Match;
1132
1133 fn next(&mut self) -> Option<Match> {
1134 if self.pos > self.haystack.len() {
1135 return None;
1136 }
1137 let result = self.fsm.find_at_no_state(
1138 &mut self.prestate,
1139 self.haystack,
1140 self.pos,
1141 );
1142 let mat = match result {
1143 None => return None,
1144 Some(mat) => mat,
1145 };
1146 if mat.end() == self.pos {
1147 // If the automaton can match the empty string and if we found an
1148 // empty match, then we need to forcefully move the position.
1149 self.pos += 1;
1150 } else {
1151 self.pos = mat.end();
1152 }
1153 Some(mat)
1154 }
1155 }
1156
1157 /// An iterator of overlapping matches in a particular haystack.
1158 ///
1159 /// This iterator will report all possible matches in a particular haystack,
1160 /// even when the matches overlap.
1161 ///
1162 /// This iterator is constructed via the
1163 /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
1164 /// method.
1165 ///
1166 /// The type variable `S` refers to the representation used for state
1167 /// identifiers. (By default, this is `usize`.)
1168 ///
1169 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1170 ///
1171 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1172 #[derive(Debug)]
1173 pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> {
1174 fsm: &'a Imp<S>,
1175 prestate: PrefilterState,
1176 haystack: &'b [u8],
1177 pos: usize,
1178 last_match_end: usize,
1179 state_id: S,
1180 match_index: usize,
1181 }
1182
1183 impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
1184 fn new(
1185 ac: &'a AhoCorasick<S>,
1186 haystack: &'b [u8],
1187 ) -> FindOverlappingIter<'a, 'b, S> {
1188 assert!(
1189 ac.supports_overlapping(),
1190 "automaton does not support overlapping searches"
1191 );
1192 let prestate = PrefilterState::new(ac.max_pattern_len());
1193 FindOverlappingIter {
1194 fsm: &ac.imp,
1195 prestate,
1196 haystack,
1197 pos: 0,
1198 last_match_end: 0,
1199 state_id: ac.imp.start_state(),
1200 match_index: 0,
1201 }
1202 }
1203 }
1204
1205 impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
1206 type Item = Match;
1207
1208 fn next(&mut self) -> Option<Match> {
1209 let result = self.fsm.overlapping_find_at(
1210 &mut self.prestate,
1211 self.haystack,
1212 self.pos,
1213 &mut self.state_id,
1214 &mut self.match_index,
1215 );
1216 match result {
1217 None => return None,
1218 Some(m) => {
1219 self.pos = m.end();
1220 Some(m)
1221 }
1222 }
1223 }
1224 }
1225
1226 /// An iterator that reports Aho-Corasick matches in a stream.
1227 ///
1228 /// This iterator yields elements of type `io::Result<Match>`, where an error
1229 /// is reported if there was a problem reading from the underlying stream.
1230 /// The iterator terminates only when the underlying stream reaches `EOF`.
1231 ///
1232 /// This iterator is constructed via the
1233 /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
1234 /// method.
1235 ///
1236 /// The type variable `R` refers to the `io::Read` stream that is being read
1237 /// from.
1238 ///
1239 /// The type variable `S` refers to the representation used for state
1240 /// identifiers. (By default, this is `usize`.)
1241 ///
1242 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1243 #[derive(Debug)]
1244 pub struct StreamFindIter<'a, R, S: 'a + StateID> {
1245 it: StreamChunkIter<'a, R, S>,
1246 }
1247
1248 impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
1249 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
1250 StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
1251 }
1252 }
1253
1254 impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
1255 type Item = io::Result<Match>;
1256
1257 fn next(&mut self) -> Option<io::Result<Match>> {
1258 loop {
1259 match self.it.next() {
1260 None => return None,
1261 Some(Err(err)) => return Some(Err(err)),
1262 Some(Ok(StreamChunk::NonMatch { .. })) => {}
1263 Some(Ok(StreamChunk::Match { mat, .. })) => {
1264 return Some(Ok(mat));
1265 }
1266 }
1267 }
1268 }
1269 }
1270
1271 /// An iterator over chunks in an underlying reader. Each chunk either
1272 /// corresponds to non-matching bytes or matching bytes, but all bytes from
1273 /// the underlying reader are reported in sequence. There may be an arbitrary
1274 /// number of non-matching chunks before seeing a matching chunk.
1275 ///
1276 /// N.B. This does not actually implement Iterator because we need to borrow
1277 /// from the underlying reader. But conceptually, it's still an iterator.
1278 #[derive(Debug)]
1279 struct StreamChunkIter<'a, R, S: 'a + StateID> {
1280 /// The AC automaton.
1281 fsm: &'a Imp<S>,
1282 /// State associated with this automaton's prefilter. It is a heuristic
1283 /// for stopping the prefilter if it's deemed ineffective.
1284 prestate: PrefilterState,
1285 /// The source of bytes we read from.
1286 rdr: R,
1287 /// A fixed size buffer. This is what we actually search. There are some
1288 /// invariants around the buffer's size, namely, it must be big enough to
1289 /// contain the longest possible match.
1290 buf: Buffer,
1291 /// The ID of the FSM state we're currently in.
1292 state_id: S,
1293 /// The current position at which to start the next search in `buf`.
1294 search_pos: usize,
1295 /// The absolute position of `search_pos`, where `0` corresponds to the
1296 /// position of the first byte read from `rdr`.
1297 absolute_pos: usize,
1298 /// The ending position of the last StreamChunk that was returned to the
1299 /// caller. This position is used to determine whether we need to emit
1300 /// non-matching bytes before emitting a match.
1301 report_pos: usize,
1302 /// A match that should be reported on the next call.
1303 pending_match: Option<Match>,
1304 /// Enabled only when the automaton can match the empty string. When
1305 /// enabled, we need to execute one final search after consuming the
1306 /// reader to find the trailing empty match.
1307 has_empty_match_at_end: bool,
1308 }
1309
1310 /// A single chunk yielded by the stream chunk iterator.
1311 ///
1312 /// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1313 #[derive(Debug)]
1314 enum StreamChunk<'r> {
1315 /// A chunk that does not contain any matches.
1316 NonMatch { bytes: &'r [u8], start: usize },
1317 /// A chunk that precisely contains a match.
1318 Match { bytes: &'r [u8], mat: Match },
1319 }
1320
1321 impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
1322 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
1323 assert!(
1324 ac.supports_stream(),
1325 "stream searching is only supported for Standard match semantics"
1326 );
1327
1328 let prestate = PrefilterState::new(ac.max_pattern_len());
1329 let buf = Buffer::new(ac.imp.max_pattern_len());
1330 let state_id = ac.imp.start_state();
1331 StreamChunkIter {
1332 fsm: &ac.imp,
1333 prestate,
1334 rdr,
1335 buf,
1336 state_id,
1337 absolute_pos: 0,
1338 report_pos: 0,
1339 search_pos: 0,
1340 pending_match: None,
1341 has_empty_match_at_end: ac.is_match(""),
1342 }
1343 }
1344
1345 fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
1346 loop {
1347 if let Some(mut mat) = self.pending_match.take() {
1348 let bytes = &self.buf.buffer()[mat.start()..mat.end()];
1349 self.report_pos = mat.end();
1350 mat = mat.increment(self.absolute_pos);
1351 return Some(Ok(StreamChunk::Match { bytes, mat }));
1352 }
1353 if self.search_pos >= self.buf.len() {
1354 if let Some(end) = self.unreported() {
1355 let bytes = &self.buf.buffer()[self.report_pos..end];
1356 let start = self.absolute_pos + self.report_pos;
1357 self.report_pos = end;
1358 return Some(Ok(StreamChunk::NonMatch { bytes, start }));
1359 }
1360 if self.buf.len() >= self.buf.min_buffer_len() {
1361 // This is the point at which we roll our buffer, which we
1362 // only do if our buffer has at least the minimum amount of
1363 // bytes in it. Before rolling, we update our various
1364 // positions to be consistent with the buffer after it has
1365 // been rolled.
1366
1367 self.report_pos -=
1368 self.buf.len() - self.buf.min_buffer_len();
1369 self.absolute_pos +=
1370 self.search_pos - self.buf.min_buffer_len();
1371 self.search_pos = self.buf.min_buffer_len();
1372 self.buf.roll();
1373 }
1374 match self.buf.fill(&mut self.rdr) {
1375 Err(err) => return Some(Err(err)),
1376 Ok(false) => {
1377 // We've hit EOF, but if there are still some
1378 // unreported bytes remaining, return them now.
1379 if self.report_pos < self.buf.len() {
1380 let bytes = &self.buf.buffer()[self.report_pos..];
1381 let start = self.absolute_pos + self.report_pos;
1382 self.report_pos = self.buf.len();
1383
1384 let chunk = StreamChunk::NonMatch { bytes, start };
1385 return Some(Ok(chunk));
1386 } else {
1387 // We've reported everything, but there might still
1388 // be a match at the very last position.
1389 if !self.has_empty_match_at_end {
1390 return None;
1391 }
1392 // fallthrough for another search to get trailing
1393 // empty matches
1394 self.has_empty_match_at_end = false;
1395 }
1396 }
1397 Ok(true) => {}
1398 }
1399 }
1400 let result = self.fsm.earliest_find_at(
1401 &mut self.prestate,
1402 self.buf.buffer(),
1403 self.search_pos,
1404 &mut self.state_id,
1405 );
1406 match result {
1407 None => {
1408 self.search_pos = self.buf.len();
1409 }
1410 Some(mat) => {
1411 self.state_id = self.fsm.start_state();
1412 if mat.end() == self.search_pos {
1413 // If the automaton can match the empty string and if
1414 // we found an empty match, then we need to forcefully
1415 // move the position.
1416 self.search_pos += 1;
1417 } else {
1418 self.search_pos = mat.end();
1419 }
1420 self.pending_match = Some(mat.clone());
1421 if self.report_pos < mat.start() {
1422 let bytes =
1423 &self.buf.buffer()[self.report_pos..mat.start()];
1424 let start = self.absolute_pos + self.report_pos;
1425 self.report_pos = mat.start();
1426
1427 let chunk = StreamChunk::NonMatch { bytes, start };
1428 return Some(Ok(chunk));
1429 }
1430 }
1431 }
1432 }
1433 }
1434
1435 fn unreported(&self) -> Option<usize> {
1436 let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
1437 if self.report_pos < end {
1438 Some(end)
1439 } else {
1440 None
1441 }
1442 }
1443 }
1444
1445 /// A builder for configuring an Aho-Corasick automaton.
1446 #[derive(Clone, Debug)]
1447 pub struct AhoCorasickBuilder {
1448 nfa_builder: nfa::Builder,
1449 dfa_builder: dfa::Builder,
1450 dfa: bool,
1451 }
1452
1453 impl Default for AhoCorasickBuilder {
1454 fn default() -> AhoCorasickBuilder {
1455 AhoCorasickBuilder::new()
1456 }
1457 }
1458
1459 impl AhoCorasickBuilder {
1460 /// Create a new builder for configuring an Aho-Corasick automaton.
1461 ///
1462 /// If you don't need fine grained configuration or aren't sure which knobs
1463 /// to set, try using
1464 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
1465 /// instead.
1466 pub fn new() -> AhoCorasickBuilder {
1467 AhoCorasickBuilder {
1468 nfa_builder: nfa::Builder::new(),
1469 dfa_builder: dfa::Builder::new(),
1470 dfa: false,
1471 }
1472 }
1473
1474 /// Build an Aho-Corasick automaton using the configuration set on this
1475 /// builder.
1476 ///
1477 /// A builder may be reused to create more automatons.
1478 ///
1479 /// This method will use the default for representing internal state
1480 /// identifiers, which is `usize`. This guarantees that building the
1481 /// automaton will succeed and is generally a good default, but can make
1482 /// the size of the automaton 2-8 times bigger than it needs to be,
1483 /// depending on your target platform.
1484 ///
1485 /// # Examples
1486 ///
1487 /// Basic usage:
1488 ///
1489 /// ```
1490 /// use aho_corasick::AhoCorasickBuilder;
1491 ///
1492 /// let patterns = &["foo", "bar", "baz"];
1493 /// let ac = AhoCorasickBuilder::new()
1494 /// .build(patterns);
1495 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1496 /// ```
1497 pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
1498 where
1499 I: IntoIterator<Item = P>,
1500 P: AsRef<[u8]>,
1501 {
1502 // The builder only returns an error if the chosen state ID
1503 // representation is too small to fit all of the given patterns. In
1504 // this case, since we fix the representation to usize, it will always
1505 // work because it's impossible to overflow usize since the underlying
1506 // storage would OOM long before that happens.
1507 self.build_with_size::<usize, I, P>(patterns)
1508 .expect("usize state ID type should always work")
1509 }
1510
1511 /// Build an Aho-Corasick automaton using the configuration set on this
1512 /// builder with a specific state identifier representation. This only has
1513 /// an effect when the `dfa` option is enabled.
1514 ///
1515 /// Generally, the choices for a state identifier representation are
1516 /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
1517 /// The advantage of choosing a smaller state identifier representation
1518 /// is that the automaton produced will be smaller. This might be
1519 /// beneficial for just generally using less space, or might even allow it
1520 /// to fit more of the automaton in your CPU's cache, leading to overall
1521 /// better search performance.
1522 ///
1523 /// Unlike the standard `build` method, this can report an error if the
1524 /// state identifier representation cannot support the size of the
1525 /// automaton.
1526 ///
1527 /// Note that the state identifier representation is determined by the
1528 /// `S` type variable. This requires a type hint of some sort, either
1529 /// by specifying the return type or using the turbofish, e.g.,
1530 /// `build_with_size::<u16, _, _>(...)`.
1531 ///
1532 /// # Examples
1533 ///
1534 /// Basic usage:
1535 ///
1536 /// ```
1537 /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1538 ///
1539 /// # fn example() -> Result<(), ::aho_corasick::Error> {
1540 /// let patterns = &["foo", "bar", "baz"];
1541 /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
1542 /// .build_with_size(patterns)?;
1543 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1544 /// # Ok(()) }; example().unwrap()
1545 /// ```
1546 ///
1547 /// Or alternatively, with turbofish:
1548 ///
1549 /// ```
1550 /// use aho_corasick::AhoCorasickBuilder;
1551 ///
1552 /// # fn example() -> Result<(), ::aho_corasick::Error> {
1553 /// let patterns = &["foo", "bar", "baz"];
1554 /// let ac = AhoCorasickBuilder::new()
1555 /// .build_with_size::<u8, _, _>(patterns)?;
1556 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1557 /// # Ok(()) }; example().unwrap()
1558 /// ```
1559 pub fn build_with_size<S, I, P>(
1560 &self,
1561 patterns: I,
1562 ) -> Result<AhoCorasick<S>>
1563 where
1564 S: StateID,
1565 I: IntoIterator<Item = P>,
1566 P: AsRef<[u8]>,
1567 {
1568 let nfa = self.nfa_builder.build(patterns)?;
1569 let match_kind = nfa.match_kind().clone();
1570 let imp = if self.dfa {
1571 let dfa = self.dfa_builder.build(&nfa)?;
1572 Imp::DFA(dfa)
1573 } else {
1574 Imp::NFA(nfa)
1575 };
1576 Ok(AhoCorasick { imp, match_kind })
1577 }
1578
1579 /// Automatically configure the settings on this builder according to the
1580 /// patterns that will be used to construct the automaton.
1581 ///
1582 /// The idea here is to balance space and time automatically. That is, when
1583 /// searching a small number of patterns, this will attempt to use the
1584 /// fastest possible configuration since the total space required will be
1585 /// small anyway. As the number of patterns grows, this will fall back to
1586 /// slower configurations that use less space.
1587 ///
1588 /// This is guaranteed to never set `match_kind`, but any other option may
1589 /// be overridden.
1590 ///
1591 /// # Examples
1592 ///
1593 /// Basic usage:
1594 ///
1595 /// ```
1596 /// use aho_corasick::AhoCorasickBuilder;
1597 ///
1598 /// let patterns = &["foo", "bar", "baz"];
1599 /// let ac = AhoCorasickBuilder::new()
1600 /// .auto_configure(patterns)
1601 /// .build(patterns);
1602 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1603 /// ```
1604 pub fn auto_configure<B: AsRef<[u8]>>(
1605 &mut self,
1606 patterns: &[B],
1607 ) -> &mut AhoCorasickBuilder {
1608 // N.B. Currently we only use the length of `patterns` to make a
1609 // decision here, and could therefore ask for an `ExactSizeIterator`
1610 // instead. But it's conceivable that we might adapt this to look at
1611 // the total number of bytes, which would requires a second pass.
1612 //
1613 // The logic here is fairly rudimentary at the moment, but probably
1614 // OK. The idea here is to use the fastest thing possible for a small
1615 // number of patterns. That is, a DFA with no byte classes, since byte
1616 // classes require an extra indirection for every byte searched. With a
1617 // moderate number of patterns, we still want a DFA, but save on both
1618 // space and compilation time by enabling byte classes. Finally, fall
1619 // back to the slower but smaller NFA.
1620 if patterns.len() <= 100 {
1621 // N.B. Using byte classes can actually be faster by improving
1622 // locality, but this only really applies for multi-megabyte
1623 // automata (i.e., automata that don't fit in your CPU's cache).
1624 self.dfa(true).byte_classes(false);
1625 } else if patterns.len() <= 5000 {
1626 self.dfa(true);
1627 }
1628 self
1629 }
1630
1631 /// Set the desired match semantics.
1632 ///
1633 /// The default is `MatchKind::Standard`, which corresponds to the match
1634 /// semantics supported by the standard textbook description of the
1635 /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
1636 /// are found. Moreover, this is the only way to get overlapping matches
1637 /// or do stream searching.
1638 ///
1639 /// The other kinds of match semantics that are supported are
1640 /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
1641 /// corresponds to the match you would get if you were to try to match
1642 /// each pattern at each position in the haystack in the same order that
1643 /// you give to the automaton. That is, it returns the leftmost match
1644 /// corresponding the earliest pattern given to the automaton. The latter
1645 /// corresponds to finding the longest possible match among all leftmost
1646 /// matches.
1647 ///
1648 /// For more details on match semantics, see the
1649 /// [documentation for `MatchKind`](enum.MatchKind.html).
1650 ///
1651 /// # Examples
1652 ///
1653 /// In these examples, we demonstrate the differences between match
1654 /// semantics for a particular set of patterns in a specific order:
1655 /// `b`, `abc`, `abcd`.
1656 ///
1657 /// Standard semantics:
1658 ///
1659 /// ```
1660 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1661 ///
1662 /// let patterns = &["b", "abc", "abcd"];
1663 /// let haystack = "abcd";
1664 ///
1665 /// let ac = AhoCorasickBuilder::new()
1666 /// .match_kind(MatchKind::Standard) // default, not necessary
1667 /// .build(patterns);
1668 /// let mat = ac.find(haystack).expect("should have a match");
1669 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
1670 /// ```
1671 ///
1672 /// Leftmost-first semantics:
1673 ///
1674 /// ```
1675 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1676 ///
1677 /// let patterns = &["b", "abc", "abcd"];
1678 /// let haystack = "abcd";
1679 ///
1680 /// let ac = AhoCorasickBuilder::new()
1681 /// .match_kind(MatchKind::LeftmostFirst)
1682 /// .build(patterns);
1683 /// let mat = ac.find(haystack).expect("should have a match");
1684 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
1685 /// ```
1686 ///
1687 /// Leftmost-longest semantics:
1688 ///
1689 /// ```
1690 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1691 ///
1692 /// let patterns = &["b", "abc", "abcd"];
1693 /// let haystack = "abcd";
1694 ///
1695 /// let ac = AhoCorasickBuilder::new()
1696 /// .match_kind(MatchKind::LeftmostLongest)
1697 /// .build(patterns);
1698 /// let mat = ac.find(haystack).expect("should have a match");
1699 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
1700 /// ```
1701 pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
1702 self.nfa_builder.match_kind(kind);
1703 self
1704 }
1705
1706 /// Enable anchored mode, which requires all matches to start at the
1707 /// first position in a haystack.
1708 ///
1709 /// This option is disabled by default.
1710 ///
1711 /// # Examples
1712 ///
1713 /// Basic usage:
1714 ///
1715 /// ```
1716 /// use aho_corasick::AhoCorasickBuilder;
1717 ///
1718 /// let patterns = &["foo", "bar"];
1719 /// let haystack = "foobar";
1720 ///
1721 /// let ac = AhoCorasickBuilder::new()
1722 /// .anchored(true)
1723 /// .build(patterns);
1724 /// assert_eq!(1, ac.find_iter(haystack).count());
1725 /// ```
1726 ///
1727 /// When searching for overlapping matches, all matches that start at
1728 /// the beginning of a haystack will be reported:
1729 ///
1730 /// ```
1731 /// use aho_corasick::AhoCorasickBuilder;
1732 ///
1733 /// let patterns = &["foo", "foofoo"];
1734 /// let haystack = "foofoo";
1735 ///
1736 /// let ac = AhoCorasickBuilder::new()
1737 /// .anchored(true)
1738 /// .build(patterns);
1739 /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
1740 /// // A non-anchored search would return 3 matches.
1741 /// ```
1742 pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1743 self.nfa_builder.anchored(yes);
1744 self
1745 }
1746
1747 /// Enable ASCII-aware case insensitive matching.
1748 ///
1749 /// When this option is enabled, searching will be performed without
1750 /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
1751 ///
1752 /// Enabling this option does not change the search algorithm, but it may
1753 /// increase the size of the automaton.
1754 ///
1755 /// **NOTE:** In the future, support for full Unicode case insensitivity
1756 /// may be added, but ASCII case insensitivity is comparatively much
1757 /// simpler to add.
1758 ///
1759 /// # Examples
1760 ///
1761 /// Basic usage:
1762 ///
1763 /// ```
1764 /// use aho_corasick::AhoCorasickBuilder;
1765 ///
1766 /// let patterns = &["FOO", "bAr", "BaZ"];
1767 /// let haystack = "foo bar baz";
1768 ///
1769 /// let ac = AhoCorasickBuilder::new()
1770 /// .ascii_case_insensitive(true)
1771 /// .build(patterns);
1772 /// assert_eq!(3, ac.find_iter(haystack).count());
1773 /// ```
1774 pub fn ascii_case_insensitive(
1775 &mut self,
1776 yes: bool,
1777 ) -> &mut AhoCorasickBuilder {
1778 self.nfa_builder.ascii_case_insensitive(yes);
1779 self
1780 }
1781
1782 /// Set the limit on how many NFA states use a dense representation for
1783 /// their transitions.
1784 ///
1785 /// A dense representation uses more space, but supports faster access to
1786 /// transitions at search time. Thus, this setting permits the control of a
1787 /// space vs time trade off when using the NFA variant of Aho-Corasick.
1788 ///
1789 /// This limit is expressed in terms of the depth of a state, i.e., the
1790 /// number of transitions from the starting state of the NFA. The idea is
1791 /// that most of the time searching will be spent near the starting state
1792 /// of the automaton, so states near the start state should use a dense
1793 /// representation. States further away from the start state would then use
1794 /// a sparse representation, which uses less space but is slower to access
1795 /// transitions at search time.
1796 ///
1797 /// By default, this is set to a low but non-zero number.
1798 ///
1799 /// This setting has no effect if the `dfa` option is enabled.
1800 pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
1801 self.nfa_builder.dense_depth(depth);
1802 self
1803 }
1804
1805 /// Compile the standard Aho-Corasick automaton into a deterministic finite
1806 /// automaton (DFA).
1807 ///
1808 /// When this is disabled (which is the default), then a non-deterministic
1809 /// finite automaton (NFA) is used instead.
1810 ///
1811 /// The main benefit to a DFA is that it can execute searches more quickly
1812 /// than a DFA (perhaps 2-4 times as fast). The main drawback is that the
1813 /// DFA uses more space and can take much longer to build.
1814 ///
1815 /// Enabling this option does not change the time complexity for
1816 /// constructing the Aho-Corasick automaton (which is `O(p)` where
1817 /// `p` is the total number of patterns being compiled). Enabling this
1818 /// option does however reduce the time complexity of non-overlapping
1819 /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
1820 /// haystack.
1821 ///
1822 /// In general, it's a good idea to enable this if you're searching a
1823 /// small number of fairly short patterns (~1000), or if you want the
1824 /// fastest possible search without regard to compilation time or space
1825 /// usage.
1826 pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1827 self.dfa = yes;
1828 self
1829 }
1830
1831 /// Enable heuristic prefilter optimizations.
1832 ///
1833 /// When enabled, searching will attempt to quickly skip to match
1834 /// candidates using specialized literal search routines. A prefilter
1835 /// cannot always be used, and is generally treated as a heuristic. It
1836 /// can be useful to disable this if the prefilter is observed to be
1837 /// sub-optimal for a particular workload.
1838 ///
1839 /// This is enabled by default.
1840 pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1841 self.nfa_builder.prefilter(yes);
1842 self
1843 }
1844
1845 /// Shrink the size of the transition alphabet by mapping bytes to their
1846 /// equivalence classes. This only has an effect when the `dfa` option is
1847 /// enabled.
1848 ///
1849 /// When enabled, each a DFA will use a map from all possible bytes
1850 /// to their corresponding equivalence class. Each equivalence class
1851 /// represents a set of bytes that does not discriminate between a match
1852 /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
1853 /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
1854 /// and `z`, and a final set that contains every other byte.
1855 ///
1856 /// The advantage of this map is that the size of the transition table can
1857 /// be reduced drastically from `#states * 256 * sizeof(id)` to
1858 /// `#states * k * sizeof(id)` where `k` is the number of equivalence
1859 /// classes. As a result, total space usage can decrease substantially.
1860 /// Moreover, since a smaller alphabet is used, compilation becomes faster
1861 /// as well.
1862 ///
1863 /// The disadvantage of this map is that every byte searched must be
1864 /// passed through this map before it can be used to determine the next
1865 /// transition. This has a small match time performance cost. However, if
1866 /// the DFA is otherwise very large without byte classes, then using byte
1867 /// classes can greatly improve memory locality and thus lead to better
1868 /// overall performance.
1869 ///
1870 /// This option is enabled by default.
1871 pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1872 self.dfa_builder.byte_classes(yes);
1873 self
1874 }
1875
1876 /// Premultiply state identifiers in the transition table. This only has
1877 /// an effect when the `dfa` option is enabled.
1878 ///
1879 /// When enabled, state identifiers are premultiplied to point to their
1880 /// corresponding row in the transition table. That is, given the `i`th
1881 /// state, its corresponding premultiplied identifier is `i * k` where `k`
1882 /// is the alphabet size of the automaton. (The alphabet size is at most
1883 /// 256, but is in practice smaller if byte classes is enabled.)
1884 ///
1885 /// When state identifiers are not premultiplied, then the identifier of
1886 /// the `i`th state is `i`.
1887 ///
1888 /// The advantage of premultiplying state identifiers is that is saves a
1889 /// multiplication instruction per byte when searching with a DFA. This has
1890 /// been observed to lead to a 20% performance benefit in micro-benchmarks.
1891 ///
1892 /// The primary disadvantage of premultiplying state identifiers is
1893 /// that they require a larger integer size to represent. For example,
1894 /// if the DFA has 200 states, then its premultiplied form requires 16
1895 /// bits to represent every possible state identifier, where as its
1896 /// non-premultiplied form only requires 8 bits.
1897 ///
1898 /// This option is enabled by default.
1899 pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1900 self.dfa_builder.premultiply(yes);
1901 self
1902 }
1903 }
1904
1905 /// A knob for controlling the match semantics of an Aho-Corasick automaton.
1906 ///
1907 /// There are two generally different ways that Aho-Corasick automatons can
1908 /// report matches. The first way is the "standard" approach that results from
1909 /// implementing most textbook explanations of Aho-Corasick. The second way is
1910 /// to report only the leftmost non-overlapping matches. The leftmost approach
1911 /// is in turn split into two different ways of resolving ambiguous matches:
1912 /// leftmost-first and leftmost-longest.
1913 ///
1914 /// The `Standard` match kind is the default and is the only one that supports
1915 /// overlapping matches and stream searching. (Trying to find overlapping
1916 /// or streaming matches using leftmost match semantics will result in a
1917 /// panic.) The `Standard` match kind will report matches as they are seen.
1918 /// When searching for overlapping matches, then all possible matches are
1919 /// reported. When searching for non-overlapping matches, the first match seen
1920 /// is reported. For example, for non-overlapping matches, given the patterns
1921 /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
1922 /// reported since it is detected first. The `abcd` match is never reported
1923 /// since it overlaps with the `b` match.
1924 ///
1925 /// In contrast, the leftmost match kind always prefers the leftmost match
1926 /// among all possible matches. Given the same example as above with `abcd` and
1927 /// `b` as patterns and `abcdef` as the subject string, the leftmost match is
1928 /// `abcd` since it begins before the `b` match, even though the `b` match is
1929 /// detected before the `abcd` match. In this case, the `b` match is not
1930 /// reported at all since it overlaps with the `abcd` match.
1931 ///
1932 /// The difference between leftmost-first and leftmost-longest is in how they
1933 /// resolve ambiguous matches when there are multiple leftmost matches to
1934 /// choose from. Leftmost-first always chooses the pattern that was provided
1935 /// earliest, where as leftmost-longest always chooses the longest matching
1936 /// pattern. For example, given the patterns `a` and `ab` and the subject
1937 /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
1938 /// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1939 /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1940 /// would be `ab`. Stated differently, the leftmost-first match depends on the
1941 /// order in which the patterns were given to the Aho-Corasick automaton.
1942 /// Because of that, when leftmost-first matching is used, if a pattern `A`
1943 /// that appears before a pattern `B` is a prefix of `B`, then it is impossible
1944 /// to ever observe a match of `B`.
1945 ///
1946 /// If you're not sure which match kind to pick, then stick with the standard
1947 /// kind, which is the default. In particular, if you need overlapping or
1948 /// streaming matches, then you _must_ use the standard kind. The leftmost
1949 /// kinds are useful in specific circumstances. For example, leftmost-first can
1950 /// be very useful as a way to implement match priority based on the order of
1951 /// patterns given and leftmost-longest can be useful for dictionary searching
1952 /// such that only the longest matching words are reported.
1953 ///
1954 /// # Relationship with regular expression alternations
1955 ///
1956 /// Understanding match semantics can be a little tricky, and one easy way
1957 /// to conceptualize non-overlapping matches from an Aho-Corasick automaton
1958 /// is to think about them as a simple alternation of literals in a regular
1959 /// expression. For example, let's say we wanted to match the strings
1960 /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
1961 /// turns out that regular expression engines have two different ways of
1962 /// matching this alternation. The first way, leftmost-longest, is commonly
1963 /// found in POSIX compatible implementations of regular expressions (such as
1964 /// `grep`). The second way, leftmost-first, is commonly found in backtracking
1965 /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
1966 /// regex engine do not use backtracking, but still implement leftmost-first
1967 /// semantics in an effort to match the behavior of dominant backtracking
1968 /// regex engines such as those found in Perl, Ruby, Python, Javascript and
1969 /// PHP.)
1970 ///
1971 /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
1972 /// will match `Samwise` because it is the longest possible match, but a
1973 /// Perl-like regex will match `Sam` since it appears earlier in the
1974 /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
1975 /// will never match `Samwise` since `Sam` will always have higher priority.
1976 /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
1977 /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
1978 /// still longest match, but it also appears earlier than `Sam`.
1979 ///
1980 /// The "standard" match semantics of Aho-Corasick generally don't correspond
1981 /// to the match semantics of any large group of regex implementations, so
1982 /// there's no direct analogy that can be made here. Standard match semantics
1983 /// are generally useful for overlapping matches, or if you just want to see
1984 /// matches as they are detected.
1985 ///
1986 /// The main conclusion to draw from this section is that the match semantics
1987 /// can be tweaked to precisely match either Perl-like regex alternations or
1988 /// POSIX regex alternations.
1989 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1990 pub enum MatchKind {
1991 /// Use standard match semantics, which support overlapping matches. When
1992 /// used with non-overlapping matches, matches are reported as they are
1993 /// seen.
1994 Standard,
1995 /// Use leftmost-first match semantics, which reports leftmost matches.
1996 /// When there are multiple possible leftmost matches, the match
1997 /// corresponding to the pattern that appeared earlier when constructing
1998 /// the automaton is reported.
1999 ///
2000 /// This does **not** support overlapping matches or stream searching. If
2001 /// this match kind is used, attempting to find overlapping matches or
2002 /// stream matches will panic.
2003 LeftmostFirst,
2004 /// Use leftmost-longest match semantics, which reports leftmost matches.
2005 /// When there are multiple possible leftmost matches, the longest match
2006 /// is chosen.
2007 ///
2008 /// This does **not** support overlapping matches or stream searching. If
2009 /// this match kind is used, attempting to find overlapping matches or
2010 /// stream matches will panic.
2011 LeftmostLongest,
2012 /// Hints that destructuring should not be exhaustive.
2013 ///
2014 /// This enum may grow additional variants, so this makes sure clients
2015 /// don't count on exhaustive matching. (Otherwise, adding a new variant
2016 /// could break existing code.)
2017 #[doc(hidden)]
2018 __Nonexhaustive,
2019 }
2020
2021 /// The default match kind is `MatchKind::Standard`.
2022 impl Default for MatchKind {
2023 fn default() -> MatchKind {
2024 MatchKind::Standard
2025 }
2026 }
2027
2028 impl MatchKind {
2029 fn supports_overlapping(&self) -> bool {
2030 self.is_standard()
2031 }
2032
2033 fn supports_stream(&self) -> bool {
2034 // TODO: It may be possible to support this. It's hard.
2035 //
2036 // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
2037 self.is_standard()
2038 }
2039
2040 pub(crate) fn is_standard(&self) -> bool {
2041 *self == MatchKind::Standard
2042 }
2043
2044 pub(crate) fn is_leftmost(&self) -> bool {
2045 *self == MatchKind::LeftmostFirst
2046 || *self == MatchKind::LeftmostLongest
2047 }
2048
2049 pub(crate) fn is_leftmost_first(&self) -> bool {
2050 *self == MatchKind::LeftmostFirst
2051 }
2052
2053 /// Convert this match kind into a packed match kind. If this match kind
2054 /// corresponds to standard semantics, then this returns None, since
2055 /// packed searching does not support standard semantics.
2056 pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
2057 match *self {
2058 MatchKind::Standard => None,
2059 MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
2060 MatchKind::LeftmostLongest => {
2061 Some(packed::MatchKind::LeftmostLongest)
2062 }
2063 MatchKind::__Nonexhaustive => unreachable!(),
2064 }
2065 }
2066 }
2067
2068 #[cfg(test)]
2069 mod tests {
2070 use super::*;
2071
2072 #[test]
2073 fn oibits() {
2074 use std::panic::{RefUnwindSafe, UnwindSafe};
2075
2076 fn assert_send<T: Send>() {}
2077 fn assert_sync<T: Sync>() {}
2078 fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
2079
2080 assert_send::<AhoCorasick>();
2081 assert_sync::<AhoCorasick>();
2082 assert_unwind_safe::<AhoCorasick>();
2083 assert_send::<AhoCorasickBuilder>();
2084 assert_sync::<AhoCorasickBuilder>();
2085 assert_unwind_safe::<AhoCorasickBuilder>();
2086 }
2087 }