3 use packed
::pattern
::Patterns
;
4 use packed
::rabinkarp
::RabinKarp
;
5 use packed
::teddy
::{self, Teddy}
;
8 /// This is a limit placed on the total number of patterns we're willing to try
9 /// and match at once. As more sophisticated algorithms are added, this number
11 const PATTERN_LIMIT
: usize = 128;
13 /// A knob for controlling the match semantics of a packed multiple string
16 /// This differs from the
17 /// [`MatchKind`](../enum.MatchKind.html)
18 /// type in the top-level crate module in that it doesn't support
19 /// "standard" match semantics, and instead only supports leftmost-first or
20 /// leftmost-longest. Namely, "standard" semantics cannot be easily supported
21 /// by packed searchers.
23 /// For more information on the distinction between leftmost-first and
24 /// leftmost-longest, see the docs on the top-level `MatchKind` type.
26 /// Unlike the top-level `MatchKind` type, the default match semantics for this
27 /// type are leftmost-first.
28 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
30 /// Use leftmost-first match semantics, which reports leftmost matches.
31 /// When there are multiple possible leftmost matches, the match
32 /// corresponding to the pattern that appeared earlier when constructing
33 /// the automaton is reported.
35 /// This is the default.
37 /// Use leftmost-longest match semantics, which reports leftmost matches.
38 /// When there are multiple possible leftmost matches, the longest match
41 /// Hints that destructuring should not be exhaustive.
43 /// This enum may grow additional variants, so this makes sure clients
44 /// don't count on exhaustive matching. (Otherwise, adding a new variant
45 /// could break existing code.)
50 impl Default
for MatchKind
{
51 fn default() -> MatchKind
{
52 MatchKind
::LeftmostFirst
56 /// The configuration for a packed multiple pattern searcher.
58 /// The configuration is currently limited only to being able to select the
59 /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
60 /// future, more knobs may be made available.
62 /// A configuration produces a [`packed::Builder`](struct.Builder.html), which
63 /// in turn can be used to construct a
64 /// [`packed::Searcher`](struct.Searcher.html) for searching.
68 /// This example shows how to use leftmost-longest semantics instead of the
69 /// default (leftmost-first).
72 /// use aho_corasick::packed::{Config, MatchKind};
74 /// # fn example() -> Option<()> {
75 /// let searcher = Config::new()
76 /// .match_kind(MatchKind::LeftmostLongest)
81 /// let matches: Vec<usize> = searcher
82 /// .find_iter("foobar")
83 /// .map(|mat| mat.pattern())
85 /// assert_eq!(vec![1], matches);
87 /// # if cfg!(target_arch = "x86_64") {
88 /// # example().unwrap()
90 /// # assert!(example().is_none());
93 #[derive(Clone, Debug)]
96 force
: Option
<ForceAlgorithm
>,
97 force_teddy_fat
: Option
<bool
>,
98 force_avx
: Option
<bool
>,
101 /// An internal option for forcing the use of a particular packed algorithm.
103 /// When an algorithm is forced, if a searcher could not be constructed for it,
104 /// then no searcher will be returned even if an alternative algorithm would
106 #[derive(Clone, Debug)]
107 enum ForceAlgorithm
{
112 impl Default
for Config
{
113 fn default() -> Config
{
119 /// Create a new default configuration. A default configuration uses
120 /// leftmost-first match semantics.
121 pub fn new() -> Config
{
123 kind
: MatchKind
::LeftmostFirst
,
125 force_teddy_fat
: None
,
130 /// Create a packed builder from this configuration. The builder can be
131 /// used to accumulate patterns and create a
132 /// [`Searcher`](struct.Searcher.html)
134 pub fn builder(&self) -> Builder
{
135 Builder
::from_config(self.clone())
138 /// Set the match semantics for this configuration.
139 pub fn match_kind(&mut self, kind
: MatchKind
) -> &mut Config
{
144 /// An undocumented method for forcing the use of the Teddy algorithm.
146 /// This is only exposed for more precise testing and benchmarks. Callers
147 /// should not use it as it is not part of the API stability guarantees of
150 pub fn force_teddy(&mut self, yes
: bool
) -> &mut Config
{
152 self.force
= Some(ForceAlgorithm
::Teddy
);
159 /// An undocumented method for forcing the use of the Fat Teddy algorithm.
161 /// This is only exposed for more precise testing and benchmarks. Callers
162 /// should not use it as it is not part of the API stability guarantees of
165 pub fn force_teddy_fat(&mut self, yes
: Option
<bool
>) -> &mut Config
{
166 self.force_teddy_fat
= yes
;
170 /// An undocumented method for forcing the use of SSE (`Some(false)`) or
171 /// AVX (`Some(true)`) algorithms.
173 /// This is only exposed for more precise testing and benchmarks. Callers
174 /// should not use it as it is not part of the API stability guarantees of
177 pub fn force_avx(&mut self, yes
: Option
<bool
>) -> &mut Config
{
178 self.force_avx
= yes
;
182 /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
184 /// This is only exposed for more precise testing and benchmarks. Callers
185 /// should not use it as it is not part of the API stability guarantees of
188 pub fn force_rabin_karp(&mut self, yes
: bool
) -> &mut Config
{
190 self.force
= Some(ForceAlgorithm
::RabinKarp
);
198 /// A builder for constructing a packed searcher from a collection of patterns.
202 /// This example shows how to use a builder to construct a searcher. By
203 /// default, leftmost-first match semantics are used.
206 /// use aho_corasick::packed::{Builder, MatchKind};
208 /// # fn example() -> Option<()> {
209 /// let searcher = Builder::new()
213 /// let matches: Vec<usize> = searcher
214 /// .find_iter("foobar")
215 /// .map(|mat| mat.pattern())
217 /// assert_eq!(vec![0], matches);
219 /// # if cfg!(target_arch = "x86_64") {
220 /// # example().unwrap()
222 /// # assert!(example().is_none());
225 #[derive(Clone, Debug)]
227 /// The configuration of this builder and subsequent matcher.
229 /// Set to true if the builder detects that a matcher cannot be built.
231 /// The patterns provided by the caller.
236 /// Create a new builder for constructing a multi-pattern searcher. This
237 /// constructor uses the default configuration.
238 pub fn new() -> Builder
{
239 Builder
::from_config(Config
::new())
242 fn from_config(config
: Config
) -> Builder
{
243 Builder { config, inert: false, patterns: Patterns::new() }
246 /// Build a searcher from the patterns added to this builder so far.
247 pub fn build(&self) -> Option
<Searcher
> {
248 if self.inert
|| self.patterns
.is_empty() {
251 let mut patterns
= self.patterns
.clone();
252 patterns
.set_match_kind(self.config
.kind
);
253 let rabinkarp
= RabinKarp
::new(&patterns
);
254 // Effectively, we only want to return a searcher if we can use Teddy,
255 // since Teddy is our only fast packed searcher at the moment.
256 // Rabin-Karp is only used when searching haystacks smaller than what
257 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
258 // is to force it using undocumented APIs (for tests/benchmarks).
259 let (search_kind
, minimum_len
) = match self.config
.force
{
260 None
| Some(ForceAlgorithm
::Teddy
) => {
261 let teddy
= match self.build_teddy(&patterns
) {
263 Some(teddy
) => teddy
,
265 let minimum_len
= teddy
.minimum_len();
266 (SearchKind
::Teddy(teddy
), minimum_len
)
268 Some(ForceAlgorithm
::RabinKarp
) => (SearchKind
::RabinKarp
, 0),
271 config
: self.config
.clone(),
279 fn build_teddy(&self, patterns
: &Patterns
) -> Option
<Teddy
> {
280 teddy
::Builder
::new()
281 .avx(self.config
.force_avx
)
282 .fat(self.config
.force_teddy_fat
)
286 /// Add the given pattern to this set to match.
288 /// The order in which patterns are added is significant. Namely, when
289 /// using leftmost-first match semantics, then when multiple patterns can
290 /// match at a particular location, the pattern that was added first is
291 /// used as the match.
293 /// If the number of patterns added exceeds the amount supported by packed
294 /// searchers, then the builder will stop accumulating patterns and render
295 /// itself inert. At this point, constructing a searcher will always return
297 pub fn add
<P
: AsRef
<[u8]>>(&mut self, pattern
: P
) -> &mut Builder
{
300 } else if self.patterns
.len() >= PATTERN_LIMIT
{
302 self.patterns
.reset();
305 // Just in case PATTERN_LIMIT increases beyond u16::MAX.
306 assert
!(self.patterns
.len() <= u16::MAX
as usize);
308 let pattern
= pattern
.as_ref();
309 if pattern
.is_empty() {
311 self.patterns
.reset();
314 self.patterns
.add(pattern
);
318 /// Add the given iterator of patterns to this set to match.
320 /// The iterator must yield elements that can be converted into a `&[u8]`.
322 /// The order in which patterns are added is significant. Namely, when
323 /// using leftmost-first match semantics, then when multiple patterns can
324 /// match at a particular location, the pattern that was added first is
325 /// used as the match.
327 /// If the number of patterns added exceeds the amount supported by packed
328 /// searchers, then the builder will stop accumulating patterns and render
329 /// itself inert. At this point, constructing a searcher will always return
331 pub fn extend
<I
, P
>(&mut self, patterns
: I
) -> &mut Builder
333 I
: IntoIterator
<Item
= P
>,
343 impl Default
for Builder
{
344 fn default() -> Builder
{
349 /// A packed searcher for quickly finding occurrences of multiple patterns.
351 /// If callers need more flexible construction, or if one wants to change the
352 /// match semantics (either leftmost-first or leftmost-longest), then one can
353 /// use the [`Config`](struct.Config.html) and/or
354 /// [`Builder`](struct.Builder.html) types for more fine grained control.
358 /// This example shows how to create a searcher from an iterator of patterns.
359 /// By default, leftmost-first match semantics are used.
362 /// use aho_corasick::packed::{MatchKind, Searcher};
364 /// # fn example() -> Option<()> {
365 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
366 /// let matches: Vec<usize> = searcher
367 /// .find_iter("foobar")
368 /// .map(|mat| mat.pattern())
370 /// assert_eq!(vec![0], matches);
372 /// # if cfg!(target_arch = "x86_64") {
373 /// # example().unwrap()
375 /// # assert!(example().is_none());
378 #[derive(Clone, Debug)]
379 pub struct Searcher
{
382 rabinkarp
: RabinKarp
,
383 search_kind
: SearchKind
,
387 #[derive(Clone, Debug)]
394 /// A convenience function for constructing a searcher from an iterator
395 /// of things that can be converted to a `&[u8]`.
397 /// If a searcher could not be constructed (either because of an
398 /// unsupported CPU or because there are too many patterns), then `None`
406 /// use aho_corasick::packed::{MatchKind, Searcher};
408 /// # fn example() -> Option<()> {
409 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
410 /// let matches: Vec<usize> = searcher
411 /// .find_iter("foobar")
412 /// .map(|mat| mat.pattern())
414 /// assert_eq!(vec![0], matches);
416 /// # if cfg!(target_arch = "x86_64") {
417 /// # example().unwrap()
419 /// # assert!(example().is_none());
422 pub fn new
<I
, P
>(patterns
: I
) -> Option
<Searcher
>
424 I
: IntoIterator
<Item
= P
>,
427 Builder
::new().extend(patterns
).build()
430 /// Return the first occurrence of any of the patterns in this searcher,
431 /// according to its match semantics, in the given haystack. The `Match`
432 /// returned will include the identifier of the pattern that matched, which
433 /// corresponds to the index of the pattern (starting from `0`) in which it
441 /// use aho_corasick::packed::{MatchKind, Searcher};
443 /// # fn example() -> Option<()> {
444 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
445 /// let mat = searcher.find("foobar")?;
446 /// assert_eq!(0, mat.pattern());
447 /// assert_eq!(0, mat.start());
448 /// assert_eq!(6, mat.end());
450 /// # if cfg!(target_arch = "x86_64") {
451 /// # example().unwrap()
453 /// # assert!(example().is_none());
456 pub fn find
<B
: AsRef
<[u8]>>(&self, haystack
: B
) -> Option
<Match
> {
457 self.find_at(haystack
, 0)
460 /// Return the first occurrence of any of the patterns in this searcher,
461 /// according to its match semantics, in the given haystack starting from
462 /// the given position.
464 /// The `Match` returned will include the identifier of the pattern that
465 /// matched, which corresponds to the index of the pattern (starting from
466 /// `0`) in which it was added. The offsets in the `Match` will be relative
467 /// to the start of `haystack` (and not `at`).
474 /// use aho_corasick::packed::{MatchKind, Searcher};
476 /// # fn example() -> Option<()> {
477 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
478 /// let mat = searcher.find_at("foofoobar", 3)?;
479 /// assert_eq!(0, mat.pattern());
480 /// assert_eq!(3, mat.start());
481 /// assert_eq!(9, mat.end());
483 /// # if cfg!(target_arch = "x86_64") {
484 /// # example().unwrap()
486 /// # assert!(example().is_none());
489 pub fn find_at
<B
: AsRef
<[u8]>>(
494 let haystack
= haystack
.as_ref();
495 match self.search_kind
{
496 SearchKind
::Teddy(ref teddy
) => {
497 if haystack
[at
..].len() < teddy
.minimum_len() {
498 return self.slow_at(haystack
, at
);
500 teddy
.find_at(&self.patterns
, haystack
, at
)
502 SearchKind
::RabinKarp
=> {
503 self.rabinkarp
.find_at(&self.patterns
, haystack
, at
)
508 /// Return an iterator of non-overlapping occurrences of the patterns in
509 /// this searcher, according to its match semantics, in the given haystack.
516 /// use aho_corasick::packed::{MatchKind, Searcher};
518 /// # fn example() -> Option<()> {
519 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
520 /// let matches: Vec<usize> = searcher
521 /// .find_iter("foobar fooba foofoo")
522 /// .map(|mat| mat.pattern())
524 /// assert_eq!(vec![0, 1, 1, 1], matches);
526 /// # if cfg!(target_arch = "x86_64") {
527 /// # example().unwrap()
529 /// # assert!(example().is_none());
532 pub fn find_iter
<'a
, 'b
, B
: ?Sized
+ AsRef
<[u8]>>(
535 ) -> FindIter
<'a
, 'b
> {
536 FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
539 /// Returns the match kind used by this packed searcher.
546 /// use aho_corasick::packed::{MatchKind, Searcher};
548 /// # fn example() -> Option<()> {
549 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
550 /// // leftmost-first is the default.
551 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
553 /// # if cfg!(target_arch = "x86_64") {
554 /// # example().unwrap()
556 /// # assert!(example().is_none());
559 pub fn match_kind(&self) -> &MatchKind
{
560 self.patterns
.match_kind()
563 /// Returns the minimum length of a haystack that is required in order for
564 /// packed searching to be effective.
566 /// In some cases, the underlying packed searcher may not be able to search
567 /// very short haystacks. When that occurs, the implementation will defer
568 /// to a slower non-packed searcher (which is still generally faster than
569 /// Aho-Corasick for a small number of patterns). However, callers may
570 /// want to avoid ever using the slower variant, which one can do by
571 /// never passing a haystack shorter than the minimum length returned by
573 pub fn minimum_len(&self) -> usize {
577 /// Returns the approximate total amount of heap used by this searcher, in
579 pub fn heap_bytes(&self) -> usize {
580 self.patterns
.heap_bytes()
581 + self.rabinkarp
.heap_bytes()
582 + self.search_kind
.heap_bytes()
585 /// Use a slow (non-packed) searcher.
587 /// This is useful when a packed searcher could be constructed, but could
588 /// not be used to search a specific haystack. For example, if Teddy was
589 /// built but the haystack is smaller than ~34 bytes, then Teddy might not
591 fn slow_at(&self, haystack
: &[u8], at
: usize) -> Option
<Match
> {
592 self.rabinkarp
.find_at(&self.patterns
, haystack
, at
)
597 fn heap_bytes(&self) -> usize {
599 SearchKind
::Teddy(ref ted
) => ted
.heap_bytes(),
600 SearchKind
::RabinKarp
=> 0,
605 /// An iterator over non-overlapping matches from a packed searcher.
607 /// The lifetime `'s` refers to the lifetime of the underlying
608 /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
609 /// lifetime of the haystack being searched.
611 pub struct FindIter
<'s
, 'h
> {
612 searcher
: &'s Searcher
,
617 impl<'s
, 'h
> Iterator
for FindIter
<'s
, 'h
> {
620 fn next(&mut self) -> Option
<Match
> {
621 if self.at
> self.haystack
.len() {
624 match self.searcher
.find_at(&self.haystack
, self.at
) {