]> git.proxmox.com Git - rustc.git/blob - vendor/aho-corasick/src/packed/api.rs
New upstream version 1.45.0+dfsg1
[rustc.git] / vendor / aho-corasick / src / packed / api.rs
1 use std::u16;
2
3 use packed::pattern::Patterns;
4 use packed::rabinkarp::RabinKarp;
5 use packed::teddy::{self, Teddy};
6 use Match;
7
8 /// This is a limit placed on the total number of patterns we're willing to try
9 /// and match at once. As more sophisticated algorithms are added, this number
10 /// may be increased.
11 const PATTERN_LIMIT: usize = 128;
12
13 /// A knob for controlling the match semantics of a packed multiple string
14 /// searcher.
15 ///
16 /// This differs from the
17 /// [`MatchKind`](../enum.MatchKind.html)
18 /// type in the top-level crate module in that it doesn't support
19 /// "standard" match semantics, and instead only supports leftmost-first or
20 /// leftmost-longest. Namely, "standard" semantics cannot be easily supported
21 /// by packed searchers.
22 ///
23 /// For more information on the distinction between leftmost-first and
24 /// leftmost-longest, see the docs on the top-level `MatchKind` type.
25 ///
26 /// Unlike the top-level `MatchKind` type, the default match semantics for this
27 /// type are leftmost-first.
28 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
29 pub enum MatchKind {
30 /// Use leftmost-first match semantics, which reports leftmost matches.
31 /// When there are multiple possible leftmost matches, the match
32 /// corresponding to the pattern that appeared earlier when constructing
33 /// the automaton is reported.
34 ///
35 /// This is the default.
36 LeftmostFirst,
37 /// Use leftmost-longest match semantics, which reports leftmost matches.
38 /// When there are multiple possible leftmost matches, the longest match
39 /// is chosen.
40 LeftmostLongest,
41 /// Hints that destructuring should not be exhaustive.
42 ///
43 /// This enum may grow additional variants, so this makes sure clients
44 /// don't count on exhaustive matching. (Otherwise, adding a new variant
45 /// could break existing code.)
46 #[doc(hidden)]
47 __Nonexhaustive,
48 }
49
50 impl Default for MatchKind {
51 fn default() -> MatchKind {
52 MatchKind::LeftmostFirst
53 }
54 }
55
56 /// The configuration for a packed multiple pattern searcher.
57 ///
58 /// The configuration is currently limited only to being able to select the
59 /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
60 /// future, more knobs may be made available.
61 ///
62 /// A configuration produces a [`packed::Builder`](struct.Builder.html), which
63 /// in turn can be used to construct a
64 /// [`packed::Searcher`](struct.Searcher.html) for searching.
65 ///
66 /// # Example
67 ///
68 /// This example shows how to use leftmost-longest semantics instead of the
69 /// default (leftmost-first).
70 ///
71 /// ```
72 /// use aho_corasick::packed::{Config, MatchKind};
73 ///
74 /// # fn example() -> Option<()> {
75 /// let searcher = Config::new()
76 /// .match_kind(MatchKind::LeftmostLongest)
77 /// .builder()
78 /// .add("foo")
79 /// .add("foobar")
80 /// .build()?;
81 /// let matches: Vec<usize> = searcher
82 /// .find_iter("foobar")
83 /// .map(|mat| mat.pattern())
84 /// .collect();
85 /// assert_eq!(vec![1], matches);
86 /// # Some(()) }
87 /// # if cfg!(target_arch = "x86_64") {
88 /// # example().unwrap()
89 /// # } else {
90 /// # assert!(example().is_none());
91 /// # }
92 /// ```
93 #[derive(Clone, Debug)]
94 pub struct Config {
95 kind: MatchKind,
96 force: Option<ForceAlgorithm>,
97 force_teddy_fat: Option<bool>,
98 force_avx: Option<bool>,
99 }
100
101 /// An internal option for forcing the use of a particular packed algorithm.
102 ///
103 /// When an algorithm is forced, if a searcher could not be constructed for it,
104 /// then no searcher will be returned even if an alternative algorithm would
105 /// work.
106 #[derive(Clone, Debug)]
107 enum ForceAlgorithm {
108 Teddy,
109 RabinKarp,
110 }
111
112 impl Default for Config {
113 fn default() -> Config {
114 Config::new()
115 }
116 }
117
118 impl Config {
119 /// Create a new default configuration. A default configuration uses
120 /// leftmost-first match semantics.
121 pub fn new() -> Config {
122 Config {
123 kind: MatchKind::LeftmostFirst,
124 force: None,
125 force_teddy_fat: None,
126 force_avx: None,
127 }
128 }
129
130 /// Create a packed builder from this configuration. The builder can be
131 /// used to accumulate patterns and create a
132 /// [`Searcher`](struct.Searcher.html)
133 /// from them.
134 pub fn builder(&self) -> Builder {
135 Builder::from_config(self.clone())
136 }
137
138 /// Set the match semantics for this configuration.
139 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
140 self.kind = kind;
141 self
142 }
143
144 /// An undocumented method for forcing the use of the Teddy algorithm.
145 ///
146 /// This is only exposed for more precise testing and benchmarks. Callers
147 /// should not use it as it is not part of the API stability guarantees of
148 /// this crate.
149 #[doc(hidden)]
150 pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
151 if yes {
152 self.force = Some(ForceAlgorithm::Teddy);
153 } else {
154 self.force = None;
155 }
156 self
157 }
158
159 /// An undocumented method for forcing the use of the Fat Teddy algorithm.
160 ///
161 /// This is only exposed for more precise testing and benchmarks. Callers
162 /// should not use it as it is not part of the API stability guarantees of
163 /// this crate.
164 #[doc(hidden)]
165 pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
166 self.force_teddy_fat = yes;
167 self
168 }
169
170 /// An undocumented method for forcing the use of SSE (`Some(false)`) or
171 /// AVX (`Some(true)`) algorithms.
172 ///
173 /// This is only exposed for more precise testing and benchmarks. Callers
174 /// should not use it as it is not part of the API stability guarantees of
175 /// this crate.
176 #[doc(hidden)]
177 pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
178 self.force_avx = yes;
179 self
180 }
181
182 /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
183 ///
184 /// This is only exposed for more precise testing and benchmarks. Callers
185 /// should not use it as it is not part of the API stability guarantees of
186 /// this crate.
187 #[doc(hidden)]
188 pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
189 if yes {
190 self.force = Some(ForceAlgorithm::RabinKarp);
191 } else {
192 self.force = None;
193 }
194 self
195 }
196 }
197
198 /// A builder for constructing a packed searcher from a collection of patterns.
199 ///
200 /// # Example
201 ///
202 /// This example shows how to use a builder to construct a searcher. By
203 /// default, leftmost-first match semantics are used.
204 ///
205 /// ```
206 /// use aho_corasick::packed::{Builder, MatchKind};
207 ///
208 /// # fn example() -> Option<()> {
209 /// let searcher = Builder::new()
210 /// .add("foobar")
211 /// .add("foo")
212 /// .build()?;
213 /// let matches: Vec<usize> = searcher
214 /// .find_iter("foobar")
215 /// .map(|mat| mat.pattern())
216 /// .collect();
217 /// assert_eq!(vec![0], matches);
218 /// # Some(()) }
219 /// # if cfg!(target_arch = "x86_64") {
220 /// # example().unwrap()
221 /// # } else {
222 /// # assert!(example().is_none());
223 /// # }
224 /// ```
225 #[derive(Clone, Debug)]
226 pub struct Builder {
227 /// The configuration of this builder and subsequent matcher.
228 config: Config,
229 /// Set to true if the builder detects that a matcher cannot be built.
230 inert: bool,
231 /// The patterns provided by the caller.
232 patterns: Patterns,
233 }
234
235 impl Builder {
236 /// Create a new builder for constructing a multi-pattern searcher. This
237 /// constructor uses the default configuration.
238 pub fn new() -> Builder {
239 Builder::from_config(Config::new())
240 }
241
242 fn from_config(config: Config) -> Builder {
243 Builder { config, inert: false, patterns: Patterns::new() }
244 }
245
246 /// Build a searcher from the patterns added to this builder so far.
247 pub fn build(&self) -> Option<Searcher> {
248 if self.inert || self.patterns.is_empty() {
249 return None;
250 }
251 let mut patterns = self.patterns.clone();
252 patterns.set_match_kind(self.config.kind);
253 let rabinkarp = RabinKarp::new(&patterns);
254 // Effectively, we only want to return a searcher if we can use Teddy,
255 // since Teddy is our only fast packed searcher at the moment.
256 // Rabin-Karp is only used when searching haystacks smaller than what
257 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
258 // is to force it using undocumented APIs (for tests/benchmarks).
259 let (search_kind, minimum_len) = match self.config.force {
260 None | Some(ForceAlgorithm::Teddy) => {
261 let teddy = match self.build_teddy(&patterns) {
262 None => return None,
263 Some(teddy) => teddy,
264 };
265 let minimum_len = teddy.minimum_len();
266 (SearchKind::Teddy(teddy), minimum_len)
267 }
268 Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
269 };
270 Some(Searcher {
271 config: self.config.clone(),
272 patterns,
273 rabinkarp,
274 search_kind,
275 minimum_len,
276 })
277 }
278
279 fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
280 teddy::Builder::new()
281 .avx(self.config.force_avx)
282 .fat(self.config.force_teddy_fat)
283 .build(&patterns)
284 }
285
286 /// Add the given pattern to this set to match.
287 ///
288 /// The order in which patterns are added is significant. Namely, when
289 /// using leftmost-first match semantics, then when multiple patterns can
290 /// match at a particular location, the pattern that was added first is
291 /// used as the match.
292 ///
293 /// If the number of patterns added exceeds the amount supported by packed
294 /// searchers, then the builder will stop accumulating patterns and render
295 /// itself inert. At this point, constructing a searcher will always return
296 /// `None`.
297 pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
298 if self.inert {
299 return self;
300 } else if self.patterns.len() >= PATTERN_LIMIT {
301 self.inert = true;
302 self.patterns.reset();
303 return self;
304 }
305 // Just in case PATTERN_LIMIT increases beyond u16::MAX.
306 assert!(self.patterns.len() <= u16::MAX as usize);
307
308 let pattern = pattern.as_ref();
309 if pattern.is_empty() {
310 self.inert = true;
311 self.patterns.reset();
312 return self;
313 }
314 self.patterns.add(pattern);
315 self
316 }
317
318 /// Add the given iterator of patterns to this set to match.
319 ///
320 /// The iterator must yield elements that can be converted into a `&[u8]`.
321 ///
322 /// The order in which patterns are added is significant. Namely, when
323 /// using leftmost-first match semantics, then when multiple patterns can
324 /// match at a particular location, the pattern that was added first is
325 /// used as the match.
326 ///
327 /// If the number of patterns added exceeds the amount supported by packed
328 /// searchers, then the builder will stop accumulating patterns and render
329 /// itself inert. At this point, constructing a searcher will always return
330 /// `None`.
331 pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
332 where
333 I: IntoIterator<Item = P>,
334 P: AsRef<[u8]>,
335 {
336 for p in patterns {
337 self.add(p);
338 }
339 self
340 }
341 }
342
343 impl Default for Builder {
344 fn default() -> Builder {
345 Builder::new()
346 }
347 }
348
349 /// A packed searcher for quickly finding occurrences of multiple patterns.
350 ///
351 /// If callers need more flexible construction, or if one wants to change the
352 /// match semantics (either leftmost-first or leftmost-longest), then one can
353 /// use the [`Config`](struct.Config.html) and/or
354 /// [`Builder`](struct.Builder.html) types for more fine grained control.
355 ///
356 /// # Example
357 ///
358 /// This example shows how to create a searcher from an iterator of patterns.
359 /// By default, leftmost-first match semantics are used.
360 ///
361 /// ```
362 /// use aho_corasick::packed::{MatchKind, Searcher};
363 ///
364 /// # fn example() -> Option<()> {
365 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
366 /// let matches: Vec<usize> = searcher
367 /// .find_iter("foobar")
368 /// .map(|mat| mat.pattern())
369 /// .collect();
370 /// assert_eq!(vec![0], matches);
371 /// # Some(()) }
372 /// # if cfg!(target_arch = "x86_64") {
373 /// # example().unwrap()
374 /// # } else {
375 /// # assert!(example().is_none());
376 /// # }
377 /// ```
378 #[derive(Clone, Debug)]
379 pub struct Searcher {
380 config: Config,
381 patterns: Patterns,
382 rabinkarp: RabinKarp,
383 search_kind: SearchKind,
384 minimum_len: usize,
385 }
386
387 #[derive(Clone, Debug)]
388 enum SearchKind {
389 Teddy(Teddy),
390 RabinKarp,
391 }
392
393 impl Searcher {
394 /// A convenience function for constructing a searcher from an iterator
395 /// of things that can be converted to a `&[u8]`.
396 ///
397 /// If a searcher could not be constructed (either because of an
398 /// unsupported CPU or because there are too many patterns), then `None`
399 /// is returned.
400 ///
401 /// # Example
402 ///
403 /// Basic usage:
404 ///
405 /// ```
406 /// use aho_corasick::packed::{MatchKind, Searcher};
407 ///
408 /// # fn example() -> Option<()> {
409 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
410 /// let matches: Vec<usize> = searcher
411 /// .find_iter("foobar")
412 /// .map(|mat| mat.pattern())
413 /// .collect();
414 /// assert_eq!(vec![0], matches);
415 /// # Some(()) }
416 /// # if cfg!(target_arch = "x86_64") {
417 /// # example().unwrap()
418 /// # } else {
419 /// # assert!(example().is_none());
420 /// # }
421 /// ```
422 pub fn new<I, P>(patterns: I) -> Option<Searcher>
423 where
424 I: IntoIterator<Item = P>,
425 P: AsRef<[u8]>,
426 {
427 Builder::new().extend(patterns).build()
428 }
429
430 /// Return the first occurrence of any of the patterns in this searcher,
431 /// according to its match semantics, in the given haystack. The `Match`
432 /// returned will include the identifier of the pattern that matched, which
433 /// corresponds to the index of the pattern (starting from `0`) in which it
434 /// was added.
435 ///
436 /// # Example
437 ///
438 /// Basic usage:
439 ///
440 /// ```
441 /// use aho_corasick::packed::{MatchKind, Searcher};
442 ///
443 /// # fn example() -> Option<()> {
444 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
445 /// let mat = searcher.find("foobar")?;
446 /// assert_eq!(0, mat.pattern());
447 /// assert_eq!(0, mat.start());
448 /// assert_eq!(6, mat.end());
449 /// # Some(()) }
450 /// # if cfg!(target_arch = "x86_64") {
451 /// # example().unwrap()
452 /// # } else {
453 /// # assert!(example().is_none());
454 /// # }
455 /// ```
456 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
457 self.find_at(haystack, 0)
458 }
459
460 /// Return the first occurrence of any of the patterns in this searcher,
461 /// according to its match semantics, in the given haystack starting from
462 /// the given position.
463 ///
464 /// The `Match` returned will include the identifier of the pattern that
465 /// matched, which corresponds to the index of the pattern (starting from
466 /// `0`) in which it was added. The offsets in the `Match` will be relative
467 /// to the start of `haystack` (and not `at`).
468 ///
469 /// # Example
470 ///
471 /// Basic usage:
472 ///
473 /// ```
474 /// use aho_corasick::packed::{MatchKind, Searcher};
475 ///
476 /// # fn example() -> Option<()> {
477 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
478 /// let mat = searcher.find_at("foofoobar", 3)?;
479 /// assert_eq!(0, mat.pattern());
480 /// assert_eq!(3, mat.start());
481 /// assert_eq!(9, mat.end());
482 /// # Some(()) }
483 /// # if cfg!(target_arch = "x86_64") {
484 /// # example().unwrap()
485 /// # } else {
486 /// # assert!(example().is_none());
487 /// # }
488 /// ```
489 pub fn find_at<B: AsRef<[u8]>>(
490 &self,
491 haystack: B,
492 at: usize,
493 ) -> Option<Match> {
494 let haystack = haystack.as_ref();
495 match self.search_kind {
496 SearchKind::Teddy(ref teddy) => {
497 if haystack[at..].len() < teddy.minimum_len() {
498 return self.slow_at(haystack, at);
499 }
500 teddy.find_at(&self.patterns, haystack, at)
501 }
502 SearchKind::RabinKarp => {
503 self.rabinkarp.find_at(&self.patterns, haystack, at)
504 }
505 }
506 }
507
508 /// Return an iterator of non-overlapping occurrences of the patterns in
509 /// this searcher, according to its match semantics, in the given haystack.
510 ///
511 /// # Example
512 ///
513 /// Basic usage:
514 ///
515 /// ```
516 /// use aho_corasick::packed::{MatchKind, Searcher};
517 ///
518 /// # fn example() -> Option<()> {
519 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
520 /// let matches: Vec<usize> = searcher
521 /// .find_iter("foobar fooba foofoo")
522 /// .map(|mat| mat.pattern())
523 /// .collect();
524 /// assert_eq!(vec![0, 1, 1, 1], matches);
525 /// # Some(()) }
526 /// # if cfg!(target_arch = "x86_64") {
527 /// # example().unwrap()
528 /// # } else {
529 /// # assert!(example().is_none());
530 /// # }
531 /// ```
532 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
533 &'a self,
534 haystack: &'b B,
535 ) -> FindIter<'a, 'b> {
536 FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
537 }
538
539 /// Returns the match kind used by this packed searcher.
540 ///
541 /// # Examples
542 ///
543 /// Basic usage:
544 ///
545 /// ```
546 /// use aho_corasick::packed::{MatchKind, Searcher};
547 ///
548 /// # fn example() -> Option<()> {
549 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
550 /// // leftmost-first is the default.
551 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
552 /// # Some(()) }
553 /// # if cfg!(target_arch = "x86_64") {
554 /// # example().unwrap()
555 /// # } else {
556 /// # assert!(example().is_none());
557 /// # }
558 /// ```
559 pub fn match_kind(&self) -> &MatchKind {
560 self.patterns.match_kind()
561 }
562
563 /// Returns the minimum length of a haystack that is required in order for
564 /// packed searching to be effective.
565 ///
566 /// In some cases, the underlying packed searcher may not be able to search
567 /// very short haystacks. When that occurs, the implementation will defer
568 /// to a slower non-packed searcher (which is still generally faster than
569 /// Aho-Corasick for a small number of patterns). However, callers may
570 /// want to avoid ever using the slower variant, which one can do by
571 /// never passing a haystack shorter than the minimum length returned by
572 /// this method.
573 pub fn minimum_len(&self) -> usize {
574 self.minimum_len
575 }
576
577 /// Returns the approximate total amount of heap used by this searcher, in
578 /// units of bytes.
579 pub fn heap_bytes(&self) -> usize {
580 self.patterns.heap_bytes()
581 + self.rabinkarp.heap_bytes()
582 + self.search_kind.heap_bytes()
583 }
584
585 /// Use a slow (non-packed) searcher.
586 ///
587 /// This is useful when a packed searcher could be constructed, but could
588 /// not be used to search a specific haystack. For example, if Teddy was
589 /// built but the haystack is smaller than ~34 bytes, then Teddy might not
590 /// be able to run.
591 fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
592 self.rabinkarp.find_at(&self.patterns, haystack, at)
593 }
594 }
595
596 impl SearchKind {
597 fn heap_bytes(&self) -> usize {
598 match *self {
599 SearchKind::Teddy(ref ted) => ted.heap_bytes(),
600 SearchKind::RabinKarp => 0,
601 }
602 }
603 }
604
605 /// An iterator over non-overlapping matches from a packed searcher.
606 ///
607 /// The lifetime `'s` refers to the lifetime of the underlying
608 /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
609 /// lifetime of the haystack being searched.
610 #[derive(Debug)]
611 pub struct FindIter<'s, 'h> {
612 searcher: &'s Searcher,
613 haystack: &'h [u8],
614 at: usize,
615 }
616
617 impl<'s, 'h> Iterator for FindIter<'s, 'h> {
618 type Item = Match;
619
620 fn next(&mut self) -> Option<Match> {
621 if self.at > self.haystack.len() {
622 return None;
623 }
624 match self.searcher.find_at(&self.haystack, self.at) {
625 None => None,
626 Some(c) => {
627 self.at = c.end;
628 Some(c)
629 }
630 }
631 }
632 }