1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
14 use aho_corasick
::{Automaton, AcAutomaton, FullAcAutomaton}
;
15 use memchr
::{memchr, memchr2, memchr3}
;
16 use syntax
::hir
::literal
::{Literal, Literals}
;
18 use freqs
::BYTE_FREQUENCIES
;
19 use self::teddy_avx2
::{Teddy as TeddyAVX2}
;
20 use self::teddy_ssse3
::{Teddy as TeddySSSE3}
;
25 /// A prefix extracted from a compiled regular expression.
27 /// A regex prefix is a set of literal strings that *must* be matched at the
28 /// beginning of a regex in order for the entire regex to match. Similarly
29 /// for a regex suffix.
30 #[derive(Clone, Debug)]
31 pub struct LiteralSearcher
{
38 #[derive(Clone, Debug)]
40 /// No literals. (Never advances through the input.)
42 /// A set of four or more single byte literals.
44 /// A single substring, find using memchr and frequency analysis.
45 FreqyPacked(FreqyPacked
),
46 /// A single substring, find using Boyer-Moore.
47 BoyerMoore(BoyerMooreSearch
),
48 /// An Aho-Corasick automaton.
49 AC(FullAcAutomaton
<Literal
>),
50 /// A simd accelerated multiple string matcher. Used only for a small
51 /// number of small literals.
52 TeddySSSE3(TeddySSSE3
),
53 /// A simd accelerated multiple string matcher. Used only for a small
54 /// number of small literals. This uses 256-bit vectors.
58 impl LiteralSearcher
{
59 /// Returns a matcher that never matches and never advances the input.
60 pub fn empty() -> Self {
61 Self::new(Literals
::empty(), Matcher
::Empty
)
64 /// Returns a matcher for literal prefixes from the given set.
65 pub fn prefixes(lits
: Literals
) -> Self {
66 let matcher
= Matcher
::prefixes(&lits
);
67 Self::new(lits
, matcher
)
70 /// Returns a matcher for literal suffixes from the given set.
71 pub fn suffixes(lits
: Literals
) -> Self {
72 let matcher
= Matcher
::suffixes(&lits
);
73 Self::new(lits
, matcher
)
76 fn new(lits
: Literals
, matcher
: Matcher
) -> Self {
77 let complete
= lits
.all_complete();
80 lcp
: FreqyPacked
::new(lits
.longest_common_prefix().to_vec()),
81 lcs
: FreqyPacked
::new(lits
.longest_common_suffix().to_vec()),
86 /// Returns true if all matches comprise the entire regular expression.
88 /// This does not necessarily mean that a literal match implies a match
89 /// of the regular expression. For example, the regular expresison `^a`
90 /// is comprised of a single complete literal `a`, but the regular
91 /// expression demands that it only match at the beginning of a string.
92 pub fn complete(&self) -> bool
{
93 self.complete
&& !self.is_empty()
96 /// Find the position of a literal in `haystack` if it exists.
97 #[inline(always)] // reduces constant overhead
98 pub fn find(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
101 Empty
=> Some((0, 0)),
102 Bytes(ref sset
) => sset
.find(haystack
).map(|i
| (i
, i
+ 1)),
103 FreqyPacked(ref s
) => s
.find(haystack
).map(|i
| (i
, i
+ s
.len())),
104 BoyerMoore(ref s
) => s
.find(haystack
).map(|i
| (i
, i
+ s
.len())),
105 AC(ref aut
) => aut
.find(haystack
).next().map(|m
| (m
.start
, m
.end
)),
106 TeddySSSE3(ref t
) => t
.find(haystack
).map(|m
| (m
.start
, m
.end
)),
107 TeddyAVX2(ref t
) => t
.find(haystack
).map(|m
| (m
.start
, m
.end
)),
111 /// Like find, except matches must start at index `0`.
112 pub fn find_start(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
113 for lit
in self.iter() {
114 if lit
.len() > haystack
.len() {
117 if lit
== &haystack
[0..lit
.len()] {
118 return Some((0, lit
.len()));
124 /// Like find, except matches must end at index `haystack.len()`.
125 pub fn find_end(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
126 for lit
in self.iter() {
127 if lit
.len() > haystack
.len() {
130 if lit
== &haystack
[haystack
.len() - lit
.len()..] {
131 return Some((haystack
.len() - lit
.len(), haystack
.len()));
137 /// Returns an iterator over all literals to be matched.
138 pub fn iter(&self) -> LiteralIter
{
140 Matcher
::Empty
=> LiteralIter
::Empty
,
141 Matcher
::Bytes(ref sset
) => LiteralIter
::Bytes(&sset
.dense
),
142 Matcher
::FreqyPacked(ref s
) => LiteralIter
::Single(&s
.pat
),
143 Matcher
::BoyerMoore(ref s
) => LiteralIter
::Single(&s
.pattern
),
144 Matcher
::AC(ref ac
) => LiteralIter
::AC(ac
.patterns()),
145 Matcher
::TeddySSSE3(ref ted
) => {
146 LiteralIter
::TeddySSSE3(ted
.patterns())
148 Matcher
::TeddyAVX2(ref ted
) => {
149 LiteralIter
::TeddyAVX2(ted
.patterns())
154 /// Returns a matcher for the longest common prefix of this matcher.
155 pub fn lcp(&self) -> &FreqyPacked
{
159 /// Returns a matcher for the longest common suffix of this matcher.
160 pub fn lcs(&self) -> &FreqyPacked
{
164 /// Returns true iff this prefix is empty.
165 pub fn is_empty(&self) -> bool
{
169 /// Returns the number of prefixes in this machine.
170 pub fn len(&self) -> usize {
171 use self::Matcher
::*;
174 Bytes(ref sset
) => sset
.dense
.len(),
177 AC(ref aut
) => aut
.len(),
178 TeddySSSE3(ref ted
) => ted
.len(),
179 TeddyAVX2(ref ted
) => ted
.len(),
183 /// Return the approximate heap usage of literals in bytes.
184 pub fn approximate_size(&self) -> usize {
185 use self::Matcher
::*;
188 Bytes(ref sset
) => sset
.approximate_size(),
189 FreqyPacked(ref single
) => single
.approximate_size(),
190 BoyerMoore(ref single
) => single
.approximate_size(),
191 AC(ref aut
) => aut
.heap_bytes(),
192 TeddySSSE3(ref ted
) => ted
.approximate_size(),
193 TeddyAVX2(ref ted
) => ted
.approximate_size(),
199 fn prefixes(lits
: &Literals
) -> Self {
200 let sset
= SingleByteSet
::prefixes(lits
);
201 Matcher
::new(lits
, sset
)
204 fn suffixes(lits
: &Literals
) -> Self {
205 let sset
= SingleByteSet
::suffixes(lits
);
206 Matcher
::new(lits
, sset
)
209 fn new(lits
: &Literals
, sset
: SingleByteSet
) -> Self {
210 if lits
.literals().is_empty() {
211 return Matcher
::Empty
;
213 if sset
.dense
.len() >= 26 {
214 // Avoid trying to match a large number of single bytes.
215 // This is *very* sensitive to a frequency analysis comparison
216 // between the bytes in sset and the composition of the haystack.
217 // No matter the size of sset, if its members all are rare in the
218 // haystack, then it'd be worth using it. How to tune this... IDK.
220 return Matcher
::Empty
;
223 return Matcher
::Bytes(sset
);
225 if lits
.literals().len() == 1 {
226 let lit
= lits
.literals()[0].to_vec();
227 if BoyerMooreSearch
::should_use(lit
.as_slice()) {
228 return Matcher
::BoyerMoore(BoyerMooreSearch
::new(lit
));
230 return Matcher
::FreqyPacked(FreqyPacked
::new(lit
));
233 let is_aho_corasick_fast
= sset
.dense
.len() == 1 && sset
.all_ascii
;
234 if TeddyAVX2
::available() && !is_aho_corasick_fast
{
235 const MAX_TEDDY_LITERALS
: usize = 32;
236 if lits
.literals().len() <= MAX_TEDDY_LITERALS
{
237 if let Some(ted
) = TeddyAVX2
::new(lits
) {
238 return Matcher
::TeddyAVX2(ted
);
242 if TeddySSSE3
::available() && !is_aho_corasick_fast
{
243 // Only try Teddy if Aho-Corasick can't use memchr on an ASCII
244 // byte. Also, in its current form, Teddy doesn't scale well to
247 // We impose the ASCII restriction since an alternation of
248 // non-ASCII string literals in the same language is likely to all
249 // start with the same byte. Even worse, the corpus being searched
250 // probably has a similar composition, which ends up completely
251 // negating the benefit of memchr.
252 const MAX_TEDDY_LITERALS
: usize = 32;
253 if lits
.literals().len() <= MAX_TEDDY_LITERALS
{
254 if let Some(ted
) = TeddySSSE3
::new(lits
) {
255 return Matcher
::TeddySSSE3(ted
);
258 // Fallthrough to ol' reliable Aho-Corasick...
260 let pats
= lits
.literals().to_owned();
261 Matcher
::AC(AcAutomaton
::new(pats
).into_full())
265 pub enum LiteralIter
<'a
> {
270 TeddySSSE3(&'a
[Vec
<u8>]),
271 TeddyAVX2(&'a
[Vec
<u8>]),
// Iterator over literal byte strings. Each arm appears to walk its borrowed
// slice forward, yielding one literal per call to `next`.
// NOTE(review): the SOURCE here is garbled — the bodies of the Single/AC/
// Teddy arms are elided, so only the visible skeleton is documented; the
// exact advance/yield logic of each arm must be confirmed against the
// original file.
274 impl<'a
> Iterator
for LiteralIter
<'a
> {
275 type Item
= &'a
[u8];
277 fn next(&mut self) -> Option
<Self::Item
> {
279 LiteralIter
::Empty
=> None
,
280 LiteralIter
::Bytes(ref mut many
) => {
// Yields one byte at a time as a 1-byte slice.
284 let next
= &many
[0..1];
289 LiteralIter
::Single(ref mut one
) => {
298 LiteralIter
::AC(ref mut lits
) => {
307 LiteralIter
::TeddySSSE3(ref mut lits
) => {
316 LiteralIter
::TeddyAVX2(ref mut lits
) => {
#[derive(Clone, Debug)]
struct SingleByteSet {
    /// 256-entry membership table indexed by byte value.
    sparse: Vec<bool>,
    /// The member bytes, in insertion order.
    dense: Vec<u8>,
    /// True when every literal in the source set is exactly one byte.
    complete: bool,
    /// True when every member byte is ASCII.
    all_ascii: bool,
}
338 fn new() -> SingleByteSet
{
340 sparse
: vec
![false; 256],
347 fn prefixes(lits
: &Literals
) -> SingleByteSet
{
348 let mut sset
= SingleByteSet
::new();
349 for lit
in lits
.literals() {
350 sset
.complete
= sset
.complete
&& lit
.len() == 1;
351 if let Some(&b
) = lit
.get(0) {
352 if !sset
.sparse
[b
as usize] {
354 sset
.all_ascii
= false;
357 sset
.sparse
[b
as usize] = true;
364 fn suffixes(lits
: &Literals
) -> SingleByteSet
{
365 let mut sset
= SingleByteSet
::new();
366 for lit
in lits
.literals() {
367 sset
.complete
= sset
.complete
&& lit
.len() == 1;
368 if let Some(&b
) = lit
.get(lit
.len().checked_sub(1).unwrap()) {
369 if !sset
.sparse
[b
as usize] {
371 sset
.all_ascii
= false;
374 sset
.sparse
[b
as usize] = true;
381 /// Faster find that special cases certain sizes to use memchr.
382 #[inline(always)] // reduces constant overhead
383 fn find(&self, text
: &[u8]) -> Option
<usize> {
384 match self.dense
.len() {
386 1 => memchr(self.dense
[0], text
),
387 2 => memchr2(self.dense
[0], self.dense
[1], text
),
388 3 => memchr3(self.dense
[0], self.dense
[1], self.dense
[2], text
),
389 _
=> self._find(text
),
393 /// Generic find that works on any sized set.
394 fn _find(&self, haystack
: &[u8]) -> Option
<usize> {
395 for (i
, &b
) in haystack
.iter().enumerate() {
396 if self.sparse
[b
as usize] {
403 fn approximate_size(&self) -> usize {
404 (self.dense
.len() * mem
::size_of
::<u8>())
405 + (self.sparse
.len() * mem
::size_of
::<bool
>())
/// Provides an implementation of fast substring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}
447 fn new(pat
: Vec
<u8>) -> FreqyPacked
{
449 return FreqyPacked
::empty();
452 // Find the rarest two bytes. Try to make them distinct (but it's not
454 let mut rare1
= pat
[0];
455 let mut rare2
= pat
[0];
456 for b
in pat
[1..].iter().cloned() {
457 if freq_rank(b
) < freq_rank(rare1
) {
464 } else if b
!= rare1
&& freq_rank(b
) < freq_rank(rare2
) {
469 // And find the offsets of their last occurrences.
470 let rare1i
= pat
.iter().rposition(|&b
| b
== rare1
).unwrap();
471 let rare2i
= pat
.iter().rposition(|&b
| b
== rare2
).unwrap();
473 let char_len
= char_len_lossy(&pat
);
484 fn empty() -> FreqyPacked
{
495 #[inline(always)] // reduces constant overhead
496 pub fn find(&self, haystack
: &[u8]) -> Option
<usize> {
497 let pat
= &*self.pat
;
498 if haystack
.len() < pat
.len() || pat
.is_empty() {
501 let mut i
= self.rare1i
;
502 while i
< haystack
.len() {
503 i
+= match memchr(self.rare1
, &haystack
[i
..]) {
507 let start
= i
- self.rare1i
;
508 let end
= start
+ pat
.len();
509 if end
> haystack
.len() {
512 let aligned
= &haystack
[start
..end
];
513 if aligned
[self.rare2i
] == self.rare2
&& aligned
== &*self.pat
{
521 #[inline(always)] // reduces constant overhead
522 pub fn is_suffix(&self, text
: &[u8]) -> bool
{
523 if text
.len() < self.len() {
526 text
[text
.len() - self.len()..] == *self.pat
529 pub fn len(&self) -> usize {
533 pub fn char_len(&self) -> usize {
537 fn approximate_size(&self) -> usize {
538 self.pat
.len() * mem
::size_of
::<u8>()
/// Counts the Unicode characters in `bytes`, decoding lossily (each invalid
/// UTF-8 sequence becomes one replacement character).
fn char_len_lossy(bytes: &[u8]) -> usize {
    String::from_utf8_lossy(bytes).chars().count()
}
546 /// An implementation of Tuned Boyer-Moore as laid out by
547 /// Andrew Hume and Daniel Sunday in "Fast String Searching".
548 /// O(n) in the size of the input.
550 /// Fast string searching algorithms come in many variations,
551 /// but they can generally be described in terms of three main
554 /// The skip loop is where the string searcher wants to spend
555 /// as much time as possible. Exactly which character in the
556 /// pattern the skip loop examines varies from algorithm to
557 /// algorithm, but in the simplest case this loop repeated
558 /// looks at the last character in the pattern and jumps
559 /// forward in the input if it is not in the pattern.
560 /// Robert Boyer and J Moore called this the "fast" loop in
561 /// their original paper.
563 /// The match loop is responsible for actually examining the
564 /// whole potentially matching substring. In order to fail
565 /// faster, the match loop sometimes has a guard test attached.
566 /// The guard test uses frequency analysis of the different
567 /// characters in the pattern to choose the least frequency
568 /// occurring character and use it to find match failures
569 /// as quickly as possible.
571 /// The shift rule governs how the algorithm will shuffle its
572 /// test window in the event of a failure during the match loop.
573 /// Certain shift rules allow the worst-case run time of the
574 /// algorithm to be shown to be O(n) in the size of the input
575 /// rather than O(nm) in the size of the input and the size
576 /// of the pattern (as naive Boyer-Moore is).
578 /// "Fast String Searching", in addition to presenting a tuned
579 /// algorithm, provides a comprehensive taxonomy of the many
580 /// different flavors of string searchers. Under that taxonomy
581 /// TBM, the algorithm implemented here, uses an unrolled fast
582 /// skip loop with memchr fallback, a forward match loop with guard,
583 /// and the mini Sunday's delta shift rule. To unpack that you'll have to
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift.
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
609 impl BoyerMooreSearch
{
610 /// Create a new string searcher, performing whatever
611 /// compilation steps are required.
612 fn new(pattern
: Vec
<u8>) -> Self {
613 debug_assert
!(pattern
.len() > 0);
615 let (g
, gi
) = Self::select_guard(pattern
.as_slice());
616 let skip_table
= Self::compile_skip_table(pattern
.as_slice());
617 let md2_shift
= Self::compile_md2_shift(pattern
.as_slice());
620 skip_table
: skip_table
,
622 guard_reverse_idx
: gi
,
623 md2_shift
: md2_shift
,
627 /// Find the pattern in `haystack`, returning the offset
628 /// of the start of the first occurrence of the pattern
// NOTE(review): SOURCE is garbled here (several lines elided, tokens split
// across lines); the code below is left byte-for-byte as found. Visible
// structure: a fast path that runs the unrolled skip loop up to `backstop`,
// then a slow tail loop for the remainder.
631 fn find(&self, haystack
: &[u8]) -> Option
<usize> {
// Haystack shorter than the pattern can never match.
632 if haystack
.len() < self.pattern
.len() {
// `window_end` indexes the haystack byte aligned with the last byte of the
// pattern for the current candidate window.
636 let mut window_end
= self.pattern
.len() - 1;
638 // Inspired by the grep source. It is a way
639 // to do correct loop unrolling without having to place
640 // a crashpad of terminating characters at the end in
641 // the way described in the Fast String Searching paper.
642 const NUM_UNROLL
: usize = 10;
643 // 1 for the initial position, and 1 for the md2 shift
644 let short_circut
= (NUM_UNROLL
+ 2) * self.pattern
.len();
// The fast path only pays off when the haystack is comfortably larger
// than one unrolled stride.
646 if haystack
.len() > short_circut
{
647 // just 1 for the md2 shift
648 let backstop
= haystack
.len() - ((NUM_UNROLL
+ 1) * self.pattern
.len());
650 window_end
= match self.skip_loop(haystack
, window_end
, backstop
) {
654 if window_end
>= backstop
{
// Guarded full-window comparison; on success, convert the end-aligned
// index back into a start offset.
658 if self.check_match(haystack
, window_end
) {
659 return Some(window_end
- (self.pattern
.len() - 1));
661 let skip
= self.skip_table
[haystack
[window_end
] as usize];
// A zero skip means the last byte matched; fall back to the md2 shift.
663 if skip
== 0 { self.md2_shift }
else { skip }
;
669 // now process the input after the backstop
670 while window_end
< haystack
.len() {
671 let mut skip
= self.skip_table
[haystack
[window_end
] as usize];
673 if self.check_match(haystack
, window_end
) {
674 return Some(window_end
- (self.pattern
.len() - 1));
676 skip
= self.md2_shift
;
685 fn len(&self) -> usize {
686 return self.pattern
.len()
689 /// The key heuristic behind which the BoyerMooreSearch lives.
691 /// See `rust-lang/regex/issues/408`.
693 /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
694 /// platform-specific memchr routine with a bit of frequency
695 /// analysis sprinkled on top actually wins most of the time.
696 /// However, there are a few cases where Tuned Boyer-Moore still
699 /// If the haystack is random, frequency analysis doesn't help us,
700 /// so Boyer-Moore will win for sufficiently large needles.
701 /// Unfortunately, there is no obvious way to determine this
704 /// If the pattern itself consists of very common characters,
705 /// frequency analysis won't get us anywhere. The most extreme
706 /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
707 /// this case is wholly determined by the pattern, so we can actually
708 /// implement the heuristic.
710 /// A third case is if the pattern is sufficiently long. The idea
711 /// here is that once the pattern gets long enough the Tuned
712 /// Boyer-Moore skip loop will start making strides long enough
713 /// to beat the asm deep magic that is memchr.
714 fn should_use(pattern
: &[u8]) -> bool
{
715 // The minimum pattern length required to use TBM.
716 const MIN_LEN
: usize = 9;
717 // The minimum frequency rank (lower is rarer) that every byte in the
718 // pattern must have in order to use TBM. That is, if the pattern
719 // contains _any_ byte with a lower rank, then TBM won't be used.
720 const MIN_CUTOFF
: usize = 150;
721 // The maximum frequency rank for any byte.
722 const MAX_CUTOFF
: usize = 255;
723 // The scaling factor used to determine the actual cutoff frequency
724 // to use (keeping in mind that the minimum frequency rank is bounded
725 // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
726 // likely to be used as the pattern grows longer. That is, longer
727 // patterns permit somewhat less frequent bytes than shorter patterns,
728 // under the assumption that TBM gets better as the pattern gets
730 const LEN_CUTOFF_PROPORTION
: usize = 4;
732 let scaled_rank
= pattern
.len().wrapping_mul(LEN_CUTOFF_PROPORTION
);
733 let cutoff
= cmp
::max(
735 MAX_CUTOFF
- cmp
::min(MAX_CUTOFF
, scaled_rank
),
737 // The pattern must be long enough to be worthwhile. e.g., memchr will
738 // be faster on `e` because it is short even though e is quite common.
739 pattern
.len() > MIN_LEN
740 // all the bytes must be more common than the cutoff.
741 && pattern
.iter().all(|c
| freq_rank(*c
) >= cutoff
)
744 /// Check to see if there is a match at the given position
746 fn check_match(&self, haystack
: &[u8], window_end
: usize) -> bool
{
748 if haystack
[window_end
- self.guard_reverse_idx
] != self.guard
{
753 let window_start
= window_end
- (self.pattern
.len() - 1);
754 for i
in 0..self.pattern
.len() {
755 if self.pattern
[i
] != haystack
[window_start
+ i
] {
763 /// Skip forward according to the shift table.
765 /// Returns the offset of the next occurrence
766 /// of the last char in the pattern, or the none
767 /// if it never reappears. If `skip_loop` hits the backstop
768 /// it will leave early.
// NOTE(review): SOURCE is garbled here — the function signature's opening
// and the `if skip != 0` nesting guards between the unrolled groups are
// elided. Code left byte-for-byte as found; confirm the unroll structure
// against the original file before editing.
772 mut window_end
: usize,
// Snapshot so we can measure how much progress the unrolled skips made.
777 let window_end_snapshot
= window_end
;
778 let skip_of
= |we
: usize| -> usize {
779 // Unsafe might make this faster, but the benchmarks
780 // were hard to interpret.
781 self.skip_table
[haystack
[we
] as usize]
// Ten unrolled skip steps (grouped 2/3/3/2 in the original, each group
// guarded by a skip != 0 test that is elided from this view).
785 let mut skip
= skip_of(window_end
); window_end
+= skip
;
786 skip
= skip_of(window_end
); window_end
+= skip
;
788 skip
= skip_of(window_end
); window_end
+= skip
;
789 skip
= skip_of(window_end
); window_end
+= skip
;
790 skip
= skip_of(window_end
); window_end
+= skip
;
792 skip
= skip_of(window_end
); window_end
+= skip
;
793 skip
= skip_of(window_end
); window_end
+= skip
;
794 skip
= skip_of(window_end
); window_end
+= skip
;
796 skip
= skip_of(window_end
); window_end
+= skip
;
797 skip
= skip_of(window_end
); window_end
+= skip
;
799 // If ten iterations did not make at least 16 words
800 // worth of progress, we just fall back on memchr.
801 if window_end
- window_end_snapshot
>
802 16 * mem
::size_of
::<usize>() {
804 // Returning a window_end >= backstop will immediately
805 // break us out of the inner loop in `find`.
806 if window_end
>= backstop
{
807 return Some(window_end
);
810 continue; // we made enough progress
812 // In case we are already there, and so that
813 // we will catch the guard char.
814 window_end
= window_end
815 .checked_sub(1 + self.guard_reverse_idx
)
// memchr fallback: hunt for the guard char directly.
818 match memchr(self.guard
, &haystack
[window_end
..]) {
821 return Some(window_end
+ g_idx
+ self.guard_reverse_idx
);
829 return Some(window_end
);
/// Compute the ufast skip table (the table driving the unrolled fast
/// skip loop).
fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
    // Default: a byte absent from the pattern skips the whole pattern.
    let mut tab = vec![pattern.len(); 256];

    // For every char in the pattern, we write a skip
    // that will line us up with the rightmost occurrence.
    //
    // N.B. the sentinel (0) is written by the last
    // loop iteration.
    for (i, c) in pattern.iter().enumerate() {
        tab[*c as usize] = (pattern.len() - 1) - i;
    }

    tab
}
849 /// Select the guard character based off of the precomputed
851 fn select_guard(pattern
: &[u8]) -> (u8, usize) {
852 let mut rarest
= pattern
[0];
853 let mut rarest_rev_idx
= pattern
.len() - 1;
854 for (i
, c
) in pattern
.iter().enumerate() {
855 if freq_rank(*c
) < freq_rank(rarest
) {
857 rarest_rev_idx
= (pattern
.len() - 1) - i
;
861 (rarest
, rarest_rev_idx
)
/// If there is another occurrence of the skip
/// char (the pattern's last byte) before the end, shift to line it up;
/// otherwise shift past the whole window.
fn compile_md2_shift(pattern: &[u8]) -> usize {
    let shiftc = *pattern.last().unwrap();

    // For a pattern of length 1 we will never apply the
    // shift rule, so we use a poison value on the principle
    // that failing fast is a good thing.
    if pattern.len() == 1 {
        return 0xDEADBEAF;
    }

    // Scan backwards from the second-to-last byte for a re-occurrence
    // of the skip char. (Index 0 need not be checked: its shift equals
    // the fallthrough value below.)
    let mut i = pattern.len() - 2;
    while i > 0 {
        if pattern[i] == shiftc {
            return (pattern.len() - 1) - i;
        }
        i -= 1;
    }

    // The skip char never re-occurs in the pattern, so
    // we can just shift the whole window length.
    pattern.len() - 1
}
890 fn approximate_size(&self) -> usize {
891 (self.pattern
.len() * mem
::size_of
::<u8>())
892 + (256 * mem
::size_of
::<usize>()) // skip table
896 fn freq_rank(b
: u8) -> usize {
897 BYTE_FREQUENCIES
[b
as usize] as usize
902 use super::{BoyerMooreSearch, FreqyPacked}
;
908 // The "hello, world" of string searching
911 let searcher
= BoyerMooreSearch
::new(Vec
::from(&b
"pattern"[..]));
912 let haystack
= b
"I keep seeing patterns in this text";
913 assert_eq
!(14, searcher
.find(haystack
).unwrap());
917 fn bm_find_no_subs() {
918 let searcher
= BoyerMooreSearch
::new(Vec
::from(&b
"pattern"[..]));
919 let haystack
= b
"I keep seeing needles in this text";
920 assert_eq
!(None
, searcher
.find(haystack
));
928 fn bm_skip_reset_bug() {
929 let haystack
= vec
![0, 0, 0, 0, 0, 1, 1, 0];
930 let needle
= vec
![0, 1, 1, 0];
932 let searcher
= BoyerMooreSearch
::new(needle
);
933 let offset
= searcher
.find(haystack
.as_slice()).unwrap();
934 assert_eq
!(4, offset
);
938 fn bm_backstop_underflow_bug() {
939 let haystack
= vec
![0, 0];
940 let needle
= vec
![0, 0];
942 let searcher
= BoyerMooreSearch
::new(needle
);
943 let offset
= searcher
.find(haystack
.as_slice()).unwrap();
944 assert_eq
!(0, offset
);
948 fn bm_naive_off_by_one_bug() {
949 let haystack
= vec
![91];
950 let needle
= vec
![91];
952 let naive_offset
= naive_find(&needle
, &haystack
).unwrap();
953 assert_eq
!(0, naive_offset
);
957 fn bm_memchr_fallback_indexing_bug() {
958 let mut haystack
= vec
![
959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
960 0, 0, 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
962 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
963 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
964 let needle
= vec
![1, 1, 1, 1, 32, 32, 87];
965 let needle_start
= haystack
.len();
966 haystack
.extend(needle
.clone());
968 let searcher
= BoyerMooreSearch
::new(needle
);
969 assert_eq
!(needle_start
, searcher
.find(haystack
.as_slice()).unwrap());
// NOTE(review): the lines between the fn header and `let needle` below are
// the interior of a multi-line (byte-)string haystack whose opening line
// was lost from this view — do not treat them as code; reconstruct from the
// original file before editing.
973 fn bm_backstop_boundary() {
975 // aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
976 e_data.clone_created(entity_id, entity_to_add.entity_id);
977 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
978 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
980 let needle
= b
"clone_created".to_vec();
982 let searcher
= BoyerMooreSearch
::new(needle
);
983 let result
= searcher
.find(&haystack
);
// Expects the needle exactly at offset 43 of the haystack above.
984 assert_eq
!(Some(43), result
);
988 fn bm_win_gnu_indexing_bug() {
989 let haystack_raw
= vec
![
990 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
991 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
992 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
993 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
994 let needle
= vec
![1, 1, 1, 1, 1, 1, 1];
995 let haystack
= haystack_raw
.as_slice();
997 BoyerMooreSearch
::new(needle
.clone()).find(haystack
);
1001 // QuickCheck Properties
1004 use quickcheck
::TestResult
;
/// Reference oracle: a naive O(n*m) substring search used to cross-check
/// the Boyer-Moore implementation. Panics if `needle` is longer than
/// `haystack`.
fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
    assert!(needle.len() <= haystack.len());

    for i in 0..(haystack.len() - (needle.len() - 1)) {
        if haystack[i] == needle[0]
            && &haystack[i..(i + needle.len())] == needle {
            return Some(i);
        }
    }

    None
}
1020 fn qc_bm_equals_nieve_find(pile1
: Vec
<u8>, pile2
: Vec
<u8>) -> TestResult
{
1021 if pile1
.len() == 0 || pile2
.len() == 0 {
1022 return TestResult
::discard();
1025 let (needle
, haystack
) = if pile1
.len() < pile2
.len() {
1026 (pile1
, pile2
.as_slice())
1028 (pile2
, pile1
.as_slice())
1031 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1032 TestResult
::from_bool(
1033 searcher
.find(haystack
) == naive_find(&needle
, haystack
))
1036 fn qc_bm_equals_single(pile1
: Vec
<u8>, pile2
: Vec
<u8>) -> TestResult
{
1037 if pile1
.len() == 0 || pile2
.len() == 0 {
1038 return TestResult
::discard();
1041 let (needle
, haystack
) = if pile1
.len() < pile2
.len() {
1042 (pile1
, pile2
.as_slice())
1044 (pile2
, pile1
.as_slice())
1047 let bm_searcher
= BoyerMooreSearch
::new(needle
.clone());
1048 let freqy_memchr
= FreqyPacked
::new(needle
);
1049 TestResult
::from_bool(
1050 bm_searcher
.find(haystack
) == freqy_memchr
.find(haystack
))
1053 fn qc_bm_finds_trailing_needle(
1054 haystack_pre
: Vec
<u8>,
1057 if needle
.len() == 0 {
1058 return TestResult
::discard();
1061 let mut haystack
= haystack_pre
.clone();
1062 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1064 if haystack
.len() >= needle
.len() &&
1065 searcher
.find(haystack
.as_slice()).is_some() {
1066 return TestResult
::discard();
1069 haystack
.extend(needle
.clone());
1071 // What if the the tail of the haystack can start the
1073 let start
= haystack_pre
.len()
1074 .checked_sub(needle
.len())
1076 for i
in 0..(needle
.len() - 1) {
1077 if searcher
.find(&haystack
[(i
+ start
)..]).is_some() {
1078 return TestResult
::discard();
1082 TestResult
::from_bool(
1083 searcher
.find(haystack
.as_slice())
1084 .map(|x
| x
== haystack_pre
.len())
1088 // qc_equals_* is only testing the negative case as @burntsushi
1089 // pointed out in https://github.com/rust-lang/regex/issues/446.
1090 // This quickcheck prop represents an effort to force testing of
1091 // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
1092 // already check some of the positive cases, but they don't cover
1093 // cases where the needle is in the middle of haystack. This prop
1095 fn qc_bm_finds_subslice(
1097 needle_start
: usize,
1098 needle_length
: usize
1100 if haystack
.len() == 0 {
1101 return TestResult
::discard();
1104 let needle_start
= needle_start
% haystack
.len();
1105 let needle_length
= needle_length
% (haystack
.len() - needle_start
);
1107 if needle_length
== 0 {
1108 return TestResult
::discard();
1111 let needle
= &haystack
[needle_start
..(needle_start
+ needle_length
)];
1113 let bm_searcher
= BoyerMooreSearch
::new(needle
.to_vec());
1115 let start
= naive_find(&needle
, &haystack
);
1117 None
=> TestResult
::from_bool(false),
1119 TestResult
::from_bool(
1120 nf_start
<= needle_start
1121 && bm_searcher
.find(&haystack
) == start
1126 fn qc_bm_finds_first(needle
: Vec
<u8>) -> TestResult
{
1127 if needle
.len() == 0 {
1128 return TestResult
::discard();
1131 let mut haystack
= needle
.clone();
1132 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1133 haystack
.extend(needle
);
1135 TestResult
::from_bool(
1136 searcher
.find(haystack
.as_slice())