4 use aho_corasick
::{self, packed, AhoCorasick, AhoCorasickBuilder}
;
5 use memchr
::{memchr, memchr2, memchr3}
;
6 use syntax
::hir
::literal
::{Literal, Literals}
;
8 use freqs
::BYTE_FREQUENCIES
;
10 /// A prefix extracted from a compiled regular expression.
12 /// A regex prefix is a set of literal strings that *must* be matched at the
13 /// beginning of a regex in order for the entire regex to match. Similarly
14 /// for a regex suffix.
15 #[derive(Clone, Debug)]
16 pub struct LiteralSearcher
{
23 #[derive(Clone, Debug)]
25 /// No literals. (Never advances through the input.)
27 /// A set of four or more single byte literals.
29 /// A single substring, find using memchr and frequency analysis.
30 FreqyPacked(FreqyPacked
),
31 /// A single substring, find using Boyer-Moore.
32 BoyerMoore(BoyerMooreSearch
),
33 /// An Aho-Corasick automaton.
34 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }
,
35 /// A packed multiple substring searcher, using SIMD.
37 /// Note that Aho-Corasick will actually use this packed searcher
38 /// internally automatically, however, there is some overhead associated
39 /// with going through the Aho-Corasick machinery. So using the packed
40 /// searcher directly results in some gains.
41 Packed { s: packed::Searcher, lits: Vec<Literal> }
,
44 impl LiteralSearcher
{
45 /// Returns a matcher that never matches and never advances the input.
46 pub fn empty() -> Self {
47 Self::new(Literals
::empty(), Matcher
::Empty
)
50 /// Returns a matcher for literal prefixes from the given set.
51 pub fn prefixes(lits
: Literals
) -> Self {
52 let matcher
= Matcher
::prefixes(&lits
);
53 Self::new(lits
, matcher
)
56 /// Returns a matcher for literal suffixes from the given set.
57 pub fn suffixes(lits
: Literals
) -> Self {
58 let matcher
= Matcher
::suffixes(&lits
);
59 Self::new(lits
, matcher
)
62 fn new(lits
: Literals
, matcher
: Matcher
) -> Self {
63 let complete
= lits
.all_complete();
66 lcp
: FreqyPacked
::new(lits
.longest_common_prefix().to_vec()),
67 lcs
: FreqyPacked
::new(lits
.longest_common_suffix().to_vec()),
72 /// Returns true if all matches comprise the entire regular expression.
74 /// This does not necessarily mean that a literal match implies a match
75 /// of the regular expression. For example, the regular expression `^a`
76 /// is comprised of a single complete literal `a`, but the regular
77 /// expression demands that it only match at the beginning of a string.
78 pub fn complete(&self) -> bool
{
79 self.complete
&& !self.is_empty()
82 /// Find the position of a literal in `haystack` if it exists.
83 #[cfg_attr(feature = "perf-inline", inline(always))]
84 pub fn find(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
87 Empty
=> Some((0, 0)),
88 Bytes(ref sset
) => sset
.find(haystack
).map(|i
| (i
, i
+ 1)),
89 FreqyPacked(ref s
) => s
.find(haystack
).map(|i
| (i
, i
+ s
.len())),
90 BoyerMoore(ref s
) => s
.find(haystack
).map(|i
| (i
, i
+ s
.len())),
91 AC { ref ac, .. }
=> {
92 ac
.find(haystack
).map(|m
| (m
.start(), m
.end()))
94 Packed { ref s, .. }
=> {
95 s
.find(haystack
).map(|m
| (m
.start(), m
.end()))
100 /// Like find, except matches must start at index `0`.
101 pub fn find_start(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
102 for lit
in self.iter() {
103 if lit
.len() > haystack
.len() {
106 if lit
== &haystack
[0..lit
.len()] {
107 return Some((0, lit
.len()));
113 /// Like find, except matches must end at index `haystack.len()`.
114 pub fn find_end(&self, haystack
: &[u8]) -> Option
<(usize, usize)> {
115 for lit
in self.iter() {
116 if lit
.len() > haystack
.len() {
119 if lit
== &haystack
[haystack
.len() - lit
.len()..] {
120 return Some((haystack
.len() - lit
.len(), haystack
.len()));
126 /// Returns an iterator over all literals to be matched.
127 pub fn iter(&self) -> LiteralIter
{
129 Matcher
::Empty
=> LiteralIter
::Empty
,
130 Matcher
::Bytes(ref sset
) => LiteralIter
::Bytes(&sset
.dense
),
131 Matcher
::FreqyPacked(ref s
) => LiteralIter
::Single(&s
.pat
),
132 Matcher
::BoyerMoore(ref s
) => LiteralIter
::Single(&s
.pattern
),
133 Matcher
::AC { ref lits, .. }
=> LiteralIter
::AC(lits
),
134 Matcher
::Packed { ref lits, .. }
=> LiteralIter
::Packed(lits
),
138 /// Returns a matcher for the longest common prefix of this matcher.
139 pub fn lcp(&self) -> &FreqyPacked
{
143 /// Returns a matcher for the longest common suffix of this matcher.
144 pub fn lcs(&self) -> &FreqyPacked
{
148 /// Returns true iff this prefix is empty.
149 pub fn is_empty(&self) -> bool
{
153 /// Returns the number of prefixes in this machine.
154 pub fn len(&self) -> usize {
155 use self::Matcher
::*;
158 Bytes(ref sset
) => sset
.dense
.len(),
161 AC { ref ac, .. }
=> ac
.pattern_count(),
162 Packed { ref lits, .. }
=> lits
.len(),
166 /// Return the approximate heap usage of literals in bytes.
167 pub fn approximate_size(&self) -> usize {
168 use self::Matcher
::*;
171 Bytes(ref sset
) => sset
.approximate_size(),
172 FreqyPacked(ref single
) => single
.approximate_size(),
173 BoyerMoore(ref single
) => single
.approximate_size(),
174 AC { ref ac, .. }
=> ac
.heap_bytes(),
175 Packed { ref s, .. }
=> s
.heap_bytes(),
181 fn prefixes(lits
: &Literals
) -> Self {
182 let sset
= SingleByteSet
::prefixes(lits
);
183 Matcher
::new(lits
, sset
)
186 fn suffixes(lits
: &Literals
) -> Self {
187 let sset
= SingleByteSet
::suffixes(lits
);
188 Matcher
::new(lits
, sset
)
191 fn new(lits
: &Literals
, sset
: SingleByteSet
) -> Self {
192 if lits
.literals().is_empty() {
193 return Matcher
::Empty
;
195 if sset
.dense
.len() >= 26 {
196 // Avoid trying to match a large number of single bytes.
197 // This is *very* sensitive to a frequency analysis comparison
198 // between the bytes in sset and the composition of the haystack.
199 // No matter the size of sset, if its members all are rare in the
200 // haystack, then it'd be worth using it. How to tune this... IDK.
202 return Matcher
::Empty
;
205 return Matcher
::Bytes(sset
);
207 if lits
.literals().len() == 1 {
208 let lit
= lits
.literals()[0].to_vec();
209 if BoyerMooreSearch
::should_use(lit
.as_slice()) {
210 return Matcher
::BoyerMoore(BoyerMooreSearch
::new(lit
));
212 return Matcher
::FreqyPacked(FreqyPacked
::new(lit
));
216 let pats
= lits
.literals().to_owned();
217 let is_aho_corasick_fast
= sset
.dense
.len() <= 1 && sset
.all_ascii
;
218 if lits
.literals().len() <= 100 && !is_aho_corasick_fast
{
219 let mut builder
= packed
::Config
::new()
220 .match_kind(packed
::MatchKind
::LeftmostFirst
)
222 if let Some(s
) = builder
.extend(&pats
).build() {
223 return Matcher
::Packed { s, lits: pats }
;
226 let ac
= AhoCorasickBuilder
::new()
227 .match_kind(aho_corasick
::MatchKind
::LeftmostFirst
)
229 .build_with_size
::<u32, _
, _
>(&pats
)
231 Matcher
::AC { ac, lits: pats }
236 pub enum LiteralIter
<'a
> {
241 Packed(&'a
[Literal
]),
244 impl<'a
> Iterator
for LiteralIter
<'a
> {
245 type Item
= &'a
[u8];
247 fn next(&mut self) -> Option
<Self::Item
> {
249 LiteralIter
::Empty
=> None
,
250 LiteralIter
::Bytes(ref mut many
) => {
254 let next
= &many
[0..1];
259 LiteralIter
::Single(ref mut one
) => {
268 LiteralIter
::AC(ref mut lits
) => {
277 LiteralIter
::Packed(ref mut lits
) => {
290 #[derive(Clone, Debug)]
291 struct SingleByteSet
{
299 fn new() -> SingleByteSet
{
301 sparse
: vec
![false; 256],
308 fn prefixes(lits
: &Literals
) -> SingleByteSet
{
309 let mut sset
= SingleByteSet
::new();
310 for lit
in lits
.literals() {
311 sset
.complete
= sset
.complete
&& lit
.len() == 1;
312 if let Some(&b
) = lit
.get(0) {
313 if !sset
.sparse
[b
as usize] {
315 sset
.all_ascii
= false;
318 sset
.sparse
[b
as usize] = true;
325 fn suffixes(lits
: &Literals
) -> SingleByteSet
{
326 let mut sset
= SingleByteSet
::new();
327 for lit
in lits
.literals() {
328 sset
.complete
= sset
.complete
&& lit
.len() == 1;
329 if let Some(&b
) = lit
.get(lit
.len().checked_sub(1).unwrap()) {
330 if !sset
.sparse
[b
as usize] {
332 sset
.all_ascii
= false;
335 sset
.sparse
[b
as usize] = true;
342 /// Faster find that special cases certain sizes to use memchr.
343 #[cfg_attr(feature = "perf-inline", inline(always))]
344 fn find(&self, text
: &[u8]) -> Option
<usize> {
345 match self.dense
.len() {
347 1 => memchr(self.dense
[0], text
),
348 2 => memchr2(self.dense
[0], self.dense
[1], text
),
349 3 => memchr3(self.dense
[0], self.dense
[1], self.dense
[2], text
),
350 _
=> self._find(text
),
354 /// Generic find that works on any sized set.
355 fn _find(&self, haystack
: &[u8]) -> Option
<usize> {
356 for (i
, &b
) in haystack
.iter().enumerate() {
357 if self.sparse
[b
as usize] {
364 fn approximate_size(&self) -> usize {
365 (self.dense
.len() * mem
::size_of
::<u8>())
366 + (self.sparse
.len() * mem
::size_of
::<bool
>())
370 /// Provides an implementation of fast subtring search using frequency
373 /// memchr is so fast that we do everything we can to keep the loop in memchr
374 /// for as long as possible. The easiest way to do this is to intelligently
375 /// pick the byte to send to memchr. The best byte is the byte that occurs
376 /// least frequently in the haystack. Since doing frequency analysis on the
377 /// haystack is far too expensive, we compute a set of fixed frequencies up
378 /// front and hard code them in src/freqs.rs. Frequency analysis is done via
379 /// scripts/frequencies.py.
380 #[derive(Clone, Debug)]
381 pub struct FreqyPacked
{
384 /// The number of Unicode characters in the pattern. This is useful for
385 /// determining the effective length of a pattern when deciding which
386 /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
387 /// as one character.
389 /// The rarest byte in the pattern, according to pre-computed frequency
392 /// The offset of the rarest byte in `pat`.
394 /// The second rarest byte in the pattern, according to pre-computed
395 /// frequency analysis. (This may be equivalent to the rarest byte.)
397 /// The second rarest byte is used as a type of guard for quickly detecting
398 /// a mismatch after memchr locates an instance of the rarest byte. This
399 /// is a hedge against pathological cases where the pre-computed frequency
400 /// analysis may be off. (But of course, does not prevent *all*
401 /// pathological cases.)
403 /// The offset of the second rarest byte in `pat`.
408 fn new(pat
: Vec
<u8>) -> FreqyPacked
{
410 return FreqyPacked
::empty();
413 // Find the rarest two bytes. Try to make them distinct (but it's not
415 let mut rare1
= pat
[0];
416 let mut rare2
= pat
[0];
417 for b
in pat
[1..].iter().cloned() {
418 if freq_rank(b
) < freq_rank(rare1
) {
425 } else if b
!= rare1
&& freq_rank(b
) < freq_rank(rare2
) {
430 // And find the offsets of their last occurrences.
431 let rare1i
= pat
.iter().rposition(|&b
| b
== rare1
).unwrap();
432 let rare2i
= pat
.iter().rposition(|&b
| b
== rare2
).unwrap();
434 let char_len
= char_len_lossy(&pat
);
445 fn empty() -> FreqyPacked
{
456 #[cfg_attr(feature = "perf-inline", inline(always))]
457 pub fn find(&self, haystack
: &[u8]) -> Option
<usize> {
458 let pat
= &*self.pat
;
459 if haystack
.len() < pat
.len() || pat
.is_empty() {
462 let mut i
= self.rare1i
;
463 while i
< haystack
.len() {
464 i
+= match memchr(self.rare1
, &haystack
[i
..]) {
468 let start
= i
- self.rare1i
;
469 let end
= start
+ pat
.len();
470 if end
> haystack
.len() {
473 let aligned
= &haystack
[start
..end
];
474 if aligned
[self.rare2i
] == self.rare2
&& aligned
== &*self.pat
{
482 #[cfg_attr(feature = "perf-inline", inline(always))]
483 pub fn is_suffix(&self, text
: &[u8]) -> bool
{
484 if text
.len() < self.len() {
487 text
[text
.len() - self.len()..] == *self.pat
490 pub fn len(&self) -> usize {
494 pub fn char_len(&self) -> usize {
498 fn approximate_size(&self) -> usize {
499 self.pat
.len() * mem
::size_of
::<u8>()
503 fn char_len_lossy(bytes
: &[u8]) -> usize {
504 String
::from_utf8_lossy(bytes
).chars().count()
507 /// An implementation of Tuned Boyer-Moore as laid out by
508 /// Andrew Hume and Daniel Sunday in "Fast String Searching".
509 /// O(n) in the size of the input.
511 /// Fast string searching algorithms come in many variations,
512 /// but they can generally be described in terms of three main
515 /// The skip loop is where the string searcher wants to spend
516 /// as much time as possible. Exactly which character in the
517 /// pattern the skip loop examines varies from algorithm to
518 /// algorithm, but in the simplest case this loop repeated
519 /// looks at the last character in the pattern and jumps
520 /// forward in the input if it is not in the pattern.
521 /// Robert Boyer and J Moore called this the "fast" loop in
522 /// their original paper.
524 /// The match loop is responsible for actually examining the
525 /// whole potentially matching substring. In order to fail
526 /// faster, the match loop sometimes has a guard test attached.
527 /// The guard test uses frequency analysis of the different
528 /// characters in the pattern to choose the least frequency
529 /// occurring character and use it to find match failures
530 /// as quickly as possible.
532 /// The shift rule governs how the algorithm will shuffle its
533 /// test window in the event of a failure during the match loop.
534 /// Certain shift rules allow the worst-case run time of the
535 /// algorithm to be shown to be O(n) in the size of the input
536 /// rather than O(nm) in the size of the input and the size
537 /// of the pattern (as naive Boyer-Moore is).
539 /// "Fast String Searching", in addition to presenting a tuned
540 /// algorithm, provides a comprehensive taxonomy of the many
541 /// different flavors of string searchers. Under that taxonomy
542 /// TBM, the algorithm implemented here, uses an unrolled fast
543 /// skip loop with memchr fallback, a forward match loop with guard,
544 /// and the mini Sunday's delta shift rule. To unpack that you'll have to
546 #[derive(Clone, Debug)]
547 pub struct BoyerMooreSearch
{
548 /// The pattern we are going to look for in the haystack.
551 /// The skip table for the skip loop.
553 /// Maps the character at the end of the input
555 skip_table
: Vec
<usize>,
557 /// The guard character (least frequently occurring char).
559 /// The reverse-index of the guard character in the pattern.
560 guard_reverse_idx
: usize,
562 /// Daniel Sunday's mini generalized delta2 shift table.
564 /// We use a skip loop, so we only have to provide a shift
565 /// for the skip char (last char). This is why it is a mini
570 impl BoyerMooreSearch
{
571 /// Create a new string searcher, performing whatever
572 /// compilation steps are required.
573 fn new(pattern
: Vec
<u8>) -> Self {
574 debug_assert
!(!pattern
.is_empty());
576 let (g
, gi
) = Self::select_guard(pattern
.as_slice());
577 let skip_table
= Self::compile_skip_table(pattern
.as_slice());
578 let md2_shift
= Self::compile_md2_shift(pattern
.as_slice());
581 skip_table
: skip_table
,
583 guard_reverse_idx
: gi
,
584 md2_shift
: md2_shift
,
588 /// Find the pattern in `haystack`, returning the offset
589 /// of the start of the first occurrence of the pattern
592 fn find(&self, haystack
: &[u8]) -> Option
<usize> {
593 if haystack
.len() < self.pattern
.len() {
597 let mut window_end
= self.pattern
.len() - 1;
599 // Inspired by the grep source. It is a way
600 // to do correct loop unrolling without having to place
601 // a crashpad of terminating charicters at the end in
602 // the way described in the Fast String Searching paper.
603 const NUM_UNROLL
: usize = 10;
604 // 1 for the initial position, and 1 for the md2 shift
605 let short_circut
= (NUM_UNROLL
+ 2) * self.pattern
.len();
607 if haystack
.len() > short_circut
{
608 // just 1 for the md2 shift
610 haystack
.len() - ((NUM_UNROLL
+ 1) * self.pattern
.len());
613 match self.skip_loop(haystack
, window_end
, backstop
) {
617 if window_end
>= backstop
{
621 if self.check_match(haystack
, window_end
) {
622 return Some(window_end
- (self.pattern
.len() - 1));
624 let skip
= self.skip_table
[haystack
[window_end
] as usize];
626 if skip
== 0 { self.md2_shift }
else { skip }
;
632 // now process the input after the backstop
633 while window_end
< haystack
.len() {
634 let mut skip
= self.skip_table
[haystack
[window_end
] as usize];
636 if self.check_match(haystack
, window_end
) {
637 return Some(window_end
- (self.pattern
.len() - 1));
639 skip
= self.md2_shift
;
648 fn len(&self) -> usize {
649 return self.pattern
.len();
652 /// The key heuristic behind which the BoyerMooreSearch lives.
654 /// See `rust-lang/regex/issues/408`.
656 /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
657 /// platform-specific memchr routine with a bit of frequency
658 /// analysis sprinkled on top actually wins most of the time.
659 /// However, there are a few cases where Tuned Boyer-Moore still
662 /// If the haystack is random, frequency analysis doesn't help us,
663 /// so Boyer-Moore will win for sufficiently large needles.
664 /// Unfortunately, there is no obvious way to determine this
667 /// If the pattern itself consists of very common characters,
668 /// frequency analysis won't get us anywhere. The most extreme
669 /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
670 /// this case is wholly determined by the pattern, so we can actually
671 /// implement the heuristic.
673 /// A third case is if the pattern is sufficiently long. The idea
674 /// here is that once the pattern gets long enough the Tuned
675 /// Boyer-Moore skip loop will start making strides long enough
676 /// to beat the asm deep magic that is memchr.
677 fn should_use(pattern
: &[u8]) -> bool
{
678 // The minimum pattern length required to use TBM.
679 const MIN_LEN
: usize = 9;
680 // The minimum frequency rank (lower is rarer) that every byte in the
681 // pattern must have in order to use TBM. That is, if the pattern
682 // contains _any_ byte with a lower rank, then TBM won't be used.
683 const MIN_CUTOFF
: usize = 150;
684 // The maximum frequency rank for any byte.
685 const MAX_CUTOFF
: usize = 255;
686 // The scaling factor used to determine the actual cutoff frequency
687 // to use (keeping in mind that the minimum frequency rank is bounded
688 // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
689 // likely to be used as the pattern grows longer. That is, longer
690 // patterns permit somewhat less frequent bytes than shorter patterns,
691 // under the assumption that TBM gets better as the pattern gets
693 const LEN_CUTOFF_PROPORTION
: usize = 4;
695 let scaled_rank
= pattern
.len().wrapping_mul(LEN_CUTOFF_PROPORTION
);
696 let cutoff
= cmp
::max(
698 MAX_CUTOFF
- cmp
::min(MAX_CUTOFF
, scaled_rank
),
700 // The pattern must be long enough to be worthwhile. e.g., memchr will
701 // be faster on `e` because it is short even though e is quite common.
702 pattern
.len() > MIN_LEN
703 // all the bytes must be more common than the cutoff.
704 && pattern
.iter().all(|c
| freq_rank(*c
) >= cutoff
)
707 /// Check to see if there is a match at the given position
709 fn check_match(&self, haystack
: &[u8], window_end
: usize) -> bool
{
711 if haystack
[window_end
- self.guard_reverse_idx
] != self.guard
{
716 let window_start
= window_end
- (self.pattern
.len() - 1);
717 for i
in 0..self.pattern
.len() {
718 if self.pattern
[i
] != haystack
[window_start
+ i
] {
726 /// Skip forward according to the shift table.
728 /// Returns the offset of the next occurrence
729 /// of the last char in the pattern, or the none
730 /// if it never reappears. If `skip_loop` hits the backstop
731 /// it will leave early.
736 mut window_end
: usize,
739 let window_end_snapshot
= window_end
;
740 let skip_of
= |we
: usize| -> usize {
741 // Unsafe might make this faster, but the benchmarks
742 // were hard to interpret.
743 self.skip_table
[haystack
[we
] as usize]
747 let mut skip
= skip_of(window_end
);
749 skip
= skip_of(window_end
);
752 skip
= skip_of(window_end
);
754 skip
= skip_of(window_end
);
756 skip
= skip_of(window_end
);
759 skip
= skip_of(window_end
);
761 skip
= skip_of(window_end
);
763 skip
= skip_of(window_end
);
766 skip
= skip_of(window_end
);
768 skip
= skip_of(window_end
);
771 // If ten iterations did not make at least 16 words
772 // worth of progress, we just fall back on memchr.
773 if window_end
- window_end_snapshot
774 > 16 * mem
::size_of
::<usize>()
776 // Returning a window_end >= backstop will
777 // immediatly break us out of the inner loop in
779 if window_end
>= backstop
{
780 return Some(window_end
);
783 continue; // we made enough progress
785 // In case we are already there, and so that
786 // we will catch the guard char.
787 window_end
= window_end
788 .checked_sub(1 + self.guard_reverse_idx
)
791 match memchr(self.guard
, &haystack
[window_end
..]) {
797 + self.guard_reverse_idx
,
806 return Some(window_end
);
810 /// Compute the ufast skip table.
811 fn compile_skip_table(pattern
: &[u8]) -> Vec
<usize> {
812 let mut tab
= vec
![pattern
.len(); 256];
814 // For every char in the pattern, we write a skip
815 // that will line us up with the rightmost occurrence.
817 // N.B. the sentinel (0) is written by the last
819 for (i
, c
) in pattern
.iter().enumerate() {
820 tab
[*c
as usize] = (pattern
.len() - 1) - i
;
826 /// Select the guard character based off of the precomputed
828 fn select_guard(pattern
: &[u8]) -> (u8, usize) {
829 let mut rarest
= pattern
[0];
830 let mut rarest_rev_idx
= pattern
.len() - 1;
831 for (i
, c
) in pattern
.iter().enumerate() {
832 if freq_rank(*c
) < freq_rank(rarest
) {
834 rarest_rev_idx
= (pattern
.len() - 1) - i
;
838 (rarest
, rarest_rev_idx
)
841 /// If there is another occurrence of the skip
842 /// char, shift to it, otherwise just shift to
844 fn compile_md2_shift(pattern
: &[u8]) -> usize {
845 let shiftc
= *pattern
.last().unwrap();
847 // For a pattern of length 1 we will never apply the
848 // shift rule, so we use a poison value on the principle
849 // that failing fast is a good thing.
850 if pattern
.len() == 1 {
854 let mut i
= pattern
.len() - 2;
856 if pattern
[i
] == shiftc
{
857 return (pattern
.len() - 1) - i
;
862 // The skip char never re-occurs in the pattern, so
863 // we can just shift the whole window length.
867 fn approximate_size(&self) -> usize {
868 (self.pattern
.len() * mem
::size_of
::<u8>())
869 + (256 * mem
::size_of
::<usize>()) // skip table
873 fn freq_rank(b
: u8) -> usize {
874 BYTE_FREQUENCIES
[b
as usize] as usize
879 use super::{BoyerMooreSearch, FreqyPacked}
;
885 // The "hello, world" of string searching
888 let searcher
= BoyerMooreSearch
::new(Vec
::from(&b
"pattern"[..]));
889 let haystack
= b
"I keep seeing patterns in this text";
890 assert_eq
!(14, searcher
.find(haystack
).unwrap());
894 fn bm_find_no_subs() {
895 let searcher
= BoyerMooreSearch
::new(Vec
::from(&b
"pattern"[..]));
896 let haystack
= b
"I keep seeing needles in this text";
897 assert_eq
!(None
, searcher
.find(haystack
));
905 fn bm_skip_reset_bug() {
906 let haystack
= vec
![0, 0, 0, 0, 0, 1, 1, 0];
907 let needle
= vec
![0, 1, 1, 0];
909 let searcher
= BoyerMooreSearch
::new(needle
);
910 let offset
= searcher
.find(haystack
.as_slice()).unwrap();
911 assert_eq
!(4, offset
);
915 fn bm_backstop_underflow_bug() {
916 let haystack
= vec
![0, 0];
917 let needle
= vec
![0, 0];
919 let searcher
= BoyerMooreSearch
::new(needle
);
920 let offset
= searcher
.find(haystack
.as_slice()).unwrap();
921 assert_eq
!(0, offset
);
925 fn bm_naive_off_by_one_bug() {
926 let haystack
= vec
![91];
927 let needle
= vec
![91];
929 let naive_offset
= naive_find(&needle
, &haystack
).unwrap();
930 assert_eq
!(0, naive_offset
);
934 fn bm_memchr_fallback_indexing_bug() {
935 let mut haystack
= vec
![
936 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
937 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
938 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
939 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
940 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
942 let needle
= vec
![1, 1, 1, 1, 32, 32, 87];
943 let needle_start
= haystack
.len();
944 haystack
.extend(needle
.clone());
946 let searcher
= BoyerMooreSearch
::new(needle
);
947 assert_eq
!(needle_start
, searcher
.find(haystack
.as_slice()).unwrap());
951 fn bm_backstop_boundary() {
953 // aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
954 e_data.clone_created(entity_id, entity_to_add.entity_id);
955 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
956 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
959 let needle
= b
"clone_created".to_vec();
961 let searcher
= BoyerMooreSearch
::new(needle
);
962 let result
= searcher
.find(&haystack
);
963 assert_eq
!(Some(43), result
);
967 fn bm_win_gnu_indexing_bug() {
968 let haystack_raw
= vec
![
969 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
970 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
971 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
972 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
974 let needle
= vec
![1, 1, 1, 1, 1, 1, 1];
975 let haystack
= haystack_raw
.as_slice();
977 BoyerMooreSearch
::new(needle
.clone()).find(haystack
);
981 // QuickCheck Properties
984 use quickcheck
::TestResult
;
986 fn naive_find(needle
: &[u8], haystack
: &[u8]) -> Option
<usize> {
987 assert
!(needle
.len() <= haystack
.len());
989 for i
in 0..(haystack
.len() - (needle
.len() - 1)) {
990 if haystack
[i
] == needle
[0]
991 && &haystack
[i
..(i
+ needle
.len())] == needle
1001 fn qc_bm_equals_nieve_find(pile1
: Vec
<u8>, pile2
: Vec
<u8>) -> TestResult
{
1002 if pile1
.len() == 0 || pile2
.len() == 0 {
1003 return TestResult
::discard();
1006 let (needle
, haystack
) = if pile1
.len() < pile2
.len() {
1007 (pile1
, pile2
.as_slice())
1009 (pile2
, pile1
.as_slice())
1012 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1013 TestResult
::from_bool(
1014 searcher
.find(haystack
) == naive_find(&needle
, haystack
))
1017 fn qc_bm_equals_single(pile1
: Vec
<u8>, pile2
: Vec
<u8>) -> TestResult
{
1018 if pile1
.len() == 0 || pile2
.len() == 0 {
1019 return TestResult
::discard();
1022 let (needle
, haystack
) = if pile1
.len() < pile2
.len() {
1023 (pile1
, pile2
.as_slice())
1025 (pile2
, pile1
.as_slice())
1028 let bm_searcher
= BoyerMooreSearch
::new(needle
.clone());
1029 let freqy_memchr
= FreqyPacked
::new(needle
);
1030 TestResult
::from_bool(
1031 bm_searcher
.find(haystack
) == freqy_memchr
.find(haystack
))
1034 fn qc_bm_finds_trailing_needle(
1035 haystack_pre
: Vec
<u8>,
1038 if needle
.len() == 0 {
1039 return TestResult
::discard();
1042 let mut haystack
= haystack_pre
.clone();
1043 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1045 if haystack
.len() >= needle
.len() &&
1046 searcher
.find(haystack
.as_slice()).is_some() {
1047 return TestResult
::discard();
1050 haystack
.extend(needle
.clone());
1052 // What if the the tail of the haystack can start the
1054 let start
= haystack_pre
.len()
1055 .checked_sub(needle
.len())
1057 for i
in 0..(needle
.len() - 1) {
1058 if searcher
.find(&haystack
[(i
+ start
)..]).is_some() {
1059 return TestResult
::discard();
1063 TestResult
::from_bool(
1064 searcher
.find(haystack
.as_slice())
1065 .map(|x
| x
== haystack_pre
.len())
1069 // qc_equals_* is only testing the negative case as @burntsushi
1070 // pointed out in https://github.com/rust-lang/regex/issues/446.
1071 // This quickcheck prop represents an effort to force testing of
1072 // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
1073 // already check some of the positive cases, but they don't cover
1074 // cases where the needle is in the middle of haystack. This prop
1076 fn qc_bm_finds_subslice(
1078 needle_start
: usize,
1079 needle_length
: usize
1081 if haystack
.len() == 0 {
1082 return TestResult
::discard();
1085 let needle_start
= needle_start
% haystack
.len();
1086 let needle_length
= needle_length
% (haystack
.len() - needle_start
);
1088 if needle_length
== 0 {
1089 return TestResult
::discard();
1092 let needle
= &haystack
[needle_start
..(needle_start
+ needle_length
)];
1094 let bm_searcher
= BoyerMooreSearch
::new(needle
.to_vec());
1096 let start
= naive_find(&needle
, &haystack
);
1098 None
=> TestResult
::from_bool(false),
1100 TestResult
::from_bool(
1101 nf_start
<= needle_start
1102 && bm_searcher
.find(&haystack
) == start
1107 fn qc_bm_finds_first(needle
: Vec
<u8>) -> TestResult
{
1108 if needle
.len() == 0 {
1109 return TestResult
::discard();
1112 let mut haystack
= needle
.clone();
1113 let searcher
= BoyerMooreSearch
::new(needle
.clone());
1114 haystack
.extend(needle
);
1116 TestResult
::from_bool(
1117 searcher
.find(haystack
.as_slice())