4 use search
::byte_frequencies
::BYTE_FREQUENCIES
;
6 /// PrefilterState tracks state associated with the effectiveness of a
7 /// prefilter. It is used to track how many bytes, on average, are skipped by
8 /// the prefilter. If this average dips below a certain threshold over time,
9 /// then the state renders the prefilter inert and stops using it.
11 /// A prefilter state should be created for each search. (Where creating an
12 /// iterator via, e.g., `find_iter`, is treated as a single search.)
13 #[derive(Clone, Debug)]
14 pub struct PrefilterState
{
15 /// The number of skips that has been executed.
17 /// The total number of bytes that have been skipped.
19 /// The maximum length of a match. This is used to help determine how many
20 /// bytes on average should be skipped in order for a prefilter to be
23 /// Once this heuristic has been deemed ineffective, it will be inert
24 /// throughout the rest of its lifetime. This serves as a cheap way to
30 /// The minimum number of skip attempts to try before considering whether
31 /// a prefilter is effective or not.
32 const MIN_SKIPS
: usize = 50;
34 /// The minimum amount of bytes that skipping must average.
36 /// This value was chosen based on varying it and checking the bstr/find/
37 /// microbenchmarks. In particular, this can impact the
38 /// pathological/repeated-{huge,small} benchmarks quite a bit if it's
40 const MIN_SKIP_BYTES
: usize = 8;
42 /// Create a fresh prefilter state.
43 pub fn new(max_match_len
: usize) -> PrefilterState
{
44 if max_match_len
== 0 {
45 return PrefilterState
::inert();
47 PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
50 /// Create a fresh prefilter state that is always inert.
51 fn inert() -> PrefilterState
{
52 PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
55 /// Update this state with the number of bytes skipped on the last
56 /// invocation of the prefilter.
58 pub fn update(&mut self, skipped
: usize) {
60 self.skipped
+= skipped
;
63 /// Return true if and only if this state indicates that a prefilter is
66 pub fn is_effective(&mut self) -> bool
{
70 if self.skips
< PrefilterState
::MIN_SKIPS
{
73 if self.skipped
>= PrefilterState
::MIN_SKIP_BYTES
* self.skips
{
83 /// A heuristic frequency based prefilter for searching a single needle.
85 /// This prefilter attempts to pick out the byte in a needle that is predicted
86 /// to occur least frequently, and search for that using fast vectorized
87 /// routines. If a rare enough byte could not be found, then this prefilter's
88 /// constructors will return `None`.
90 /// This can be combined with `PrefilterState` to dynamically render this
91 /// prefilter inert if it proves to ineffective.
92 #[derive(Clone, Debug)]
94 /// Whether this prefilter should be used or not.
96 /// The length of the needle we're searching for.
98 /// The rarest byte in the needle, according to pre-computed frequency
101 /// The leftmost offset of the rarest byte in the needle.
103 /// The second rarest byte in the needle, according to pre-computed
104 /// frequency analysis. (This may be equivalent to the rarest byte.)
106 /// The second rarest byte is used as a type of guard for quickly detecting
107 /// a mismatch after memchr locates an instance of the rarest byte. This
108 /// is a hedge against pathological cases where the pre-computed frequency
109 /// analysis may be off. (But of course, does not prevent *all*
110 /// pathological cases.)
112 /// The leftmost offset of the second rarest byte in the needle.
117 /// The maximum frequency rank permitted. If the rarest byte in the needle
118 /// has a frequency rank above this value, then Freqy is not used.
119 const MAX_RANK
: usize = 200;
121 /// Return a fresh prefilter state that can be used with this prefilter. A
122 /// prefilter state is used to track the effectiveness of a prefilter for
123 /// speeding up searches. Therefore, the prefilter state should generally
124 /// be reused on subsequent searches (such as in an iterator). For searches
125 /// on a different haystack, then a new prefilter state should be used.
126 pub fn prefilter_state(&self) -> PrefilterState
{
128 PrefilterState
::inert()
130 PrefilterState
::new(self.needle_len
)
134 /// Returns a valid but inert prefilter. This is valid for both the forward
135 /// and reverse direction.
137 /// It is never correct to use an inert prefilter. The results of finding
138 /// the next (or previous) candidate are unspecified.
139 fn inert() -> Freqy
{
150 /// Return search info for the given needle in the forward direction.
151 pub fn forward(needle
: &BStr
) -> Freqy
{
152 if needle
.is_empty() {
153 return Freqy
::inert();
156 // Find the rarest two bytes. Try to make them distinct (but it's not
158 let (mut rare1
, mut rare1i
) = (needle
[0], 0);
159 let (mut rare2
, mut rare2i
) = (needle
[0], 0);
160 if needle
.len() >= 2 {
164 if Freqy
::rank(rare2
) < Freqy
::rank(rare1
) {
165 mem
::swap(&mut rare1
, &mut rare2
);
166 mem
::swap(&mut rare1i
, &mut rare2i
);
168 for (i
, b
) in needle
.bytes().enumerate().skip(2) {
169 if Freqy
::rank(b
) < Freqy
::rank(rare1
) {
174 } else if b
!= rare1
&& Freqy
::rank(b
) < Freqy
::rank(rare2
) {
179 if Freqy
::rank(rare1
) > Freqy
::MAX_RANK
{
180 return Freqy
::inert();
182 let needle_len
= needle
.len();
183 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
186 /// Return search info for the given needle in the reverse direction.
187 pub fn reverse(needle
: &BStr
) -> Freqy
{
188 if needle
.is_empty() {
189 return Freqy
::inert();
192 // Find the rarest two bytes. Try to make them distinct (but it's not
193 // required). In reverse, the offsets correspond to the number of bytes
194 // from the end of the needle. So `0` is the last byte in the needle.
195 let (mut rare1i
, mut rare2i
) = (0, 0);
196 if needle
.len() >= 2 {
199 let mut rare1
= needle
[needle
.len() - rare1i
- 1];
200 let mut rare2
= needle
[needle
.len() - rare2i
- 1];
201 if Freqy
::rank(rare2
) < Freqy
::rank(rare1
) {
202 mem
::swap(&mut rare1
, &mut rare2
);
203 mem
::swap(&mut rare1i
, &mut rare2i
);
205 for (i
, b
) in needle
.bytes().rev().enumerate().skip(2) {
206 if Freqy
::rank(b
) < Freqy
::rank(rare1
) {
211 } else if b
!= rare1
&& Freqy
::rank(b
) < Freqy
::rank(rare2
) {
216 if Freqy
::rank(rare1
) > Freqy
::MAX_RANK
{
217 return Freqy
::inert();
219 let needle_len
= needle
.len();
220 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
223 /// Look for a possible occurrence of needle. The position returned
224 /// corresponds to the beginning of the occurrence, if one exists.
226 /// Callers may assume that this never returns false negatives (i.e., it
227 /// never misses an actual occurrence), but must check that the returned
228 /// position corresponds to a match. That is, it can return false
231 /// This should only be used when Freqy is constructed for forward
233 pub fn find_candidate(
235 prestate
: &mut PrefilterState
,
238 debug_assert
!(!self.inert
);
241 while prestate
.is_effective() {
242 // Use a fast vectorized implementation to skip to the next
243 // occurrence of the rarest byte (heuristically chosen) in the
245 i
+= match haystack
[i
..].find_byte(self.rare1
) {
248 prestate
.update(found
);
253 // If we can't align our first match with the haystack, then a
254 // match is impossible.
260 // Align our rare2 byte with the haystack. A mismatch means that
261 // a match is impossible.
262 let aligned_rare2i
= i
- self.rare1i
+ self.rare2i
;
263 if haystack
.get(aligned_rare2i
) != Some(&self.rare2
) {
268 // We've done what we can. There might be a match here.
269 return Some(i
- self.rare1i
);
271 // The only way we get here is if we believe our skipping heuristic
272 // has become ineffective. We're allowed to return false positives,
273 // so return the position at which we advanced to, aligned to the
275 Some(i
.saturating_sub(self.rare1i
))
278 /// Look for a possible occurrence of needle, in reverse, starting from the
279 /// end of the given haystack. The position returned corresponds to the
280 /// position immediately after the end of the occurrence, if one exists.
282 /// Callers may assume that this never returns false negatives (i.e., it
283 /// never misses an actual occurrence), but must check that the returned
284 /// position corresponds to a match. That is, it can return false
287 /// This should only be used when Freqy is constructed for reverse
289 pub fn rfind_candidate(
291 prestate
: &mut PrefilterState
,
294 debug_assert
!(!self.inert
);
296 let mut i
= haystack
.len();
297 while prestate
.is_effective() {
298 // Use a fast vectorized implementation to skip to the next
299 // occurrence of the rarest byte (heuristically chosen) in the
301 i
= match haystack
[..i
].rfind_byte(self.rare1
) {
304 prestate
.update(i
- found
);
309 // If we can't align our first match with the haystack, then a
310 // match is impossible.
311 if i
+ self.rare1i
+ 1 > haystack
.len() {
315 // Align our rare2 byte with the haystack. A mismatch means that
316 // a match is impossible.
317 let aligned
= match (i
+ self.rare1i
).checked_sub(self.rare2i
) {
319 Some(aligned
) => aligned
,
321 if haystack
.get(aligned
) != Some(&self.rare2
) {
325 // We've done what we can. There might be a match here.
326 return Some(i
+ self.rare1i
+ 1);
328 // The only way we get here is if we believe our skipping heuristic
329 // has become ineffective. We're allowed to return false positives,
330 // so return the position at which we advanced to, aligned to the
332 Some(i
+ self.rare1i
+ 1)
335 /// Return the heuristical frequency rank of the given byte. A lower rank
336 /// means the byte is believed to occur less frequently.
337 fn rank(b
: u8) -> usize {
338 BYTE_FREQUENCIES
[b
as usize] as usize
349 // N.B. We sometimes use uppercase here since that mostly ensures freqy
350 // will be constructable. Lowercase letters may be too common for freqy
353 let s
= Freqy
::forward(B("BAR"));
354 let mut pre
= s
.prefilter_state();
355 assert_eq
!(Some(0), s
.find_candidate(&mut pre
, B("BARFOO")));
357 let s
= Freqy
::forward(B("BAR"));
358 let mut pre
= s
.prefilter_state();
359 assert_eq
!(Some(3), s
.find_candidate(&mut pre
, B("FOOBAR")));
361 let s
= Freqy
::forward(B("zyzy"));
362 let mut pre
= s
.prefilter_state();
363 assert_eq
!(Some(0), s
.find_candidate(&mut pre
, B("zyzz")));
365 let s
= Freqy
::forward(B("zyzy"));
366 let mut pre
= s
.prefilter_state();
367 assert_eq
!(Some(2), s
.find_candidate(&mut pre
, B("zzzy")));
369 let s
= Freqy
::forward(B("zyzy"));
370 let mut pre
= s
.prefilter_state();
371 assert_eq
!(None
, s
.find_candidate(&mut pre
, B("zazb")));
373 let s
= Freqy
::forward(B("yzyz"));
374 let mut pre
= s
.prefilter_state();
375 assert_eq
!(Some(0), s
.find_candidate(&mut pre
, B("yzyy")));
377 let s
= Freqy
::forward(B("yzyz"));
378 let mut pre
= s
.prefilter_state();
379 assert_eq
!(Some(2), s
.find_candidate(&mut pre
, B("yyyz")));
381 let s
= Freqy
::forward(B("yzyz"));
382 let mut pre
= s
.prefilter_state();
383 assert_eq
!(None
, s
.find_candidate(&mut pre
, B("yayb")));
388 // N.B. We sometimes use uppercase here since that mostly ensures freqy
389 // will be constructable. Lowercase letters may be too common for freqy
392 let s
= Freqy
::reverse(B("BAR"));
393 let mut pre
= s
.prefilter_state();
394 assert_eq
!(Some(3), s
.rfind_candidate(&mut pre
, B("BARFOO")));
396 let s
= Freqy
::reverse(B("BAR"));
397 let mut pre
= s
.prefilter_state();
398 assert_eq
!(Some(6), s
.rfind_candidate(&mut pre
, B("FOOBAR")));
400 let s
= Freqy
::reverse(B("zyzy"));
401 let mut pre
= s
.prefilter_state();
402 assert_eq
!(Some(2), s
.rfind_candidate(&mut pre
, B("zyzz")));
404 let s
= Freqy
::reverse(B("zyzy"));
405 let mut pre
= s
.prefilter_state();
406 assert_eq
!(Some(4), s
.rfind_candidate(&mut pre
, B("zzzy")));
408 let s
= Freqy
::reverse(B("zyzy"));
409 let mut pre
= s
.prefilter_state();
410 assert_eq
!(None
, s
.rfind_candidate(&mut pre
, B("zazb")));
412 let s
= Freqy
::reverse(B("yzyz"));
413 let mut pre
= s
.prefilter_state();
414 assert_eq
!(Some(2), s
.rfind_candidate(&mut pre
, B("yzyy")));
416 let s
= Freqy
::reverse(B("yzyz"));
417 let mut pre
= s
.prefilter_state();
418 assert_eq
!(Some(4), s
.rfind_candidate(&mut pre
, B("yyyz")));
420 let s
= Freqy
::reverse(B("yzyz"));
421 let mut pre
= s
.prefilter_state();
422 assert_eq
!(None
, s
.rfind_candidate(&mut pre
, B("yayb")));