]> git.proxmox.com Git - cargo.git/blob - vendor/bstr/src/search/prefilter.rs
New upstream version 0.35.0
[cargo.git] / vendor / bstr / src / search / prefilter.rs
1 use core::mem;
2
3 use bstr::BStr;
4 use search::byte_frequencies::BYTE_FREQUENCIES;
5
6 /// PrefilterState tracks state associated with the effectiveness of a
7 /// prefilter. It is used to track how many bytes, on average, are skipped by
8 /// the prefilter. If this average dips below a certain threshold over time,
9 /// then the state renders the prefilter inert and stops using it.
10 ///
11 /// A prefilter state should be created for each search. (Where creating an
12 /// iterator via, e.g., `find_iter`, is treated as a single search.)
13 #[derive(Clone, Debug)]
14 pub struct PrefilterState {
15 /// The number of skips that has been executed.
16 skips: usize,
17 /// The total number of bytes that have been skipped.
18 skipped: usize,
19 /// The maximum length of a match. This is used to help determine how many
20 /// bytes on average should be skipped in order for a prefilter to be
21 /// effective.
22 max_match_len: usize,
23 /// Once this heuristic has been deemed ineffective, it will be inert
24 /// throughout the rest of its lifetime. This serves as a cheap way to
25 /// check inertness.
26 inert: bool,
27 }
28
29 impl PrefilterState {
30 /// The minimum number of skip attempts to try before considering whether
31 /// a prefilter is effective or not.
32 const MIN_SKIPS: usize = 50;
33
34 /// The minimum amount of bytes that skipping must average.
35 ///
36 /// This value was chosen based on varying it and checking the bstr/find/
37 /// microbenchmarks. In particular, this can impact the
38 /// pathological/repeated-{huge,small} benchmarks quite a bit if it's
39 /// set too low.
40 const MIN_SKIP_BYTES: usize = 8;
41
42 /// Create a fresh prefilter state.
43 pub fn new(max_match_len: usize) -> PrefilterState {
44 if max_match_len == 0 {
45 return PrefilterState::inert();
46 }
47 PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
48 }
49
50 /// Create a fresh prefilter state that is always inert.
51 fn inert() -> PrefilterState {
52 PrefilterState { skips: 0, skipped: 0, max_match_len: 0, inert: true }
53 }
54
55 /// Update this state with the number of bytes skipped on the last
56 /// invocation of the prefilter.
57 #[inline]
58 pub fn update(&mut self, skipped: usize) {
59 self.skips += 1;
60 self.skipped += skipped;
61 }
62
63 /// Return true if and only if this state indicates that a prefilter is
64 /// still effective.
65 #[inline]
66 pub fn is_effective(&mut self) -> bool {
67 if self.inert {
68 return false;
69 }
70 if self.skips < PrefilterState::MIN_SKIPS {
71 return true;
72 }
73 if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips {
74 return true;
75 }
76
77 // We're inert.
78 self.inert = true;
79 false
80 }
81 }
82
83 /// A heuristic frequency based prefilter for searching a single needle.
84 ///
85 /// This prefilter attempts to pick out the byte in a needle that is predicted
86 /// to occur least frequently, and search for that using fast vectorized
87 /// routines. If a rare enough byte could not be found, then this prefilter's
88 /// constructors will return `None`.
89 ///
90 /// This can be combined with `PrefilterState` to dynamically render this
91 /// prefilter inert if it proves to ineffective.
92 #[derive(Clone, Debug)]
93 pub struct Freqy {
94 /// Whether this prefilter should be used or not.
95 inert: bool,
96 /// The length of the needle we're searching for.
97 needle_len: usize,
98 /// The rarest byte in the needle, according to pre-computed frequency
99 /// analysis.
100 rare1: u8,
101 /// The leftmost offset of the rarest byte in the needle.
102 rare1i: usize,
103 /// The second rarest byte in the needle, according to pre-computed
104 /// frequency analysis. (This may be equivalent to the rarest byte.)
105 ///
106 /// The second rarest byte is used as a type of guard for quickly detecting
107 /// a mismatch after memchr locates an instance of the rarest byte. This
108 /// is a hedge against pathological cases where the pre-computed frequency
109 /// analysis may be off. (But of course, does not prevent *all*
110 /// pathological cases.)
111 rare2: u8,
112 /// The leftmost offset of the second rarest byte in the needle.
113 rare2i: usize,
114 }
115
116 impl Freqy {
117 /// The maximum frequency rank permitted. If the rarest byte in the needle
118 /// has a frequency rank above this value, then Freqy is not used.
119 const MAX_RANK: usize = 200;
120
121 /// Return a fresh prefilter state that can be used with this prefilter. A
122 /// prefilter state is used to track the effectiveness of a prefilter for
123 /// speeding up searches. Therefore, the prefilter state should generally
124 /// be reused on subsequent searches (such as in an iterator). For searches
125 /// on a different haystack, then a new prefilter state should be used.
126 pub fn prefilter_state(&self) -> PrefilterState {
127 if self.inert {
128 PrefilterState::inert()
129 } else {
130 PrefilterState::new(self.needle_len)
131 }
132 }
133
134 /// Returns a valid but inert prefilter. This is valid for both the forward
135 /// and reverse direction.
136 ///
137 /// It is never correct to use an inert prefilter. The results of finding
138 /// the next (or previous) candidate are unspecified.
139 fn inert() -> Freqy {
140 Freqy {
141 inert: true,
142 needle_len: 0,
143 rare1: 0,
144 rare1i: 0,
145 rare2: 0,
146 rare2i: 0,
147 }
148 }
149
150 /// Return search info for the given needle in the forward direction.
151 pub fn forward(needle: &BStr) -> Freqy {
152 if needle.is_empty() {
153 return Freqy::inert();
154 }
155
156 // Find the rarest two bytes. Try to make them distinct (but it's not
157 // required).
158 let (mut rare1, mut rare1i) = (needle[0], 0);
159 let (mut rare2, mut rare2i) = (needle[0], 0);
160 if needle.len() >= 2 {
161 rare2 = needle[1];
162 rare2i = 1;
163 }
164 if Freqy::rank(rare2) < Freqy::rank(rare1) {
165 mem::swap(&mut rare1, &mut rare2);
166 mem::swap(&mut rare1i, &mut rare2i);
167 }
168 for (i, b) in needle.bytes().enumerate().skip(2) {
169 if Freqy::rank(b) < Freqy::rank(rare1) {
170 rare2 = rare1;
171 rare2i = rare1i;
172 rare1 = b;
173 rare1i = i;
174 } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
175 rare2 = b;
176 rare2i = i;
177 }
178 }
179 if Freqy::rank(rare1) > Freqy::MAX_RANK {
180 return Freqy::inert();
181 }
182 let needle_len = needle.len();
183 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
184 }
185
186 /// Return search info for the given needle in the reverse direction.
187 pub fn reverse(needle: &BStr) -> Freqy {
188 if needle.is_empty() {
189 return Freqy::inert();
190 }
191
192 // Find the rarest two bytes. Try to make them distinct (but it's not
193 // required). In reverse, the offsets correspond to the number of bytes
194 // from the end of the needle. So `0` is the last byte in the needle.
195 let (mut rare1i, mut rare2i) = (0, 0);
196 if needle.len() >= 2 {
197 rare2i += 1;
198 }
199 let mut rare1 = needle[needle.len() - rare1i - 1];
200 let mut rare2 = needle[needle.len() - rare2i - 1];
201 if Freqy::rank(rare2) < Freqy::rank(rare1) {
202 mem::swap(&mut rare1, &mut rare2);
203 mem::swap(&mut rare1i, &mut rare2i);
204 }
205 for (i, b) in needle.bytes().rev().enumerate().skip(2) {
206 if Freqy::rank(b) < Freqy::rank(rare1) {
207 rare2 = rare1;
208 rare2i = rare1i;
209 rare1 = b;
210 rare1i = i;
211 } else if b != rare1 && Freqy::rank(b) < Freqy::rank(rare2) {
212 rare2 = b;
213 rare2i = i;
214 }
215 }
216 if Freqy::rank(rare1) > Freqy::MAX_RANK {
217 return Freqy::inert();
218 }
219 let needle_len = needle.len();
220 Freqy { inert: false, needle_len, rare1, rare1i, rare2, rare2i }
221 }
222
223 /// Look for a possible occurrence of needle. The position returned
224 /// corresponds to the beginning of the occurrence, if one exists.
225 ///
226 /// Callers may assume that this never returns false negatives (i.e., it
227 /// never misses an actual occurrence), but must check that the returned
228 /// position corresponds to a match. That is, it can return false
229 /// positives.
230 ///
231 /// This should only be used when Freqy is constructed for forward
232 /// searching.
233 pub fn find_candidate(
234 &self,
235 prestate: &mut PrefilterState,
236 haystack: &BStr,
237 ) -> Option<usize> {
238 debug_assert!(!self.inert);
239
240 let mut i = 0;
241 while prestate.is_effective() {
242 // Use a fast vectorized implementation to skip to the next
243 // occurrence of the rarest byte (heuristically chosen) in the
244 // needle.
245 i += match haystack[i..].find_byte(self.rare1) {
246 None => return None,
247 Some(found) => {
248 prestate.update(found);
249 found
250 }
251 };
252
253 // If we can't align our first match with the haystack, then a
254 // match is impossible.
255 if i < self.rare1i {
256 i += 1;
257 continue;
258 }
259
260 // Align our rare2 byte with the haystack. A mismatch means that
261 // a match is impossible.
262 let aligned_rare2i = i - self.rare1i + self.rare2i;
263 if haystack.get(aligned_rare2i) != Some(&self.rare2) {
264 i += 1;
265 continue;
266 }
267
268 // We've done what we can. There might be a match here.
269 return Some(i - self.rare1i);
270 }
271 // The only way we get here is if we believe our skipping heuristic
272 // has become ineffective. We're allowed to return false positives,
273 // so return the position at which we advanced to, aligned to the
274 // haystack.
275 Some(i.saturating_sub(self.rare1i))
276 }
277
278 /// Look for a possible occurrence of needle, in reverse, starting from the
279 /// end of the given haystack. The position returned corresponds to the
280 /// position immediately after the end of the occurrence, if one exists.
281 ///
282 /// Callers may assume that this never returns false negatives (i.e., it
283 /// never misses an actual occurrence), but must check that the returned
284 /// position corresponds to a match. That is, it can return false
285 /// positives.
286 ///
287 /// This should only be used when Freqy is constructed for reverse
288 /// searching.
289 pub fn rfind_candidate(
290 &self,
291 prestate: &mut PrefilterState,
292 haystack: &BStr,
293 ) -> Option<usize> {
294 debug_assert!(!self.inert);
295
296 let mut i = haystack.len();
297 while prestate.is_effective() {
298 // Use a fast vectorized implementation to skip to the next
299 // occurrence of the rarest byte (heuristically chosen) in the
300 // needle.
301 i = match haystack[..i].rfind_byte(self.rare1) {
302 None => return None,
303 Some(found) => {
304 prestate.update(i - found);
305 found
306 }
307 };
308
309 // If we can't align our first match with the haystack, then a
310 // match is impossible.
311 if i + self.rare1i + 1 > haystack.len() {
312 continue;
313 }
314
315 // Align our rare2 byte with the haystack. A mismatch means that
316 // a match is impossible.
317 let aligned = match (i + self.rare1i).checked_sub(self.rare2i) {
318 None => continue,
319 Some(aligned) => aligned,
320 };
321 if haystack.get(aligned) != Some(&self.rare2) {
322 continue;
323 }
324
325 // We've done what we can. There might be a match here.
326 return Some(i + self.rare1i + 1);
327 }
328 // The only way we get here is if we believe our skipping heuristic
329 // has become ineffective. We're allowed to return false positives,
330 // so return the position at which we advanced to, aligned to the
331 // haystack.
332 Some(i + self.rare1i + 1)
333 }
334
335 /// Return the heuristical frequency rank of the given byte. A lower rank
336 /// means the byte is believed to occur less frequently.
337 fn rank(b: u8) -> usize {
338 BYTE_FREQUENCIES[b as usize] as usize
339 }
340 }
341
342 #[cfg(test)]
343 mod tests {
344 use bstr::B;
345 use super::*;
346
347 #[test]
348 fn freqy_forward() {
349 // N.B. We sometimes use uppercase here since that mostly ensures freqy
350 // will be constructable. Lowercase letters may be too common for freqy
351 // to work.
352
353 let s = Freqy::forward(B("BAR"));
354 let mut pre = s.prefilter_state();
355 assert_eq!(Some(0), s.find_candidate(&mut pre, B("BARFOO")));
356
357 let s = Freqy::forward(B("BAR"));
358 let mut pre = s.prefilter_state();
359 assert_eq!(Some(3), s.find_candidate(&mut pre, B("FOOBAR")));
360
361 let s = Freqy::forward(B("zyzy"));
362 let mut pre = s.prefilter_state();
363 assert_eq!(Some(0), s.find_candidate(&mut pre, B("zyzz")));
364
365 let s = Freqy::forward(B("zyzy"));
366 let mut pre = s.prefilter_state();
367 assert_eq!(Some(2), s.find_candidate(&mut pre, B("zzzy")));
368
369 let s = Freqy::forward(B("zyzy"));
370 let mut pre = s.prefilter_state();
371 assert_eq!(None, s.find_candidate(&mut pre, B("zazb")));
372
373 let s = Freqy::forward(B("yzyz"));
374 let mut pre = s.prefilter_state();
375 assert_eq!(Some(0), s.find_candidate(&mut pre, B("yzyy")));
376
377 let s = Freqy::forward(B("yzyz"));
378 let mut pre = s.prefilter_state();
379 assert_eq!(Some(2), s.find_candidate(&mut pre, B("yyyz")));
380
381 let s = Freqy::forward(B("yzyz"));
382 let mut pre = s.prefilter_state();
383 assert_eq!(None, s.find_candidate(&mut pre, B("yayb")));
384 }
385
386 #[test]
387 fn freqy_reverse() {
388 // N.B. We sometimes use uppercase here since that mostly ensures freqy
389 // will be constructable. Lowercase letters may be too common for freqy
390 // to work.
391
392 let s = Freqy::reverse(B("BAR"));
393 let mut pre = s.prefilter_state();
394 assert_eq!(Some(3), s.rfind_candidate(&mut pre, B("BARFOO")));
395
396 let s = Freqy::reverse(B("BAR"));
397 let mut pre = s.prefilter_state();
398 assert_eq!(Some(6), s.rfind_candidate(&mut pre, B("FOOBAR")));
399
400 let s = Freqy::reverse(B("zyzy"));
401 let mut pre = s.prefilter_state();
402 assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("zyzz")));
403
404 let s = Freqy::reverse(B("zyzy"));
405 let mut pre = s.prefilter_state();
406 assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("zzzy")));
407
408 let s = Freqy::reverse(B("zyzy"));
409 let mut pre = s.prefilter_state();
410 assert_eq!(None, s.rfind_candidate(&mut pre, B("zazb")));
411
412 let s = Freqy::reverse(B("yzyz"));
413 let mut pre = s.prefilter_state();
414 assert_eq!(Some(2), s.rfind_candidate(&mut pre, B("yzyy")));
415
416 let s = Freqy::reverse(B("yzyz"));
417 let mut pre = s.prefilter_state();
418 assert_eq!(Some(4), s.rfind_candidate(&mut pre, B("yyyz")));
419
420 let s = Freqy::reverse(B("yzyz"));
421 let mut pre = s.prefilter_state();
422 assert_eq!(None, s.rfind_candidate(&mut pre, B("yayb")));
423 }
424 }