]>
Commit | Line | Data |
---|---|---|
f9f354fc XL |
1 | use std::cmp; |
2 | use std::mem; | |
3 | ||
4 | use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; | |
5 | use memchr::{memchr, memchr2, memchr3}; | |
6 | use syntax::hir::literal::{Literal, Literals}; | |
7 | ||
8 | use freqs::BYTE_FREQUENCIES; | |
9 | ||
10 | /// A prefix extracted from a compiled regular expression. | |
11 | /// | |
12 | /// A regex prefix is a set of literal strings that *must* be matched at the | |
13 | /// beginning of a regex in order for the entire regex to match. Similarly | |
14 | /// for a regex suffix. | |
15 | #[derive(Clone, Debug)] | |
16 | pub struct LiteralSearcher { | |
17 | complete: bool, | |
18 | lcp: FreqyPacked, | |
19 | lcs: FreqyPacked, | |
20 | matcher: Matcher, | |
21 | } | |
22 | ||
23 | #[derive(Clone, Debug)] | |
24 | enum Matcher { | |
25 | /// No literals. (Never advances through the input.) | |
26 | Empty, | |
27 | /// A set of four or more single byte literals. | |
28 | Bytes(SingleByteSet), | |
29 | /// A single substring, find using memchr and frequency analysis. | |
30 | FreqyPacked(FreqyPacked), | |
31 | /// A single substring, find using Boyer-Moore. | |
32 | BoyerMoore(BoyerMooreSearch), | |
33 | /// An Aho-Corasick automaton. | |
34 | AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, | |
35 | /// A packed multiple substring searcher, using SIMD. | |
36 | /// | |
37 | /// Note that Aho-Corasick will actually use this packed searcher | |
38 | /// internally automatically, however, there is some overhead associated | |
39 | /// with going through the Aho-Corasick machinery. So using the packed | |
40 | /// searcher directly results in some gains. | |
41 | Packed { s: packed::Searcher, lits: Vec<Literal> }, | |
42 | } | |
43 | ||
44 | impl LiteralSearcher { | |
45 | /// Returns a matcher that never matches and never advances the input. | |
46 | pub fn empty() -> Self { | |
47 | Self::new(Literals::empty(), Matcher::Empty) | |
48 | } | |
49 | ||
50 | /// Returns a matcher for literal prefixes from the given set. | |
51 | pub fn prefixes(lits: Literals) -> Self { | |
52 | let matcher = Matcher::prefixes(&lits); | |
53 | Self::new(lits, matcher) | |
54 | } | |
55 | ||
56 | /// Returns a matcher for literal suffixes from the given set. | |
57 | pub fn suffixes(lits: Literals) -> Self { | |
58 | let matcher = Matcher::suffixes(&lits); | |
59 | Self::new(lits, matcher) | |
60 | } | |
61 | ||
62 | fn new(lits: Literals, matcher: Matcher) -> Self { | |
63 | let complete = lits.all_complete(); | |
64 | LiteralSearcher { | |
65 | complete: complete, | |
66 | lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()), | |
67 | lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()), | |
68 | matcher: matcher, | |
69 | } | |
70 | } | |
71 | ||
72 | /// Returns true if all matches comprise the entire regular expression. | |
73 | /// | |
74 | /// This does not necessarily mean that a literal match implies a match | |
5869c6ff | 75 | /// of the regular expression. For example, the regular expression `^a` |
f9f354fc XL |
76 | /// is comprised of a single complete literal `a`, but the regular |
77 | /// expression demands that it only match at the beginning of a string. | |
78 | pub fn complete(&self) -> bool { | |
79 | self.complete && !self.is_empty() | |
80 | } | |
81 | ||
82 | /// Find the position of a literal in `haystack` if it exists. | |
83 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
84 | pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { | |
85 | use self::Matcher::*; | |
86 | match self.matcher { | |
87 | Empty => Some((0, 0)), | |
88 | Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), | |
89 | FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())), | |
90 | BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), | |
91 | AC { ref ac, .. } => { | |
92 | ac.find(haystack).map(|m| (m.start(), m.end())) | |
93 | } | |
94 | Packed { ref s, .. } => { | |
95 | s.find(haystack).map(|m| (m.start(), m.end())) | |
96 | } | |
97 | } | |
98 | } | |
99 | ||
100 | /// Like find, except matches must start at index `0`. | |
101 | pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { | |
102 | for lit in self.iter() { | |
103 | if lit.len() > haystack.len() { | |
104 | continue; | |
105 | } | |
106 | if lit == &haystack[0..lit.len()] { | |
107 | return Some((0, lit.len())); | |
108 | } | |
109 | } | |
110 | None | |
111 | } | |
112 | ||
113 | /// Like find, except matches must end at index `haystack.len()`. | |
114 | pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { | |
115 | for lit in self.iter() { | |
116 | if lit.len() > haystack.len() { | |
117 | continue; | |
118 | } | |
119 | if lit == &haystack[haystack.len() - lit.len()..] { | |
120 | return Some((haystack.len() - lit.len(), haystack.len())); | |
121 | } | |
122 | } | |
123 | None | |
124 | } | |
125 | ||
126 | /// Returns an iterator over all literals to be matched. | |
127 | pub fn iter(&self) -> LiteralIter { | |
128 | match self.matcher { | |
129 | Matcher::Empty => LiteralIter::Empty, | |
130 | Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), | |
131 | Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), | |
132 | Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), | |
133 | Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), | |
134 | Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), | |
135 | } | |
136 | } | |
137 | ||
138 | /// Returns a matcher for the longest common prefix of this matcher. | |
139 | pub fn lcp(&self) -> &FreqyPacked { | |
140 | &self.lcp | |
141 | } | |
142 | ||
143 | /// Returns a matcher for the longest common suffix of this matcher. | |
144 | pub fn lcs(&self) -> &FreqyPacked { | |
145 | &self.lcs | |
146 | } | |
147 | ||
148 | /// Returns true iff this prefix is empty. | |
149 | pub fn is_empty(&self) -> bool { | |
150 | self.len() == 0 | |
151 | } | |
152 | ||
153 | /// Returns the number of prefixes in this machine. | |
154 | pub fn len(&self) -> usize { | |
155 | use self::Matcher::*; | |
156 | match self.matcher { | |
157 | Empty => 0, | |
158 | Bytes(ref sset) => sset.dense.len(), | |
159 | FreqyPacked(_) => 1, | |
160 | BoyerMoore(_) => 1, | |
161 | AC { ref ac, .. } => ac.pattern_count(), | |
162 | Packed { ref lits, .. } => lits.len(), | |
163 | } | |
164 | } | |
165 | ||
166 | /// Return the approximate heap usage of literals in bytes. | |
167 | pub fn approximate_size(&self) -> usize { | |
168 | use self::Matcher::*; | |
169 | match self.matcher { | |
170 | Empty => 0, | |
171 | Bytes(ref sset) => sset.approximate_size(), | |
172 | FreqyPacked(ref single) => single.approximate_size(), | |
173 | BoyerMoore(ref single) => single.approximate_size(), | |
174 | AC { ref ac, .. } => ac.heap_bytes(), | |
175 | Packed { ref s, .. } => s.heap_bytes(), | |
176 | } | |
177 | } | |
178 | } | |
179 | ||
180 | impl Matcher { | |
181 | fn prefixes(lits: &Literals) -> Self { | |
182 | let sset = SingleByteSet::prefixes(lits); | |
183 | Matcher::new(lits, sset) | |
184 | } | |
185 | ||
186 | fn suffixes(lits: &Literals) -> Self { | |
187 | let sset = SingleByteSet::suffixes(lits); | |
188 | Matcher::new(lits, sset) | |
189 | } | |
190 | ||
191 | fn new(lits: &Literals, sset: SingleByteSet) -> Self { | |
192 | if lits.literals().is_empty() { | |
193 | return Matcher::Empty; | |
194 | } | |
195 | if sset.dense.len() >= 26 { | |
196 | // Avoid trying to match a large number of single bytes. | |
197 | // This is *very* sensitive to a frequency analysis comparison | |
198 | // between the bytes in sset and the composition of the haystack. | |
199 | // No matter the size of sset, if its members all are rare in the | |
200 | // haystack, then it'd be worth using it. How to tune this... IDK. | |
201 | // ---AG | |
202 | return Matcher::Empty; | |
203 | } | |
204 | if sset.complete { | |
205 | return Matcher::Bytes(sset); | |
206 | } | |
207 | if lits.literals().len() == 1 { | |
208 | let lit = lits.literals()[0].to_vec(); | |
209 | if BoyerMooreSearch::should_use(lit.as_slice()) { | |
210 | return Matcher::BoyerMoore(BoyerMooreSearch::new(lit)); | |
211 | } else { | |
212 | return Matcher::FreqyPacked(FreqyPacked::new(lit)); | |
213 | } | |
214 | } | |
215 | ||
216 | let pats = lits.literals().to_owned(); | |
217 | let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; | |
218 | if lits.literals().len() <= 100 && !is_aho_corasick_fast { | |
219 | let mut builder = packed::Config::new() | |
220 | .match_kind(packed::MatchKind::LeftmostFirst) | |
221 | .builder(); | |
222 | if let Some(s) = builder.extend(&pats).build() { | |
223 | return Matcher::Packed { s, lits: pats }; | |
224 | } | |
225 | } | |
226 | let ac = AhoCorasickBuilder::new() | |
227 | .match_kind(aho_corasick::MatchKind::LeftmostFirst) | |
228 | .dfa(true) | |
229 | .build_with_size::<u32, _, _>(&pats) | |
230 | .unwrap(); | |
231 | Matcher::AC { ac, lits: pats } | |
232 | } | |
233 | } | |
234 | ||
5869c6ff | 235 | #[derive(Debug)] |
f9f354fc XL |
236 | pub enum LiteralIter<'a> { |
237 | Empty, | |
238 | Bytes(&'a [u8]), | |
239 | Single(&'a [u8]), | |
240 | AC(&'a [Literal]), | |
241 | Packed(&'a [Literal]), | |
242 | } | |
243 | ||
244 | impl<'a> Iterator for LiteralIter<'a> { | |
245 | type Item = &'a [u8]; | |
246 | ||
247 | fn next(&mut self) -> Option<Self::Item> { | |
248 | match *self { | |
249 | LiteralIter::Empty => None, | |
250 | LiteralIter::Bytes(ref mut many) => { | |
251 | if many.is_empty() { | |
252 | None | |
253 | } else { | |
254 | let next = &many[0..1]; | |
255 | *many = &many[1..]; | |
256 | Some(next) | |
257 | } | |
258 | } | |
259 | LiteralIter::Single(ref mut one) => { | |
260 | if one.is_empty() { | |
261 | None | |
262 | } else { | |
263 | let next = &one[..]; | |
264 | *one = &[]; | |
265 | Some(next) | |
266 | } | |
267 | } | |
268 | LiteralIter::AC(ref mut lits) => { | |
269 | if lits.is_empty() { | |
270 | None | |
271 | } else { | |
272 | let next = &lits[0]; | |
273 | *lits = &lits[1..]; | |
274 | Some(&**next) | |
275 | } | |
276 | } | |
277 | LiteralIter::Packed(ref mut lits) => { | |
278 | if lits.is_empty() { | |
279 | None | |
280 | } else { | |
281 | let next = &lits[0]; | |
282 | *lits = &lits[1..]; | |
283 | Some(&**next) | |
284 | } | |
285 | } | |
286 | } | |
287 | } | |
288 | } | |
289 | ||
290 | #[derive(Clone, Debug)] | |
291 | struct SingleByteSet { | |
292 | sparse: Vec<bool>, | |
293 | dense: Vec<u8>, | |
294 | complete: bool, | |
295 | all_ascii: bool, | |
296 | } | |
297 | ||
298 | impl SingleByteSet { | |
299 | fn new() -> SingleByteSet { | |
300 | SingleByteSet { | |
301 | sparse: vec![false; 256], | |
302 | dense: vec![], | |
303 | complete: true, | |
304 | all_ascii: true, | |
305 | } | |
306 | } | |
307 | ||
308 | fn prefixes(lits: &Literals) -> SingleByteSet { | |
309 | let mut sset = SingleByteSet::new(); | |
310 | for lit in lits.literals() { | |
311 | sset.complete = sset.complete && lit.len() == 1; | |
312 | if let Some(&b) = lit.get(0) { | |
313 | if !sset.sparse[b as usize] { | |
314 | if b > 0x7F { | |
315 | sset.all_ascii = false; | |
316 | } | |
317 | sset.dense.push(b); | |
318 | sset.sparse[b as usize] = true; | |
319 | } | |
320 | } | |
321 | } | |
322 | sset | |
323 | } | |
324 | ||
325 | fn suffixes(lits: &Literals) -> SingleByteSet { | |
326 | let mut sset = SingleByteSet::new(); | |
327 | for lit in lits.literals() { | |
328 | sset.complete = sset.complete && lit.len() == 1; | |
329 | if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { | |
330 | if !sset.sparse[b as usize] { | |
331 | if b > 0x7F { | |
332 | sset.all_ascii = false; | |
333 | } | |
334 | sset.dense.push(b); | |
335 | sset.sparse[b as usize] = true; | |
336 | } | |
337 | } | |
338 | } | |
339 | sset | |
340 | } | |
341 | ||
342 | /// Faster find that special cases certain sizes to use memchr. | |
343 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
344 | fn find(&self, text: &[u8]) -> Option<usize> { | |
345 | match self.dense.len() { | |
346 | 0 => None, | |
347 | 1 => memchr(self.dense[0], text), | |
348 | 2 => memchr2(self.dense[0], self.dense[1], text), | |
349 | 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), | |
350 | _ => self._find(text), | |
351 | } | |
352 | } | |
353 | ||
354 | /// Generic find that works on any sized set. | |
355 | fn _find(&self, haystack: &[u8]) -> Option<usize> { | |
356 | for (i, &b) in haystack.iter().enumerate() { | |
357 | if self.sparse[b as usize] { | |
358 | return Some(i); | |
359 | } | |
360 | } | |
361 | None | |
362 | } | |
363 | ||
364 | fn approximate_size(&self) -> usize { | |
365 | (self.dense.len() * mem::size_of::<u8>()) | |
366 | + (self.sparse.len() * mem::size_of::<bool>()) | |
367 | } | |
368 | } | |
369 | ||
370 | /// Provides an implementation of fast subtring search using frequency | |
371 | /// analysis. | |
372 | /// | |
373 | /// memchr is so fast that we do everything we can to keep the loop in memchr | |
374 | /// for as long as possible. The easiest way to do this is to intelligently | |
375 | /// pick the byte to send to memchr. The best byte is the byte that occurs | |
376 | /// least frequently in the haystack. Since doing frequency analysis on the | |
377 | /// haystack is far too expensive, we compute a set of fixed frequencies up | |
378 | /// front and hard code them in src/freqs.rs. Frequency analysis is done via | |
379 | /// scripts/frequencies.py. | |
380 | #[derive(Clone, Debug)] | |
381 | pub struct FreqyPacked { | |
382 | /// The pattern. | |
383 | pat: Vec<u8>, | |
384 | /// The number of Unicode characters in the pattern. This is useful for | |
385 | /// determining the effective length of a pattern when deciding which | |
386 | /// optimizations to perform. A trailing incomplete UTF-8 sequence counts | |
387 | /// as one character. | |
388 | char_len: usize, | |
389 | /// The rarest byte in the pattern, according to pre-computed frequency | |
390 | /// analysis. | |
391 | rare1: u8, | |
392 | /// The offset of the rarest byte in `pat`. | |
393 | rare1i: usize, | |
394 | /// The second rarest byte in the pattern, according to pre-computed | |
395 | /// frequency analysis. (This may be equivalent to the rarest byte.) | |
396 | /// | |
397 | /// The second rarest byte is used as a type of guard for quickly detecting | |
398 | /// a mismatch after memchr locates an instance of the rarest byte. This | |
399 | /// is a hedge against pathological cases where the pre-computed frequency | |
400 | /// analysis may be off. (But of course, does not prevent *all* | |
401 | /// pathological cases.) | |
402 | rare2: u8, | |
403 | /// The offset of the second rarest byte in `pat`. | |
404 | rare2i: usize, | |
405 | } | |
406 | ||
407 | impl FreqyPacked { | |
408 | fn new(pat: Vec<u8>) -> FreqyPacked { | |
409 | if pat.is_empty() { | |
410 | return FreqyPacked::empty(); | |
411 | } | |
412 | ||
413 | // Find the rarest two bytes. Try to make them distinct (but it's not | |
414 | // required). | |
415 | let mut rare1 = pat[0]; | |
416 | let mut rare2 = pat[0]; | |
417 | for b in pat[1..].iter().cloned() { | |
418 | if freq_rank(b) < freq_rank(rare1) { | |
419 | rare1 = b; | |
420 | } | |
421 | } | |
422 | for &b in &pat { | |
423 | if rare1 == rare2 { | |
424 | rare2 = b | |
425 | } else if b != rare1 && freq_rank(b) < freq_rank(rare2) { | |
426 | rare2 = b; | |
427 | } | |
428 | } | |
429 | ||
430 | // And find the offsets of their last occurrences. | |
431 | let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap(); | |
432 | let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap(); | |
433 | ||
434 | let char_len = char_len_lossy(&pat); | |
435 | FreqyPacked { | |
436 | pat: pat, | |
437 | char_len: char_len, | |
438 | rare1: rare1, | |
439 | rare1i: rare1i, | |
440 | rare2: rare2, | |
441 | rare2i: rare2i, | |
442 | } | |
443 | } | |
444 | ||
445 | fn empty() -> FreqyPacked { | |
446 | FreqyPacked { | |
447 | pat: vec![], | |
448 | char_len: 0, | |
449 | rare1: 0, | |
450 | rare1i: 0, | |
451 | rare2: 0, | |
452 | rare2i: 0, | |
453 | } | |
454 | } | |
455 | ||
456 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
457 | pub fn find(&self, haystack: &[u8]) -> Option<usize> { | |
458 | let pat = &*self.pat; | |
459 | if haystack.len() < pat.len() || pat.is_empty() { | |
460 | return None; | |
461 | } | |
462 | let mut i = self.rare1i; | |
463 | while i < haystack.len() { | |
464 | i += match memchr(self.rare1, &haystack[i..]) { | |
465 | None => return None, | |
466 | Some(i) => i, | |
467 | }; | |
468 | let start = i - self.rare1i; | |
469 | let end = start + pat.len(); | |
470 | if end > haystack.len() { | |
471 | return None; | |
472 | } | |
473 | let aligned = &haystack[start..end]; | |
474 | if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat { | |
475 | return Some(start); | |
476 | } | |
477 | i += 1; | |
478 | } | |
479 | None | |
480 | } | |
481 | ||
482 | #[cfg_attr(feature = "perf-inline", inline(always))] | |
483 | pub fn is_suffix(&self, text: &[u8]) -> bool { | |
484 | if text.len() < self.len() { | |
485 | return false; | |
486 | } | |
487 | text[text.len() - self.len()..] == *self.pat | |
488 | } | |
489 | ||
490 | pub fn len(&self) -> usize { | |
491 | self.pat.len() | |
492 | } | |
493 | ||
494 | pub fn char_len(&self) -> usize { | |
495 | self.char_len | |
496 | } | |
497 | ||
498 | fn approximate_size(&self) -> usize { | |
499 | self.pat.len() * mem::size_of::<u8>() | |
500 | } | |
501 | } | |
502 | ||
503 | fn char_len_lossy(bytes: &[u8]) -> usize { | |
504 | String::from_utf8_lossy(bytes).chars().count() | |
505 | } | |
506 | ||
507 | /// An implementation of Tuned Boyer-Moore as laid out by | |
508 | /// Andrew Hume and Daniel Sunday in "Fast String Searching". | |
509 | /// O(n) in the size of the input. | |
510 | /// | |
511 | /// Fast string searching algorithms come in many variations, | |
512 | /// but they can generally be described in terms of three main | |
513 | /// components. | |
514 | /// | |
515 | /// The skip loop is where the string searcher wants to spend | |
516 | /// as much time as possible. Exactly which character in the | |
517 | /// pattern the skip loop examines varies from algorithm to | |
518 | /// algorithm, but in the simplest case this loop repeated | |
519 | /// looks at the last character in the pattern and jumps | |
520 | /// forward in the input if it is not in the pattern. | |
521 | /// Robert Boyer and J Moore called this the "fast" loop in | |
522 | /// their original paper. | |
523 | /// | |
524 | /// The match loop is responsible for actually examining the | |
525 | /// whole potentially matching substring. In order to fail | |
526 | /// faster, the match loop sometimes has a guard test attached. | |
527 | /// The guard test uses frequency analysis of the different | |
528 | /// characters in the pattern to choose the least frequency | |
529 | /// occurring character and use it to find match failures | |
530 | /// as quickly as possible. | |
531 | /// | |
532 | /// The shift rule governs how the algorithm will shuffle its | |
533 | /// test window in the event of a failure during the match loop. | |
534 | /// Certain shift rules allow the worst-case run time of the | |
535 | /// algorithm to be shown to be O(n) in the size of the input | |
536 | /// rather than O(nm) in the size of the input and the size | |
537 | /// of the pattern (as naive Boyer-Moore is). | |
538 | /// | |
539 | /// "Fast String Searching", in addition to presenting a tuned | |
540 | /// algorithm, provides a comprehensive taxonomy of the many | |
541 | /// different flavors of string searchers. Under that taxonomy | |
542 | /// TBM, the algorithm implemented here, uses an unrolled fast | |
543 | /// skip loop with memchr fallback, a forward match loop with guard, | |
544 | /// and the mini Sunday's delta shift rule. To unpack that you'll have to | |
545 | /// read the paper. | |
546 | #[derive(Clone, Debug)] | |
547 | pub struct BoyerMooreSearch { | |
548 | /// The pattern we are going to look for in the haystack. | |
549 | pattern: Vec<u8>, | |
550 | ||
551 | /// The skip table for the skip loop. | |
552 | /// | |
553 | /// Maps the character at the end of the input | |
554 | /// to a shift. | |
555 | skip_table: Vec<usize>, | |
556 | ||
557 | /// The guard character (least frequently occurring char). | |
558 | guard: u8, | |
559 | /// The reverse-index of the guard character in the pattern. | |
560 | guard_reverse_idx: usize, | |
561 | ||
562 | /// Daniel Sunday's mini generalized delta2 shift table. | |
563 | /// | |
564 | /// We use a skip loop, so we only have to provide a shift | |
565 | /// for the skip char (last char). This is why it is a mini | |
566 | /// shift rule. | |
567 | md2_shift: usize, | |
568 | } | |
569 | ||
570 | impl BoyerMooreSearch { | |
571 | /// Create a new string searcher, performing whatever | |
572 | /// compilation steps are required. | |
573 | fn new(pattern: Vec<u8>) -> Self { | |
574 | debug_assert!(!pattern.is_empty()); | |
575 | ||
576 | let (g, gi) = Self::select_guard(pattern.as_slice()); | |
577 | let skip_table = Self::compile_skip_table(pattern.as_slice()); | |
578 | let md2_shift = Self::compile_md2_shift(pattern.as_slice()); | |
579 | BoyerMooreSearch { | |
580 | pattern: pattern, | |
581 | skip_table: skip_table, | |
582 | guard: g, | |
583 | guard_reverse_idx: gi, | |
584 | md2_shift: md2_shift, | |
585 | } | |
586 | } | |
587 | ||
588 | /// Find the pattern in `haystack`, returning the offset | |
589 | /// of the start of the first occurrence of the pattern | |
590 | /// in `haystack`. | |
591 | #[inline] | |
592 | fn find(&self, haystack: &[u8]) -> Option<usize> { | |
593 | if haystack.len() < self.pattern.len() { | |
594 | return None; | |
595 | } | |
596 | ||
597 | let mut window_end = self.pattern.len() - 1; | |
598 | ||
599 | // Inspired by the grep source. It is a way | |
600 | // to do correct loop unrolling without having to place | |
601 | // a crashpad of terminating charicters at the end in | |
602 | // the way described in the Fast String Searching paper. | |
603 | const NUM_UNROLL: usize = 10; | |
604 | // 1 for the initial position, and 1 for the md2 shift | |
605 | let short_circut = (NUM_UNROLL + 2) * self.pattern.len(); | |
606 | ||
607 | if haystack.len() > short_circut { | |
608 | // just 1 for the md2 shift | |
609 | let backstop = | |
610 | haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len()); | |
611 | loop { | |
612 | window_end = | |
613 | match self.skip_loop(haystack, window_end, backstop) { | |
614 | Some(i) => i, | |
615 | None => return None, | |
616 | }; | |
617 | if window_end >= backstop { | |
618 | break; | |
619 | } | |
620 | ||
621 | if self.check_match(haystack, window_end) { | |
622 | return Some(window_end - (self.pattern.len() - 1)); | |
623 | } else { | |
624 | let skip = self.skip_table[haystack[window_end] as usize]; | |
625 | window_end += | |
626 | if skip == 0 { self.md2_shift } else { skip }; | |
627 | continue; | |
628 | } | |
629 | } | |
630 | } | |
631 | ||
632 | // now process the input after the backstop | |
633 | while window_end < haystack.len() { | |
634 | let mut skip = self.skip_table[haystack[window_end] as usize]; | |
635 | if skip == 0 { | |
636 | if self.check_match(haystack, window_end) { | |
637 | return Some(window_end - (self.pattern.len() - 1)); | |
638 | } else { | |
639 | skip = self.md2_shift; | |
640 | } | |
641 | } | |
642 | window_end += skip; | |
643 | } | |
644 | ||
645 | None | |
646 | } | |
647 | ||
648 | fn len(&self) -> usize { | |
649 | return self.pattern.len(); | |
650 | } | |
651 | ||
652 | /// The key heuristic behind which the BoyerMooreSearch lives. | |
653 | /// | |
654 | /// See `rust-lang/regex/issues/408`. | |
655 | /// | |
656 | /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled | |
657 | /// platform-specific memchr routine with a bit of frequency | |
658 | /// analysis sprinkled on top actually wins most of the time. | |
659 | /// However, there are a few cases where Tuned Boyer-Moore still | |
660 | /// wins. | |
661 | /// | |
662 | /// If the haystack is random, frequency analysis doesn't help us, | |
663 | /// so Boyer-Moore will win for sufficiently large needles. | |
664 | /// Unfortunately, there is no obvious way to determine this | |
665 | /// ahead of time. | |
666 | /// | |
667 | /// If the pattern itself consists of very common characters, | |
668 | /// frequency analysis won't get us anywhere. The most extreme | |
669 | /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately, | |
670 | /// this case is wholly determined by the pattern, so we can actually | |
671 | /// implement the heuristic. | |
672 | /// | |
673 | /// A third case is if the pattern is sufficiently long. The idea | |
674 | /// here is that once the pattern gets long enough the Tuned | |
675 | /// Boyer-Moore skip loop will start making strides long enough | |
676 | /// to beat the asm deep magic that is memchr. | |
677 | fn should_use(pattern: &[u8]) -> bool { | |
678 | // The minimum pattern length required to use TBM. | |
679 | const MIN_LEN: usize = 9; | |
680 | // The minimum frequency rank (lower is rarer) that every byte in the | |
681 | // pattern must have in order to use TBM. That is, if the pattern | |
682 | // contains _any_ byte with a lower rank, then TBM won't be used. | |
683 | const MIN_CUTOFF: usize = 150; | |
684 | // The maximum frequency rank for any byte. | |
685 | const MAX_CUTOFF: usize = 255; | |
686 | // The scaling factor used to determine the actual cutoff frequency | |
687 | // to use (keeping in mind that the minimum frequency rank is bounded | |
688 | // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more | |
689 | // likely to be used as the pattern grows longer. That is, longer | |
690 | // patterns permit somewhat less frequent bytes than shorter patterns, | |
691 | // under the assumption that TBM gets better as the pattern gets | |
692 | // longer. | |
693 | const LEN_CUTOFF_PROPORTION: usize = 4; | |
694 | ||
695 | let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION); | |
696 | let cutoff = cmp::max( | |
697 | MIN_CUTOFF, | |
698 | MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank), | |
699 | ); | |
700 | // The pattern must be long enough to be worthwhile. e.g., memchr will | |
701 | // be faster on `e` because it is short even though e is quite common. | |
702 | pattern.len() > MIN_LEN | |
703 | // all the bytes must be more common than the cutoff. | |
704 | && pattern.iter().all(|c| freq_rank(*c) >= cutoff) | |
705 | } | |
706 | ||
707 | /// Check to see if there is a match at the given position | |
708 | #[inline] | |
709 | fn check_match(&self, haystack: &[u8], window_end: usize) -> bool { | |
710 | // guard test | |
711 | if haystack[window_end - self.guard_reverse_idx] != self.guard { | |
712 | return false; | |
713 | } | |
714 | ||
715 | // match loop | |
716 | let window_start = window_end - (self.pattern.len() - 1); | |
717 | for i in 0..self.pattern.len() { | |
718 | if self.pattern[i] != haystack[window_start + i] { | |
719 | return false; | |
720 | } | |
721 | } | |
722 | ||
723 | true | |
724 | } | |
725 | ||
726 | /// Skip forward according to the shift table. | |
727 | /// | |
728 | /// Returns the offset of the next occurrence | |
729 | /// of the last char in the pattern, or the none | |
730 | /// if it never reappears. If `skip_loop` hits the backstop | |
731 | /// it will leave early. | |
732 | #[inline] | |
733 | fn skip_loop( | |
734 | &self, | |
735 | haystack: &[u8], | |
736 | mut window_end: usize, | |
737 | backstop: usize, | |
738 | ) -> Option<usize> { | |
739 | let window_end_snapshot = window_end; | |
740 | let skip_of = |we: usize| -> usize { | |
741 | // Unsafe might make this faster, but the benchmarks | |
742 | // were hard to interpret. | |
743 | self.skip_table[haystack[we] as usize] | |
744 | }; | |
745 | ||
746 | loop { | |
747 | let mut skip = skip_of(window_end); | |
748 | window_end += skip; | |
749 | skip = skip_of(window_end); | |
750 | window_end += skip; | |
751 | if skip != 0 { | |
752 | skip = skip_of(window_end); | |
753 | window_end += skip; | |
754 | skip = skip_of(window_end); | |
755 | window_end += skip; | |
756 | skip = skip_of(window_end); | |
757 | window_end += skip; | |
758 | if skip != 0 { | |
759 | skip = skip_of(window_end); | |
760 | window_end += skip; | |
761 | skip = skip_of(window_end); | |
762 | window_end += skip; | |
763 | skip = skip_of(window_end); | |
764 | window_end += skip; | |
765 | if skip != 0 { | |
766 | skip = skip_of(window_end); | |
767 | window_end += skip; | |
768 | skip = skip_of(window_end); | |
769 | window_end += skip; | |
770 | ||
771 | // If ten iterations did not make at least 16 words | |
772 | // worth of progress, we just fall back on memchr. | |
773 | if window_end - window_end_snapshot | |
774 | > 16 * mem::size_of::<usize>() | |
775 | { | |
776 | // Returning a window_end >= backstop will | |
777 | // immediatly break us out of the inner loop in | |
778 | // `find`. | |
779 | if window_end >= backstop { | |
780 | return Some(window_end); | |
781 | } | |
782 | ||
783 | continue; // we made enough progress | |
784 | } else { | |
785 | // In case we are already there, and so that | |
786 | // we will catch the guard char. | |
787 | window_end = window_end | |
788 | .checked_sub(1 + self.guard_reverse_idx) | |
789 | .unwrap_or(0); | |
790 | ||
791 | match memchr(self.guard, &haystack[window_end..]) { | |
792 | None => return None, | |
793 | Some(g_idx) => { | |
794 | return Some( | |
795 | window_end | |
796 | + g_idx | |
797 | + self.guard_reverse_idx, | |
798 | ); | |
799 | } | |
800 | } | |
801 | } | |
802 | } | |
803 | } | |
804 | } | |
805 | ||
806 | return Some(window_end); | |
807 | } | |
808 | } | |
809 | ||
810 | /// Compute the ufast skip table. | |
811 | fn compile_skip_table(pattern: &[u8]) -> Vec<usize> { | |
812 | let mut tab = vec![pattern.len(); 256]; | |
813 | ||
814 | // For every char in the pattern, we write a skip | |
815 | // that will line us up with the rightmost occurrence. | |
816 | // | |
817 | // N.B. the sentinel (0) is written by the last | |
818 | // loop iteration. | |
819 | for (i, c) in pattern.iter().enumerate() { | |
820 | tab[*c as usize] = (pattern.len() - 1) - i; | |
821 | } | |
822 | ||
823 | tab | |
824 | } | |
825 | ||
826 | /// Select the guard character based off of the precomputed | |
827 | /// frequency table. | |
828 | fn select_guard(pattern: &[u8]) -> (u8, usize) { | |
829 | let mut rarest = pattern[0]; | |
830 | let mut rarest_rev_idx = pattern.len() - 1; | |
831 | for (i, c) in pattern.iter().enumerate() { | |
832 | if freq_rank(*c) < freq_rank(rarest) { | |
833 | rarest = *c; | |
834 | rarest_rev_idx = (pattern.len() - 1) - i; | |
835 | } | |
836 | } | |
837 | ||
838 | (rarest, rarest_rev_idx) | |
839 | } | |
840 | ||
841 | /// If there is another occurrence of the skip | |
842 | /// char, shift to it, otherwise just shift to | |
843 | /// the next window. | |
844 | fn compile_md2_shift(pattern: &[u8]) -> usize { | |
845 | let shiftc = *pattern.last().unwrap(); | |
846 | ||
847 | // For a pattern of length 1 we will never apply the | |
848 | // shift rule, so we use a poison value on the principle | |
849 | // that failing fast is a good thing. | |
850 | if pattern.len() == 1 { | |
851 | return 0xDEADBEAF; | |
852 | } | |
853 | ||
854 | let mut i = pattern.len() - 2; | |
855 | while i > 0 { | |
856 | if pattern[i] == shiftc { | |
857 | return (pattern.len() - 1) - i; | |
858 | } | |
859 | i -= 1; | |
860 | } | |
861 | ||
862 | // The skip char never re-occurs in the pattern, so | |
863 | // we can just shift the whole window length. | |
864 | pattern.len() - 1 | |
865 | } | |
866 | ||
867 | fn approximate_size(&self) -> usize { | |
868 | (self.pattern.len() * mem::size_of::<u8>()) | |
869 | + (256 * mem::size_of::<usize>()) // skip table | |
870 | } | |
871 | } | |
872 | ||
873 | fn freq_rank(b: u8) -> usize { | |
874 | BYTE_FREQUENCIES[b as usize] as usize | |
875 | } | |
876 | ||
877 | #[cfg(test)] | |
878 | mod tests { | |
879 | use super::{BoyerMooreSearch, FreqyPacked}; | |
880 | ||
881 | // | |
882 | // Unit Tests | |
883 | // | |
884 | ||
885 | // The "hello, world" of string searching | |
886 | #[test] | |
887 | fn bm_find_subs() { | |
888 | let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); | |
889 | let haystack = b"I keep seeing patterns in this text"; | |
890 | assert_eq!(14, searcher.find(haystack).unwrap()); | |
891 | } | |
892 | ||
893 | #[test] | |
894 | fn bm_find_no_subs() { | |
895 | let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); | |
896 | let haystack = b"I keep seeing needles in this text"; | |
897 | assert_eq!(None, searcher.find(haystack)); | |
898 | } | |
899 | ||
900 | // | |
901 | // Regression Tests | |
902 | // | |
903 | ||
904 | #[test] | |
905 | fn bm_skip_reset_bug() { | |
906 | let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0]; | |
907 | let needle = vec![0, 1, 1, 0]; | |
908 | ||
909 | let searcher = BoyerMooreSearch::new(needle); | |
910 | let offset = searcher.find(haystack.as_slice()).unwrap(); | |
911 | assert_eq!(4, offset); | |
912 | } | |
913 | ||
914 | #[test] | |
915 | fn bm_backstop_underflow_bug() { | |
916 | let haystack = vec![0, 0]; | |
917 | let needle = vec![0, 0]; | |
918 | ||
919 | let searcher = BoyerMooreSearch::new(needle); | |
920 | let offset = searcher.find(haystack.as_slice()).unwrap(); | |
921 | assert_eq!(0, offset); | |
922 | } | |
923 | ||
924 | #[test] | |
925 | fn bm_naive_off_by_one_bug() { | |
926 | let haystack = vec![91]; | |
927 | let needle = vec![91]; | |
928 | ||
929 | let naive_offset = naive_find(&needle, &haystack).unwrap(); | |
930 | assert_eq!(0, naive_offset); | |
931 | } | |
932 | ||
933 | #[test] | |
934 | fn bm_memchr_fallback_indexing_bug() { | |
935 | let mut haystack = vec![ | |
936 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
937 | 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
938 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
939 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
940 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
941 | ]; | |
942 | let needle = vec![1, 1, 1, 1, 32, 32, 87]; | |
943 | let needle_start = haystack.len(); | |
944 | haystack.extend(needle.clone()); | |
945 | ||
946 | let searcher = BoyerMooreSearch::new(needle); | |
947 | assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap()); | |
948 | } | |
949 | ||
950 | #[test] | |
951 | fn bm_backstop_boundary() { | |
952 | let haystack = b"\ | |
953 | // aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | |
954 | e_data.clone_created(entity_id, entity_to_add.entity_id); | |
955 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | |
956 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | |
957 | " | |
958 | .to_vec(); | |
959 | let needle = b"clone_created".to_vec(); | |
960 | ||
961 | let searcher = BoyerMooreSearch::new(needle); | |
962 | let result = searcher.find(&haystack); | |
963 | assert_eq!(Some(43), result); | |
964 | } | |
965 | ||
966 | #[test] | |
967 | fn bm_win_gnu_indexing_bug() { | |
968 | let haystack_raw = vec![ | |
969 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
970 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
971 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
972 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
973 | ]; | |
974 | let needle = vec![1, 1, 1, 1, 1, 1, 1]; | |
975 | let haystack = haystack_raw.as_slice(); | |
976 | ||
977 | BoyerMooreSearch::new(needle.clone()).find(haystack); | |
978 | } | |
979 | ||
980 | // | |
981 | // QuickCheck Properties | |
982 | // | |
983 | ||
984 | use quickcheck::TestResult; | |
985 | ||
986 | fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> { | |
987 | assert!(needle.len() <= haystack.len()); | |
988 | ||
989 | for i in 0..(haystack.len() - (needle.len() - 1)) { | |
990 | if haystack[i] == needle[0] | |
991 | && &haystack[i..(i + needle.len())] == needle | |
992 | { | |
993 | return Some(i); | |
994 | } | |
995 | } | |
996 | ||
997 | None | |
998 | } | |
999 | ||
1000 | quickcheck! { | |
1001 | fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { | |
1002 | if pile1.len() == 0 || pile2.len() == 0 { | |
1003 | return TestResult::discard(); | |
1004 | } | |
1005 | ||
1006 | let (needle, haystack) = if pile1.len() < pile2.len() { | |
1007 | (pile1, pile2.as_slice()) | |
1008 | } else { | |
1009 | (pile2, pile1.as_slice()) | |
1010 | }; | |
1011 | ||
1012 | let searcher = BoyerMooreSearch::new(needle.clone()); | |
1013 | TestResult::from_bool( | |
1014 | searcher.find(haystack) == naive_find(&needle, haystack)) | |
1015 | } | |
1016 | ||
1017 | fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { | |
1018 | if pile1.len() == 0 || pile2.len() == 0 { | |
1019 | return TestResult::discard(); | |
1020 | } | |
1021 | ||
1022 | let (needle, haystack) = if pile1.len() < pile2.len() { | |
1023 | (pile1, pile2.as_slice()) | |
1024 | } else { | |
1025 | (pile2, pile1.as_slice()) | |
1026 | }; | |
1027 | ||
1028 | let bm_searcher = BoyerMooreSearch::new(needle.clone()); | |
1029 | let freqy_memchr = FreqyPacked::new(needle); | |
1030 | TestResult::from_bool( | |
1031 | bm_searcher.find(haystack) == freqy_memchr.find(haystack)) | |
1032 | } | |
1033 | ||
1034 | fn qc_bm_finds_trailing_needle( | |
1035 | haystack_pre: Vec<u8>, | |
1036 | needle: Vec<u8> | |
1037 | ) -> TestResult { | |
1038 | if needle.len() == 0 { | |
1039 | return TestResult::discard(); | |
1040 | } | |
1041 | ||
1042 | let mut haystack = haystack_pre.clone(); | |
1043 | let searcher = BoyerMooreSearch::new(needle.clone()); | |
1044 | ||
1045 | if haystack.len() >= needle.len() && | |
1046 | searcher.find(haystack.as_slice()).is_some() { | |
1047 | return TestResult::discard(); | |
1048 | } | |
1049 | ||
1050 | haystack.extend(needle.clone()); | |
1051 | ||
1052 | // What if the the tail of the haystack can start the | |
1053 | // needle? | |
1054 | let start = haystack_pre.len() | |
1055 | .checked_sub(needle.len()) | |
1056 | .unwrap_or(0); | |
1057 | for i in 0..(needle.len() - 1) { | |
1058 | if searcher.find(&haystack[(i + start)..]).is_some() { | |
1059 | return TestResult::discard(); | |
1060 | } | |
1061 | } | |
1062 | ||
1063 | TestResult::from_bool( | |
1064 | searcher.find(haystack.as_slice()) | |
1065 | .map(|x| x == haystack_pre.len()) | |
1066 | .unwrap_or(false)) | |
1067 | } | |
1068 | ||
1069 | // qc_equals_* is only testing the negative case as @burntsushi | |
1070 | // pointed out in https://github.com/rust-lang/regex/issues/446. | |
1071 | // This quickcheck prop represents an effort to force testing of | |
1072 | // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle | |
1073 | // already check some of the positive cases, but they don't cover | |
1074 | // cases where the needle is in the middle of haystack. This prop | |
1075 | // fills that hole. | |
1076 | fn qc_bm_finds_subslice( | |
1077 | haystack: Vec<u8>, | |
1078 | needle_start: usize, | |
1079 | needle_length: usize | |
1080 | ) -> TestResult { | |
1081 | if haystack.len() == 0 { | |
1082 | return TestResult::discard(); | |
1083 | } | |
1084 | ||
1085 | let needle_start = needle_start % haystack.len(); | |
1086 | let needle_length = needle_length % (haystack.len() - needle_start); | |
1087 | ||
1088 | if needle_length == 0 { | |
1089 | return TestResult::discard(); | |
1090 | } | |
1091 | ||
1092 | let needle = &haystack[needle_start..(needle_start + needle_length)]; | |
1093 | ||
1094 | let bm_searcher = BoyerMooreSearch::new(needle.to_vec()); | |
1095 | ||
1096 | let start = naive_find(&needle, &haystack); | |
1097 | match start { | |
1098 | None => TestResult::from_bool(false), | |
1099 | Some(nf_start) => | |
1100 | TestResult::from_bool( | |
1101 | nf_start <= needle_start | |
1102 | && bm_searcher.find(&haystack) == start | |
1103 | ) | |
1104 | } | |
1105 | } | |
1106 | ||
1107 | fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult { | |
1108 | if needle.len() == 0 { | |
1109 | return TestResult::discard(); | |
1110 | } | |
1111 | ||
1112 | let mut haystack = needle.clone(); | |
1113 | let searcher = BoyerMooreSearch::new(needle.clone()); | |
1114 | haystack.extend(needle); | |
1115 | ||
1116 | TestResult::from_bool( | |
1117 | searcher.find(haystack.as_slice()) | |
1118 | .map(|x| x == 0) | |
1119 | .unwrap_or(false)) | |
1120 | } | |
1121 | } | |
1122 | } |