]> git.proxmox.com Git - rustc.git/blame - vendor/regex-0.2.11/src/literal/mod.rs
New upstream version 1.33.0+dfsg1
[rustc.git] / vendor / regex-0.2.11 / src / literal / mod.rs
CommitLineData
94b46f34
XL
1// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use std::cmp;
12use std::mem;
13
14use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
15use memchr::{memchr, memchr2, memchr3};
16use syntax::hir::literal::{Literal, Literals};
17
18use freqs::BYTE_FREQUENCIES;
19use self::teddy_avx2::{Teddy as TeddyAVX2};
20use self::teddy_ssse3::{Teddy as TeddySSSE3};
21
22mod teddy_avx2;
23mod teddy_ssse3;
24
/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    /// True when every literal in `matcher` corresponds to a match of the
    /// entire regex (see `complete()` for the caveat about anchors).
    complete: bool,
    /// Single-substring matcher for the longest common prefix of all
    /// literals.
    lcp: FreqyPacked,
    /// Single-substring matcher for the longest common suffix of all
    /// literals.
    lcs: FreqyPacked,
    /// The underlying search strategy, chosen when the searcher is built.
    matcher: Matcher,
}
37
/// The concrete strategy used to search for literals in a haystack.
#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of four or more single byte literals.
    Bytes(SingleByteSet),
    /// A single substring, find using memchr and frequency analysis.
    FreqyPacked(FreqyPacked),
    /// A single substring, find using Boyer-Moore.
    BoyerMoore(BoyerMooreSearch),
    /// An Aho-Corasick automaton.
    AC(FullAcAutomaton<Literal>),
    /// A simd accelerated multiple string matcher. Used only for a small
    /// number of small literals.
    TeddySSSE3(TeddySSSE3),
    /// A simd accelerated multiple string matcher. Used only for a small
    /// number of small literals. This uses 256-bit vectors.
    TeddyAVX2(TeddyAVX2),
}
57
impl LiteralSearcher {
    /// Returns a matcher that never matches and never advances the input.
    pub fn empty() -> Self {
        Self::new(Literals::empty(), Matcher::Empty)
    }

    /// Returns a matcher for literal prefixes from the given set.
    pub fn prefixes(lits: Literals) -> Self {
        let matcher = Matcher::prefixes(&lits);
        Self::new(lits, matcher)
    }

    /// Returns a matcher for literal suffixes from the given set.
    pub fn suffixes(lits: Literals) -> Self {
        let matcher = Matcher::suffixes(&lits);
        Self::new(lits, matcher)
    }

    /// Builds the searcher, caching the longest common prefix/suffix of the
    /// literal set as dedicated single-substring matchers.
    fn new(lits: Literals, matcher: Matcher) -> Self {
        let complete = lits.all_complete();
        LiteralSearcher {
            complete: complete,
            lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
            lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
            matcher: matcher,
        }
    }

    /// Returns true if all matches comprise the entire regular expression.
    ///
    /// This does not necessarily mean that a literal match implies a match
    /// of the regular expression. For example, the regular expression `^a`
    /// is comprised of a single complete literal `a`, but the regular
    /// expression demands that it only match at the beginning of a string.
    pub fn complete(&self) -> bool {
        self.complete && !self.is_empty()
    }

    /// Find the position of a literal in `haystack` if it exists.
    #[inline(always)] // reduces constant overhead
    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        use self::Matcher::*;
        match self.matcher {
            Empty => Some((0, 0)),
            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
            FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
            TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
            TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
        }
    }

    /// Like find, except matches must start at index `0`.
    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        // Linear scan over the literal set; the set is expected to be small.
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[0..lit.len()] {
                return Some((0, lit.len()));
            }
        }
        None
    }

    /// Like find, except matches must end at index `haystack.len()`.
    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[haystack.len() - lit.len()..] {
                return Some((haystack.len() - lit.len(), haystack.len()));
            }
        }
        None
    }

    /// Returns an iterator over all literals to be matched.
    pub fn iter(&self) -> LiteralIter {
        match self.matcher {
            Matcher::Empty => LiteralIter::Empty,
            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
            Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
            Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
            Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
            Matcher::TeddySSSE3(ref ted) => {
                LiteralIter::TeddySSSE3(ted.patterns())
            }
            Matcher::TeddyAVX2(ref ted) => {
                LiteralIter::TeddyAVX2(ted.patterns())
            }
        }
    }

    /// Returns a matcher for the longest common prefix of this matcher.
    pub fn lcp(&self) -> &FreqyPacked {
        &self.lcp
    }

    /// Returns a matcher for the longest common suffix of this matcher.
    pub fn lcs(&self) -> &FreqyPacked {
        &self.lcs
    }

    /// Returns true iff this prefix is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of prefixes in this machine.
    pub fn len(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.dense.len(),
            FreqyPacked(_) => 1,
            BoyerMoore(_) => 1,
            AC(ref aut) => aut.len(),
            TeddySSSE3(ref ted) => ted.len(),
            TeddyAVX2(ref ted) => ted.len(),
        }
    }

    /// Return the approximate heap usage of literals in bytes.
    pub fn approximate_size(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.approximate_size(),
            FreqyPacked(ref single) => single.approximate_size(),
            BoyerMoore(ref single) => single.approximate_size(),
            AC(ref aut) => aut.heap_bytes(),
            TeddySSSE3(ref ted) => ted.approximate_size(),
            TeddyAVX2(ref ted) => ted.approximate_size(),
        }
    }
}
197
impl Matcher {
    /// Builds a matching strategy from the leading bytes of `lits`.
    fn prefixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    /// Builds a matching strategy from the trailing bytes of `lits`.
    fn suffixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    /// Selects the best available matching strategy for `lits`, using the
    /// single-byte set `sset` derived from it to guide the heuristics.
    /// The order of the checks below is significant.
    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members all are rare in the
            // haystack, then it'd be worth using it. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            // Every literal is a single byte, so memchr-style byte-set
            // search is exact.
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            let lit = lits.literals()[0].to_vec();
            if BoyerMooreSearch::should_use(lit.as_slice()) {
                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
            } else {
                return Matcher::FreqyPacked(FreqyPacked::new(lit));
            }
        }
        let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii;
        if TeddyAVX2::available() && !is_aho_corasick_fast {
            const MAX_TEDDY_LITERALS: usize = 32;
            if lits.literals().len() <= MAX_TEDDY_LITERALS {
                if let Some(ted) = TeddyAVX2::new(lits) {
                    return Matcher::TeddyAVX2(ted);
                }
            }
        }
        if TeddySSSE3::available() && !is_aho_corasick_fast {
            // Only try Teddy if Aho-Corasick can't use memchr on an ASCII
            // byte. Also, in its current form, Teddy doesn't scale well to
            // lots of literals.
            //
            // We impose the ASCII restriction since an alternation of
            // non-ASCII string literals in the same language is likely to all
            // start with the same byte. Even worse, the corpus being searched
            // probably has a similar composition, which ends up completely
            // negating the benefit of memchr.
            const MAX_TEDDY_LITERALS: usize = 32;
            if lits.literals().len() <= MAX_TEDDY_LITERALS {
                if let Some(ted) = TeddySSSE3::new(lits) {
                    return Matcher::TeddySSSE3(ted);
                }
            }
            // Fallthrough to ol' reliable Aho-Corasick...
        }
        let pats = lits.literals().to_owned();
        Matcher::AC(AcAutomaton::new(pats).into_full())
    }
}
264
/// An iterator over the literals stored in a `Matcher`.
///
/// Each item yielded is a single literal as a byte slice.
pub enum LiteralIter<'a> {
    /// No literals to yield.
    Empty,
    /// Single-byte literals; each byte is yielded as a one-byte slice.
    Bytes(&'a [u8]),
    /// Exactly one substring literal.
    Single(&'a [u8]),
    /// The literals stored in an Aho-Corasick automaton.
    AC(&'a [Literal]),
    /// The literals used by the SSSE3 Teddy matcher.
    TeddySSSE3(&'a [Vec<u8>]),
    /// The literals used by the AVX2 Teddy matcher.
    TeddyAVX2(&'a [Vec<u8>]),
}
273
274impl<'a> Iterator for LiteralIter<'a> {
275 type Item = &'a [u8];
276
277 fn next(&mut self) -> Option<Self::Item> {
278 match *self {
279 LiteralIter::Empty => None,
280 LiteralIter::Bytes(ref mut many) => {
281 if many.is_empty() {
282 None
283 } else {
284 let next = &many[0..1];
285 *many = &many[1..];
286 Some(next)
287 }
288 }
289 LiteralIter::Single(ref mut one) => {
290 if one.is_empty() {
291 None
292 } else {
293 let next = &one[..];
294 *one = &[];
295 Some(next)
296 }
297 }
298 LiteralIter::AC(ref mut lits) => {
299 if lits.is_empty() {
300 None
301 } else {
302 let next = &lits[0];
303 *lits = &lits[1..];
304 Some(&**next)
305 }
306 }
307 LiteralIter::TeddySSSE3(ref mut lits) => {
308 if lits.is_empty() {
309 None
310 } else {
311 let next = &lits[0];
312 *lits = &lits[1..];
313 Some(&**next)
314 }
315 }
316 LiteralIter::TeddyAVX2(ref mut lits) => {
317 if lits.is_empty() {
318 None
319 } else {
320 let next = &lits[0];
321 *lits = &lits[1..];
322 Some(&**next)
323 }
324 }
325 }
326 }
327}
328
/// A set of single bytes, with both sparse (membership) and dense
/// (enumeration) representations.
#[derive(Clone, Debug)]
struct SingleByteSet {
    /// Membership table indexed by byte value (length 256).
    sparse: Vec<bool>,
    /// The distinct bytes in the set, in insertion order.
    dense: Vec<u8>,
    /// True when every literal contributing to this set has length 1, so a
    /// byte match is a complete literal match.
    complete: bool,
    /// True when every byte in the set is ASCII (<= 0x7F).
    all_ascii: bool,
}
336
337impl SingleByteSet {
338 fn new() -> SingleByteSet {
339 SingleByteSet {
340 sparse: vec![false; 256],
341 dense: vec![],
342 complete: true,
343 all_ascii: true,
344 }
345 }
346
347 fn prefixes(lits: &Literals) -> SingleByteSet {
348 let mut sset = SingleByteSet::new();
349 for lit in lits.literals() {
350 sset.complete = sset.complete && lit.len() == 1;
351 if let Some(&b) = lit.get(0) {
352 if !sset.sparse[b as usize] {
353 if b > 0x7F {
354 sset.all_ascii = false;
355 }
356 sset.dense.push(b);
357 sset.sparse[b as usize] = true;
358 }
359 }
360 }
361 sset
362 }
363
364 fn suffixes(lits: &Literals) -> SingleByteSet {
365 let mut sset = SingleByteSet::new();
366 for lit in lits.literals() {
367 sset.complete = sset.complete && lit.len() == 1;
368 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
369 if !sset.sparse[b as usize] {
370 if b > 0x7F {
371 sset.all_ascii = false;
372 }
373 sset.dense.push(b);
374 sset.sparse[b as usize] = true;
375 }
376 }
377 }
378 sset
379 }
380
381 /// Faster find that special cases certain sizes to use memchr.
382 #[inline(always)] // reduces constant overhead
383 fn find(&self, text: &[u8]) -> Option<usize> {
384 match self.dense.len() {
385 0 => None,
386 1 => memchr(self.dense[0], text),
387 2 => memchr2(self.dense[0], self.dense[1], text),
388 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
389 _ => self._find(text),
390 }
391 }
392
393 /// Generic find that works on any sized set.
394 fn _find(&self, haystack: &[u8]) -> Option<usize> {
395 for (i, &b) in haystack.iter().enumerate() {
396 if self.sparse[b as usize] {
397 return Some(i);
398 }
399 }
400 None
401 }
402
403 fn approximate_size(&self) -> usize {
404 (self.dense.len() * mem::size_of::<u8>())
405 + (self.sparse.len() * mem::size_of::<bool>())
406 }
407}
408
/// Provides an implementation of fast substring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}
445
impl FreqyPacked {
    /// Builds a searcher for `pat`, pre-computing its two rarest bytes and
    /// the offsets of their last occurrences.
    fn new(pat: Vec<u8>) -> FreqyPacked {
        if pat.is_empty() {
            return FreqyPacked::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        // Second pass: pick the rarest byte distinct from rare1 if one
        // exists; otherwise rare2 may equal rare1.
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        FreqyPacked {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    /// A searcher for the empty pattern; `find` always returns `None`.
    fn empty() -> FreqyPacked {
        FreqyPacked {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    /// Find the start offset of the first occurrence of the pattern in
    /// `haystack`, using memchr on the rarest byte and a second-rarest-byte
    /// guard before doing a full comparison.
    #[inline(always)] // reduces constant overhead
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        // `i` tracks candidate positions of rare1 in the haystack; starting
        // at rare1i ensures the window never starts before index 0.
        let mut i = self.rare1i;
        while i < haystack.len() {
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(i) => i,
            };
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    /// Returns true iff `text` ends with the pattern.
    #[inline(always)] // reduces constant overhead
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        text[text.len() - self.len()..] == *self.pat
    }

    /// Length of the pattern in bytes.
    pub fn len(&self) -> usize {
        self.pat.len()
    }

    /// Length of the pattern in (lossily decoded) Unicode characters.
    pub fn char_len(&self) -> usize {
        self.char_len
    }

    /// Return the approximate heap usage of this searcher in bytes.
    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}
541
/// Count the Unicode characters in `bytes`, decoding lossily: each maximal
/// invalid UTF-8 sequence counts as one replacement character.
fn char_len_lossy(bytes: &[u8]) -> usize {
    let decoded = String::from_utf8_lossy(bytes);
    decoded.chars().count()
}
545
/// An implementation of Tuned Boyer-Moore as laid out by
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
/// O(n) in the size of the input.
///
/// Fast string searching algorithms come in many variations,
/// but they can generally be described in terms of three main
/// components.
///
/// The skip loop is where the string searcher wants to spend
/// as much time as possible. Exactly which character in the
/// pattern the skip loop examines varies from algorithm to
/// algorithm, but in the simplest case this loop repeatedly
/// looks at the last character in the pattern and jumps
/// forward in the input if it is not in the pattern.
/// Robert Boyer and J Moore called this the "fast" loop in
/// their original paper.
///
/// The match loop is responsible for actually examining the
/// whole potentially matching substring. In order to fail
/// faster, the match loop sometimes has a guard test attached.
/// The guard test uses frequency analysis of the different
/// characters in the pattern to choose the least frequently
/// occurring character and use it to find match failures
/// as quickly as possible.
///
/// The shift rule governs how the algorithm will shuffle its
/// test window in the event of a failure during the match loop.
/// Certain shift rules allow the worst-case run time of the
/// algorithm to be shown to be O(n) in the size of the input
/// rather than O(nm) in the size of the input and the size
/// of the pattern (as naive Boyer-Moore is).
///
/// "Fast String Searching", in addition to presenting a tuned
/// algorithm, provides a comprehensive taxonomy of the many
/// different flavors of string searchers. Under that taxonomy
/// TBM, the algorithm implemented here, uses an unrolled fast
/// skip loop with memchr fallback, a forward match loop with guard,
/// and the mini Sunday's delta shift rule. To unpack that you'll have to
/// read the paper.
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift.
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
608
impl BoyerMooreSearch {
    /// Create a new string searcher, performing whatever
    /// compilation steps are required.
    ///
    /// The pattern must be non-empty.
    fn new(pattern: Vec<u8>) -> Self {
        debug_assert!(pattern.len() > 0);

        let (g, gi) = Self::select_guard(pattern.as_slice());
        let skip_table = Self::compile_skip_table(pattern.as_slice());
        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
        BoyerMooreSearch {
            pattern: pattern,
            skip_table: skip_table,
            guard: g,
            guard_reverse_idx: gi,
            md2_shift: md2_shift,
        }
    }

    /// Find the pattern in `haystack`, returning the offset
    /// of the start of the first occurrence of the pattern
    /// in `haystack`.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        if haystack.len() < self.pattern.len() {
            return None;
        }

        // `window_end` is the index of the last byte of the current window.
        let mut window_end = self.pattern.len() - 1;

        // Inspired by the grep source. It is a way
        // to do correct loop unrolling without having to place
        // a crashpad of terminating characters at the end in
        // the way described in the Fast String Searching paper.
        const NUM_UNROLL: usize = 10;
        // 1 for the initial position, and 1 for the md2 shift
        let short_circut = (NUM_UNROLL + 2) * self.pattern.len();

        if haystack.len() > short_circut {
            // just 1 for the md2 shift
            let backstop = haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
            loop {
                window_end = match self.skip_loop(haystack, window_end, backstop) {
                    Some(i) => i,
                    None => return None,
                };
                if window_end >= backstop {
                    break;
                }

                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    let skip = self.skip_table[haystack[window_end] as usize];
                    window_end +=
                        if skip == 0 { self.md2_shift } else { skip };
                    continue;
                }
            }
        }

        // now process the input after the backstop
        while window_end < haystack.len() {
            let mut skip = self.skip_table[haystack[window_end] as usize];
            if skip == 0 {
                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    skip = self.md2_shift;
                }
            }
            window_end += skip;
        }

        None
    }

    /// Length of the pattern in bytes.
    fn len(&self) -> usize {
        return self.pattern.len()
    }

    /// The key heuristic behind which the BoyerMooreSearch lives.
    ///
    /// See `rust-lang/regex/issues/408`.
    ///
    /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
    /// platform-specific memchr routine with a bit of frequency
    /// analysis sprinkled on top actually wins most of the time.
    /// However, there are a few cases where Tuned Boyer-Moore still
    /// wins.
    ///
    /// If the haystack is random, frequency analysis doesn't help us,
    /// so Boyer-Moore will win for sufficiently large needles.
    /// Unfortunately, there is no obvious way to determine this
    /// ahead of time.
    ///
    /// If the pattern itself consists of very common characters,
    /// frequency analysis won't get us anywhere. The most extreme
    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
    /// this case is wholly determined by the pattern, so we can actually
    /// implement the heuristic.
    ///
    /// A third case is if the pattern is sufficiently long. The idea
    /// here is that once the pattern gets long enough the Tuned
    /// Boyer-Moore skip loop will start making strides long enough
    /// to beat the asm deep magic that is memchr.
    fn should_use(pattern: &[u8]) -> bool {
        // The minimum pattern length required to use TBM.
        const MIN_LEN: usize = 9;
        // The minimum frequency rank (lower is rarer) that every byte in the
        // pattern must have in order to use TBM. That is, if the pattern
        // contains _any_ byte with a lower rank, then TBM won't be used.
        const MIN_CUTOFF: usize = 150;
        // The maximum frequency rank for any byte.
        const MAX_CUTOFF: usize = 255;
        // The scaling factor used to determine the actual cutoff frequency
        // to use (keeping in mind that the minimum frequency rank is bounded
        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
        // likely to be used as the pattern grows longer. That is, longer
        // patterns permit somewhat less frequent bytes than shorter patterns,
        // under the assumption that TBM gets better as the pattern gets
        // longer.
        const LEN_CUTOFF_PROPORTION: usize = 4;

        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
        let cutoff = cmp::max(
            MIN_CUTOFF,
            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
        );
        // The pattern must be long enough to be worthwhile. e.g., memchr will
        // be faster on `e` because it is short even though e is quite common.
        pattern.len() > MIN_LEN
            // all the bytes must be more common than the cutoff.
            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
    }

    /// Check to see if there is a match at the given position
    #[inline]
    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
        // guard test
        if haystack[window_end - self.guard_reverse_idx] != self.guard {
            return false;
        }

        // match loop
        let window_start = window_end - (self.pattern.len() - 1);
        for i in 0..self.pattern.len() {
            if self.pattern[i] != haystack[window_start + i] {
                return false;
            }
        }

        true
    }

    /// Skip forward according to the shift table.
    ///
    /// Returns the offset of the next occurrence
    /// of the last char in the pattern, or the none
    /// if it never reappears. If `skip_loop` hits the backstop
    /// it will leave early.
    #[inline]
    fn skip_loop(&self,
        haystack: &[u8],
        mut window_end: usize,
        backstop: usize,
    ) -> Option<usize> {
        use std::mem;

        let window_end_snapshot = window_end;
        let skip_of = |we: usize| -> usize {
            // Unsafe might make this faster, but the benchmarks
            // were hard to interpret.
            self.skip_table[haystack[we] as usize]
        };

        loop {
            // A zero skip means the last byte of the window matched the
            // last byte of the pattern; the nested `if skip != 0` checks
            // bail out of the unrolled runs to report that position.
            let mut skip = skip_of(window_end); window_end += skip;
            skip = skip_of(window_end); window_end += skip;
            if skip != 0 {
                skip = skip_of(window_end); window_end += skip;
                skip = skip_of(window_end); window_end += skip;
                skip = skip_of(window_end); window_end += skip;
                if skip != 0 {
                    skip = skip_of(window_end); window_end += skip;
                    skip = skip_of(window_end); window_end += skip;
                    skip = skip_of(window_end); window_end += skip;
                    if skip != 0 {
                        skip = skip_of(window_end); window_end += skip;
                        skip = skip_of(window_end); window_end += skip;

                        // If ten iterations did not make at least 16 words
                        // worth of progress, we just fall back on memchr.
                        if window_end - window_end_snapshot >
                            16 * mem::size_of::<usize>() {

                            // Returning a window_end >= backstop will immediately
                            // break us out of the inner loop in `find`.
                            if window_end >= backstop {
                                return Some(window_end);
                            }

                            continue; // we made enough progress
                        } else {
                            // In case we are already there, and so that
                            // we will catch the guard char.
                            window_end = window_end
                                .checked_sub(1 + self.guard_reverse_idx)
                                .unwrap_or(0);

                            match memchr(self.guard, &haystack[window_end..]) {
                                None => return None,
                                Some(g_idx) => {
                                    return Some(window_end + g_idx + self.guard_reverse_idx);
                                }
                            }
                        }
                    }
                }
            }

            return Some(window_end);
        }
    }

    /// Compute the ufast skip table.
    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
        let mut tab = vec![pattern.len(); 256];

        // For every char in the pattern, we write a skip
        // that will line us up with the rightmost occurrence.
        //
        // N.B. the sentinel (0) is written by the last
        // loop iteration.
        for (i, c) in pattern.iter().enumerate() {
            tab[*c as usize] = (pattern.len() - 1) - i;
        }

        tab
    }

    /// Select the guard character based off of the precomputed
    /// frequency table.
    ///
    /// Returns the rarest byte in the pattern and its reverse index
    /// (distance from the end of the pattern).
    fn select_guard(pattern: &[u8]) -> (u8, usize) {
        let mut rarest = pattern[0];
        let mut rarest_rev_idx = pattern.len() - 1;
        for (i, c) in pattern.iter().enumerate() {
            if freq_rank(*c) < freq_rank(rarest) {
                rarest = *c;
                rarest_rev_idx = (pattern.len() - 1) - i;
            }
        }

        (rarest, rarest_rev_idx)
    }

    /// If there is another occurrence of the skip
    /// char, shift to it, otherwise just shift to
    /// the next window.
    fn compile_md2_shift(pattern: &[u8]) -> usize {
        let shiftc = *pattern.last().unwrap();

        // For a pattern of length 1 we will never apply the
        // shift rule, so we use a poison value on the principle
        // that failing fast is a good thing.
        if pattern.len() == 1 {
            return 0xDEADBEAF;
        }

        // Scan right-to-left for the next occurrence of the skip char,
        // skipping the final position itself.
        let mut i = pattern.len() - 2;
        while i > 0 {
            if pattern[i] == shiftc {
                return (pattern.len() - 1) - i;
            }
            i -= 1;
        }

        // The skip char never re-occurs in the pattern, so
        // we can just shift the whole window length.
        pattern.len() - 1
    }

    /// Return the approximate heap usage of this searcher in bytes.
    fn approximate_size(&self) -> usize {
        (self.pattern.len() * mem::size_of::<u8>())
            + (256 * mem::size_of::<usize>()) // skip table
    }
}
895
896fn freq_rank(b: u8) -> usize {
897 BYTE_FREQUENCIES[b as usize] as usize
898}
899
#[cfg(test)]
mod tests {
    use super::{BoyerMooreSearch, FreqyPacked};

    //
    // Unit Tests
    //

    // The "hello, world" of string searching
    #[test]
    fn bm_find_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(14, searcher.find(haystack).unwrap());
    }

    #[test]
    fn bm_find_no_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }

    //
    // Regression Tests
    //

    #[test]
    fn bm_skip_reset_bug() {
        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
        let needle = vec![0, 1, 1, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(4, offset);
    }

    #[test]
    fn bm_backstop_underflow_bug() {
        let haystack = vec![0, 0];
        let needle = vec![0, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(0, offset);
    }

    #[test]
    fn bm_naive_off_by_one_bug() {
        let haystack = vec![91];
        let needle = vec![91];

        let naive_offset = naive_find(&needle, &haystack).unwrap();
        assert_eq!(0, naive_offset);
    }

    #[test]
    fn bm_memchr_fallback_indexing_bug() {
        // The needle is appended to the haystack so the match must be found
        // at the very end, past where the memchr fallback kicks in.
        let mut haystack = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let needle = vec![1, 1, 1, 1, 32, 32, 87];
        let needle_start = haystack.len();
        haystack.extend(needle.clone());

        let searcher = BoyerMooreSearch::new(needle);
        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
    }

    #[test]
    fn bm_backstop_boundary() {
        let haystack = b"\
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
e_data.clone_created(entity_id, entity_to_add.entity_id);
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
".to_vec();
        let needle = b"clone_created".to_vec();

        let searcher = BoyerMooreSearch::new(needle);
        let result = searcher.find(&haystack);
        assert_eq!(Some(43), result);
    }

    #[test]
    fn bm_win_gnu_indexing_bug() {
        let haystack_raw = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let needle = vec![1, 1, 1, 1, 1, 1, 1];
        let haystack = haystack_raw.as_slice();

        BoyerMooreSearch::new(needle.clone()).find(haystack);
    }

    //
    // QuickCheck Properties
    //

    use quickcheck::TestResult;

    // Reference implementation used as an oracle for the properties below.
    // Precondition: needle fits inside haystack.
    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
        assert!(needle.len() <= haystack.len());

        for i in 0..(haystack.len() - (needle.len() - 1)) {
            if haystack[i] == needle[0]
                && &haystack[i..(i+needle.len())] == needle {
                return Some(i)
            }
        }

        None
    }

    quickcheck! {
        fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let searcher = BoyerMooreSearch::new(needle.clone());
            TestResult::from_bool(
                searcher.find(haystack) == naive_find(&needle, haystack))
        }

        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let bm_searcher = BoyerMooreSearch::new(needle.clone());
            let freqy_memchr = FreqyPacked::new(needle);
            TestResult::from_bool(
                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
        }

        fn qc_bm_finds_trailing_needle(
            haystack_pre: Vec<u8>,
            needle: Vec<u8>
        ) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = haystack_pre.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());

            if haystack.len() >= needle.len() &&
                searcher.find(haystack.as_slice()).is_some() {
                return TestResult::discard();
            }

            haystack.extend(needle.clone());

            // What if the the tail of the haystack can start the
            // needle?
            let start = haystack_pre.len()
                .checked_sub(needle.len())
                .unwrap_or(0);
            for i in 0..(needle.len() - 1) {
                if searcher.find(&haystack[(i + start)..]).is_some() {
                    return TestResult::discard();
                }
            }

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == haystack_pre.len())
                    .unwrap_or(false))
        }

        // qc_equals_* is only testing the negative case as @burntsushi
        // pointed out in https://github.com/rust-lang/regex/issues/446.
        // This quickcheck prop represents an effort to force testing of
        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
        // already check some of the positive cases, but they don't cover
        // cases where the needle is in the middle of haystack. This prop
        // fills that hole.
        fn qc_bm_finds_subslice(
            haystack: Vec<u8>,
            needle_start: usize,
            needle_length: usize
        ) -> TestResult {
            if haystack.len() == 0 {
                return TestResult::discard();
            }

            let needle_start = needle_start % haystack.len();
            let needle_length = needle_length % (haystack.len() - needle_start);

            if needle_length == 0 {
                return TestResult::discard();
            }

            let needle = &haystack[needle_start..(needle_start + needle_length)];

            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());

            let start = naive_find(&needle, &haystack);
            match start {
                None => TestResult::from_bool(false),
                Some(nf_start) =>
                    TestResult::from_bool(
                        nf_start <= needle_start
                        && bm_searcher.find(&haystack) == start
                    )
            }
        }

        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = needle.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());
            haystack.extend(needle);

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == 0)
                    .unwrap_or(false))
        }
    }
}