]> git.proxmox.com Git - rustc.git/blob - vendor/regex-1.4.3/src/literal/imp.rs
New upstream version 1.54.0+dfsg1
[rustc.git] / vendor / regex-1.4.3 / src / literal / imp.rs
1 use std::cmp;
2 use std::mem;
3
4 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
5 use memchr::{memchr, memchr2, memchr3};
6 use syntax::hir::literal::{Literal, Literals};
7
8 use freqs::BYTE_FREQUENCIES;
9
/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    /// True if every literal in the set matches the entire regex, as
    /// reported by `Literals::all_complete` at construction time.
    complete: bool,
    /// A single-substring matcher for the longest common prefix of all
    /// literals in the set.
    lcp: FreqyPacked,
    /// A single-substring matcher for the longest common suffix of all
    /// literals in the set.
    lcs: FreqyPacked,
    /// The search strategy selected for this literal set.
    matcher: Matcher,
}
22
/// The strategy used to search for a set of literals in a haystack.
#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of single byte literals. (Searched with memchr/memchr2/memchr3
    /// when the set has 1-3 members; a sparse table scan otherwise.)
    Bytes(SingleByteSet),
    /// A single substring, find using memchr and frequency analysis.
    FreqyPacked(FreqyPacked),
    /// A single substring, find using Boyer-Moore.
    BoyerMoore(BoyerMooreSearch),
    /// An Aho-Corasick automaton.
    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
    /// A packed multiple substring searcher, using SIMD.
    ///
    /// Note that Aho-Corasick will actually use this packed searcher
    /// internally automatically, however, there is some overhead associated
    /// with going through the Aho-Corasick machinery. So using the packed
    /// searcher directly results in some gains.
    Packed { s: packed::Searcher, lits: Vec<Literal> },
}
43
impl LiteralSearcher {
    /// Returns a matcher that never matches and never advances the input.
    pub fn empty() -> Self {
        Self::new(Literals::empty(), Matcher::Empty)
    }

    /// Returns a matcher for literal prefixes from the given set.
    pub fn prefixes(lits: Literals) -> Self {
        let matcher = Matcher::prefixes(&lits);
        Self::new(lits, matcher)
    }

    /// Returns a matcher for literal suffixes from the given set.
    pub fn suffixes(lits: Literals) -> Self {
        let matcher = Matcher::suffixes(&lits);
        Self::new(lits, matcher)
    }

    /// Builds the searcher from a literal set and an already-selected
    /// strategy, precomputing the longest-common-prefix/suffix matchers.
    fn new(lits: Literals, matcher: Matcher) -> Self {
        let complete = lits.all_complete();
        LiteralSearcher {
            complete: complete,
            lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
            lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
            matcher: matcher,
        }
    }

    /// Returns true if all matches comprise the entire regular expression.
    ///
    /// This does not necessarily mean that a literal match implies a match
    /// of the regular expression. For example, the regular expression `^a`
    /// is comprised of a single complete literal `a`, but the regular
    /// expression demands that it only match at the beginning of a string.
    pub fn complete(&self) -> bool {
        self.complete && !self.is_empty()
    }

    /// Find the position of a literal in `haystack` if it exists.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        use self::Matcher::*;
        match self.matcher {
            // The empty matcher reports a zero-width match at position 0.
            Empty => Some((0, 0)),
            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
            FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            AC { ref ac, .. } => {
                ac.find(haystack).map(|m| (m.start(), m.end()))
            }
            Packed { ref s, .. } => {
                s.find(haystack).map(|m| (m.start(), m.end()))
            }
        }
    }

    /// Like find, except matches must start at index `0`.
    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        // Linear scan over the literal set; returns the first literal that
        // is a prefix of `haystack`.
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[0..lit.len()] {
                return Some((0, lit.len()));
            }
        }
        None
    }

    /// Like find, except matches must end at index `haystack.len()`.
    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[haystack.len() - lit.len()..] {
                return Some((haystack.len() - lit.len(), haystack.len()));
            }
        }
        None
    }

    /// Returns an iterator over all literals to be matched.
    pub fn iter(&self) -> LiteralIter {
        match self.matcher {
            Matcher::Empty => LiteralIter::Empty,
            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
            Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
            Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
            Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
            Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
        }
    }

    /// Returns a matcher for the longest common prefix of this matcher.
    pub fn lcp(&self) -> &FreqyPacked {
        &self.lcp
    }

    /// Returns a matcher for the longest common suffix of this matcher.
    pub fn lcs(&self) -> &FreqyPacked {
        &self.lcs
    }

    /// Returns true iff this prefix is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of prefixes in this machine.
    pub fn len(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.dense.len(),
            FreqyPacked(_) => 1,
            BoyerMoore(_) => 1,
            AC { ref ac, .. } => ac.pattern_count(),
            Packed { ref lits, .. } => lits.len(),
        }
    }

    /// Return the approximate heap usage of literals in bytes.
    pub fn approximate_size(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.approximate_size(),
            FreqyPacked(ref single) => single.approximate_size(),
            BoyerMoore(ref single) => single.approximate_size(),
            AC { ref ac, .. } => ac.heap_bytes(),
            Packed { ref s, .. } => s.heap_bytes(),
        }
    }
}
179
impl Matcher {
    /// Selects a strategy for matching the prefix literals in `lits`.
    fn prefixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    /// Selects a strategy for matching the suffix literals in `lits`.
    fn suffixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    /// Chooses the cheapest applicable search strategy for the literal set,
    /// given the set of leading/trailing bytes in `sset`.
    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members all are rare in the
            // haystack, then it'd be worth using it. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            // Every literal is exactly one byte, so a byte-set search is a
            // complete literal search.
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            let lit = lits.literals()[0].to_vec();
            if BoyerMooreSearch::should_use(lit.as_slice()) {
                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
            } else {
                return Matcher::FreqyPacked(FreqyPacked::new(lit));
            }
        }

        let pats = lits.literals().to_owned();
        // NOTE(review): per the flag name, when the literals share at most
        // one distinct (ASCII) leading/trailing byte, plain Aho-Corasick is
        // presumed fast enough that the packed searcher is skipped.
        let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
        if lits.literals().len() <= 100 && !is_aho_corasick_fast {
            let mut builder = packed::Config::new()
                .match_kind(packed::MatchKind::LeftmostFirst)
                .builder();
            // The packed builder may decline (returns None); fall through to
            // Aho-Corasick in that case.
            if let Some(s) = builder.extend(&pats).build() {
                return Matcher::Packed { s, lits: pats };
            }
        }
        let ac = AhoCorasickBuilder::new()
            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
            .dfa(true)
            .build_with_size::<u32, _, _>(&pats)
            .unwrap();
        Matcher::AC { ac, lits: pats }
    }
}
234
/// An iterator over the literals of a `LiteralSearcher`, yielding each
/// literal as a byte slice.
#[derive(Debug)]
pub enum LiteralIter<'a> {
    /// No literals to yield.
    Empty,
    /// Remaining single-byte literals; each is yielded as a length-1 slice.
    Bytes(&'a [u8]),
    /// A single literal, yielded once and then exhausted.
    Single(&'a [u8]),
    /// Remaining literals backing an Aho-Corasick matcher.
    AC(&'a [Literal]),
    /// Remaining literals backing a packed matcher.
    Packed(&'a [Literal]),
}
243
244 impl<'a> Iterator for LiteralIter<'a> {
245 type Item = &'a [u8];
246
247 fn next(&mut self) -> Option<Self::Item> {
248 match *self {
249 LiteralIter::Empty => None,
250 LiteralIter::Bytes(ref mut many) => {
251 if many.is_empty() {
252 None
253 } else {
254 let next = &many[0..1];
255 *many = &many[1..];
256 Some(next)
257 }
258 }
259 LiteralIter::Single(ref mut one) => {
260 if one.is_empty() {
261 None
262 } else {
263 let next = &one[..];
264 *one = &[];
265 Some(next)
266 }
267 }
268 LiteralIter::AC(ref mut lits) => {
269 if lits.is_empty() {
270 None
271 } else {
272 let next = &lits[0];
273 *lits = &lits[1..];
274 Some(&**next)
275 }
276 }
277 LiteralIter::Packed(ref mut lits) => {
278 if lits.is_empty() {
279 None
280 } else {
281 let next = &lits[0];
282 *lits = &lits[1..];
283 Some(&**next)
284 }
285 }
286 }
287 }
288 }
289
/// A set of single bytes with both a sparse (256-slot membership table) and
/// a dense (list of members) representation.
#[derive(Clone, Debug)]
struct SingleByteSet {
    /// `sparse[b]` is true iff byte `b` is in the set.
    sparse: Vec<bool>,
    /// The members of the set, in insertion order.
    dense: Vec<u8>,
    /// True iff every literal contributing to this set had length exactly 1,
    /// i.e., finding a member byte constitutes a complete literal match.
    complete: bool,
    /// True iff every byte in the set is ASCII (<= 0x7F).
    all_ascii: bool,
}
297
298 impl SingleByteSet {
299 fn new() -> SingleByteSet {
300 SingleByteSet {
301 sparse: vec![false; 256],
302 dense: vec![],
303 complete: true,
304 all_ascii: true,
305 }
306 }
307
308 fn prefixes(lits: &Literals) -> SingleByteSet {
309 let mut sset = SingleByteSet::new();
310 for lit in lits.literals() {
311 sset.complete = sset.complete && lit.len() == 1;
312 if let Some(&b) = lit.get(0) {
313 if !sset.sparse[b as usize] {
314 if b > 0x7F {
315 sset.all_ascii = false;
316 }
317 sset.dense.push(b);
318 sset.sparse[b as usize] = true;
319 }
320 }
321 }
322 sset
323 }
324
325 fn suffixes(lits: &Literals) -> SingleByteSet {
326 let mut sset = SingleByteSet::new();
327 for lit in lits.literals() {
328 sset.complete = sset.complete && lit.len() == 1;
329 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
330 if !sset.sparse[b as usize] {
331 if b > 0x7F {
332 sset.all_ascii = false;
333 }
334 sset.dense.push(b);
335 sset.sparse[b as usize] = true;
336 }
337 }
338 }
339 sset
340 }
341
342 /// Faster find that special cases certain sizes to use memchr.
343 #[cfg_attr(feature = "perf-inline", inline(always))]
344 fn find(&self, text: &[u8]) -> Option<usize> {
345 match self.dense.len() {
346 0 => None,
347 1 => memchr(self.dense[0], text),
348 2 => memchr2(self.dense[0], self.dense[1], text),
349 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
350 _ => self._find(text),
351 }
352 }
353
354 /// Generic find that works on any sized set.
355 fn _find(&self, haystack: &[u8]) -> Option<usize> {
356 for (i, &b) in haystack.iter().enumerate() {
357 if self.sparse[b as usize] {
358 return Some(i);
359 }
360 }
361 None
362 }
363
364 fn approximate_size(&self) -> usize {
365 (self.dense.len() * mem::size_of::<u8>())
366 + (self.sparse.len() * mem::size_of::<bool>())
367 }
368 }
369
/// Provides an implementation of fast substring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}
406
impl FreqyPacked {
    /// Builds a searcher for `pat`, precomputing its two rarest bytes and
    /// the offsets of their last occurrences in the pattern.
    fn new(pat: Vec<u8>) -> FreqyPacked {
        if pat.is_empty() {
            return FreqyPacked::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        FreqyPacked {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    /// Returns a searcher for the empty pattern; `find` always reports
    /// `None` for it.
    fn empty() -> FreqyPacked {
        FreqyPacked {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    /// Returns the start offset of the first occurrence of the pattern in
    /// `haystack`, if any.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        // Start at the rare byte's offset in the pattern: any earlier
        // occurrence of the rare byte could not line up with a full window.
        let mut i = self.rare1i;
        while i < haystack.len() {
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(i) => i,
            };
            // Align a candidate window so the found rare byte sits at its
            // offset within the pattern.
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            // Check the second-rarest byte as a cheap guard before doing
            // the full pattern comparison.
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    /// Returns true iff this pattern is a suffix of `text`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        text[text.len() - self.len()..] == *self.pat
    }

    /// Returns the length of the pattern in bytes.
    pub fn len(&self) -> usize {
        self.pat.len()
    }

    /// Returns the length of the pattern in Unicode characters (lossy).
    pub fn char_len(&self) -> usize {
        self.char_len
    }

    /// Returns the approximate heap usage of the pattern in bytes.
    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}
502
/// Counts the Unicode scalar values obtained by decoding `bytes` as UTF-8
/// lossily: invalid sequences are replaced with U+FFFD and counted as
/// characters like any other.
fn char_len_lossy(bytes: &[u8]) -> usize {
    let decoded = String::from_utf8_lossy(bytes);
    decoded.as_ref().chars().fold(0, |n, _| n + 1)
}
506
/// An implementation of Tuned Boyer-Moore as laid out by
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
/// O(n) in the size of the input.
///
/// Fast string searching algorithms come in many variations,
/// but they can generally be described in terms of three main
/// components.
///
/// The skip loop is where the string searcher wants to spend
/// as much time as possible. Exactly which character in the
/// pattern the skip loop examines varies from algorithm to
/// algorithm, but in the simplest case this loop repeatedly
/// looks at the last character in the pattern and jumps
/// forward in the input if it is not in the pattern.
/// Robert Boyer and J Moore called this the "fast" loop in
/// their original paper.
///
/// The match loop is responsible for actually examining the
/// whole potentially matching substring. In order to fail
/// faster, the match loop sometimes has a guard test attached.
/// The guard test uses frequency analysis of the different
/// characters in the pattern to choose the least frequently
/// occurring character and use it to find match failures
/// as quickly as possible.
///
/// The shift rule governs how the algorithm will shuffle its
/// test window in the event of a failure during the match loop.
/// Certain shift rules allow the worst-case run time of the
/// algorithm to be shown to be O(n) in the size of the input
/// rather than O(nm) in the size of the input and the size
/// of the pattern (as naive Boyer-Moore is).
///
/// "Fast String Searching", in addition to presenting a tuned
/// algorithm, provides a comprehensive taxonomy of the many
/// different flavors of string searchers. Under that taxonomy
/// TBM, the algorithm implemented here, uses an unrolled fast
/// skip loop with memchr fallback, a forward match loop with guard,
/// and the mini Sunday's delta shift rule. To unpack that you'll have to
/// read the paper.
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift.
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
569
impl BoyerMooreSearch {
    /// Create a new string searcher, performing whatever
    /// compilation steps are required.
    fn new(pattern: Vec<u8>) -> Self {
        debug_assert!(!pattern.is_empty());

        let (g, gi) = Self::select_guard(pattern.as_slice());
        let skip_table = Self::compile_skip_table(pattern.as_slice());
        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
        BoyerMooreSearch {
            pattern: pattern,
            skip_table: skip_table,
            guard: g,
            guard_reverse_idx: gi,
            md2_shift: md2_shift,
        }
    }

    /// Find the pattern in `haystack`, returning the offset
    /// of the start of the first occurrence of the pattern
    /// in `haystack`.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        if haystack.len() < self.pattern.len() {
            return None;
        }

        // `window_end` indexes the last byte of the current candidate
        // window; the window is `pattern.len()` bytes wide.
        let mut window_end = self.pattern.len() - 1;

        // Inspired by the grep source. It is a way
        // to do correct loop unrolling without having to place
        // a crashpad of terminating characters at the end in
        // the way described in the Fast String Searching paper.
        const NUM_UNROLL: usize = 10;
        // 1 for the initial position, and 1 for the md2 shift
        let short_circut = (NUM_UNROLL + 2) * self.pattern.len();

        if haystack.len() > short_circut {
            // just 1 for the md2 shift
            let backstop =
                haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
            loop {
                window_end =
                    match self.skip_loop(haystack, window_end, backstop) {
                        Some(i) => i,
                        None => return None,
                    };
                if window_end >= backstop {
                    break;
                }

                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    let skip = self.skip_table[haystack[window_end] as usize];
                    window_end +=
                        if skip == 0 { self.md2_shift } else { skip };
                    continue;
                }
            }
        }

        // now process the input after the backstop
        while window_end < haystack.len() {
            let mut skip = self.skip_table[haystack[window_end] as usize];
            if skip == 0 {
                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    skip = self.md2_shift;
                }
            }
            window_end += skip;
        }

        None
    }

    /// Returns the length of the pattern in bytes.
    fn len(&self) -> usize {
        return self.pattern.len();
    }

    /// The key heuristic behind which the BoyerMooreSearch lives.
    ///
    /// See `rust-lang/regex/issues/408`.
    ///
    /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
    /// platform-specific memchr routine with a bit of frequency
    /// analysis sprinkled on top actually wins most of the time.
    /// However, there are a few cases where Tuned Boyer-Moore still
    /// wins.
    ///
    /// If the haystack is random, frequency analysis doesn't help us,
    /// so Boyer-Moore will win for sufficiently large needles.
    /// Unfortunately, there is no obvious way to determine this
    /// ahead of time.
    ///
    /// If the pattern itself consists of very common characters,
    /// frequency analysis won't get us anywhere. The most extreme
    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
    /// this case is wholly determined by the pattern, so we can actually
    /// implement the heuristic.
    ///
    /// A third case is if the pattern is sufficiently long. The idea
    /// here is that once the pattern gets long enough the Tuned
    /// Boyer-Moore skip loop will start making strides long enough
    /// to beat the asm deep magic that is memchr.
    fn should_use(pattern: &[u8]) -> bool {
        // The minimum pattern length required to use TBM.
        const MIN_LEN: usize = 9;
        // The minimum frequency rank (lower is rarer) that every byte in the
        // pattern must have in order to use TBM. That is, if the pattern
        // contains _any_ byte with a lower rank, then TBM won't be used.
        const MIN_CUTOFF: usize = 150;
        // The maximum frequency rank for any byte.
        const MAX_CUTOFF: usize = 255;
        // The scaling factor used to determine the actual cutoff frequency
        // to use (keeping in mind that the minimum frequency rank is bounded
        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
        // likely to be used as the pattern grows longer. That is, longer
        // patterns permit somewhat less frequent bytes than shorter patterns,
        // under the assumption that TBM gets better as the pattern gets
        // longer.
        const LEN_CUTOFF_PROPORTION: usize = 4;

        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
        let cutoff = cmp::max(
            MIN_CUTOFF,
            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
        );
        // The pattern must be long enough to be worthwhile. e.g., memchr will
        // be faster on `e` because it is short even though e is quite common.
        pattern.len() > MIN_LEN
            // all the bytes must be more common than the cutoff.
            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
    }

    /// Check to see if there is a match at the given position
    #[inline]
    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
        // guard test: cheap single-byte comparison before the full loop.
        if haystack[window_end - self.guard_reverse_idx] != self.guard {
            return false;
        }

        // match loop
        let window_start = window_end - (self.pattern.len() - 1);
        for i in 0..self.pattern.len() {
            if self.pattern[i] != haystack[window_start + i] {
                return false;
            }
        }

        true
    }

    /// Skip forward according to the shift table.
    ///
    /// Returns the offset of the next occurrence
    /// of the last char in the pattern, or `None`
    /// if it never reappears. If `skip_loop` hits the backstop
    /// it will leave early.
    #[inline]
    fn skip_loop(
        &self,
        haystack: &[u8],
        mut window_end: usize,
        backstop: usize,
    ) -> Option<usize> {
        let window_end_snapshot = window_end;
        let skip_of = |we: usize| -> usize {
            // Unsafe might make this faster, but the benchmarks
            // were hard to interpret.
            self.skip_table[haystack[we] as usize]
        };

        loop {
            let mut skip = skip_of(window_end);
            window_end += skip;
            skip = skip_of(window_end);
            window_end += skip;
            if skip != 0 {
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                if skip != 0 {
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    if skip != 0 {
                        skip = skip_of(window_end);
                        window_end += skip;
                        skip = skip_of(window_end);
                        window_end += skip;

                        // If ten iterations did not make at least 16 words
                        // worth of progress, we just fall back on memchr.
                        if window_end - window_end_snapshot
                            > 16 * mem::size_of::<usize>()
                        {
                            // Returning a window_end >= backstop will
                            // immediately break us out of the inner loop in
                            // `find`.
                            if window_end >= backstop {
                                return Some(window_end);
                            }

                            continue; // we made enough progress
                        } else {
                            // In case we are already there, and so that
                            // we will catch the guard char.
                            window_end = window_end
                                .checked_sub(1 + self.guard_reverse_idx)
                                .unwrap_or(0);

                            match memchr(self.guard, &haystack[window_end..]) {
                                None => return None,
                                Some(g_idx) => {
                                    return Some(
                                        window_end
                                            + g_idx
                                            + self.guard_reverse_idx,
                                    );
                                }
                            }
                        }
                    }
                }
            }

            return Some(window_end);
        }
    }

    /// Compute the ufast skip table. ("ufast" is the name the Hume &
    /// Sunday paper gives to the unrolled fast skip loop variant.)
    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
        let mut tab = vec![pattern.len(); 256];

        // For every char in the pattern, we write a skip
        // that will line us up with the rightmost occurrence.
        //
        // N.B. the sentinel (0) is written by the last
        // loop iteration.
        for (i, c) in pattern.iter().enumerate() {
            tab[*c as usize] = (pattern.len() - 1) - i;
        }

        tab
    }

    /// Select the guard character based off of the precomputed
    /// frequency table.
    fn select_guard(pattern: &[u8]) -> (u8, usize) {
        let mut rarest = pattern[0];
        let mut rarest_rev_idx = pattern.len() - 1;
        for (i, c) in pattern.iter().enumerate() {
            if freq_rank(*c) < freq_rank(rarest) {
                rarest = *c;
                rarest_rev_idx = (pattern.len() - 1) - i;
            }
        }

        (rarest, rarest_rev_idx)
    }

    /// If there is another occurrence of the skip
    /// char, shift to it, otherwise just shift to
    /// the next window.
    fn compile_md2_shift(pattern: &[u8]) -> usize {
        let shiftc = *pattern.last().unwrap();

        // For a pattern of length 1 we will never apply the
        // shift rule, so we use a poison value on the principle
        // that failing fast is a good thing.
        if pattern.len() == 1 {
            return 0xDEADBEAF;
        }

        let mut i = pattern.len() - 2;
        while i > 0 {
            if pattern[i] == shiftc {
                return (pattern.len() - 1) - i;
            }
            i -= 1;
        }

        // The skip char never re-occurs in the pattern, so
        // we can just shift the whole window length.
        pattern.len() - 1
    }

    /// Returns the approximate heap usage of this searcher in bytes.
    fn approximate_size(&self) -> usize {
        (self.pattern.len() * mem::size_of::<u8>())
            + (256 * mem::size_of::<usize>()) // skip table
    }
}
872
873 fn freq_rank(b: u8) -> usize {
874 BYTE_FREQUENCIES[b as usize] as usize
875 }
876
#[cfg(test)]
mod tests {
    use super::{BoyerMooreSearch, FreqyPacked};

    //
    // Unit Tests
    //

    // The "hello, world" of string searching
    #[test]
    fn bm_find_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(14, searcher.find(haystack).unwrap());
    }

    #[test]
    fn bm_find_no_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }

    //
    // Regression Tests
    //

    #[test]
    fn bm_skip_reset_bug() {
        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
        let needle = vec![0, 1, 1, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(4, offset);
    }

    #[test]
    fn bm_backstop_underflow_bug() {
        let haystack = vec![0, 0];
        let needle = vec![0, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(0, offset);
    }

    #[test]
    fn bm_naive_off_by_one_bug() {
        let haystack = vec![91];
        let needle = vec![91];

        let naive_offset = naive_find(&needle, &haystack).unwrap();
        assert_eq!(0, naive_offset);
    }

    #[test]
    fn bm_memchr_fallback_indexing_bug() {
        let mut haystack = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 32, 32, 87];
        let needle_start = haystack.len();
        haystack.extend(needle.clone());

        let searcher = BoyerMooreSearch::new(needle);
        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
    }

    #[test]
    fn bm_backstop_boundary() {
        let haystack = b"\
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
e_data.clone_created(entity_id, entity_to_add.entity_id);
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
"
        .to_vec();
        let needle = b"clone_created".to_vec();

        let searcher = BoyerMooreSearch::new(needle);
        let result = searcher.find(&haystack);
        assert_eq!(Some(43), result);
    }

    #[test]
    fn bm_win_gnu_indexing_bug() {
        let haystack_raw = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 1, 1, 1];
        let haystack = haystack_raw.as_slice();

        BoyerMooreSearch::new(needle.clone()).find(haystack);
    }

    //
    // QuickCheck Properties
    //

    use quickcheck::TestResult;

    /// A straightforward O(n*m) reference implementation used to check
    /// the optimized searchers. Requires `needle.len() <= haystack.len()`
    /// and a non-empty needle.
    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
        assert!(needle.len() <= haystack.len());

        for i in 0..(haystack.len() - (needle.len() - 1)) {
            if haystack[i] == needle[0]
                && &haystack[i..(i + needle.len())] == needle
            {
                return Some(i);
            }
        }

        None
    }

    quickcheck! {
        fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let searcher = BoyerMooreSearch::new(needle.clone());
            TestResult::from_bool(
                searcher.find(haystack) == naive_find(&needle, haystack))
        }

        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let bm_searcher = BoyerMooreSearch::new(needle.clone());
            let freqy_memchr = FreqyPacked::new(needle);
            TestResult::from_bool(
                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
        }

        fn qc_bm_finds_trailing_needle(
            haystack_pre: Vec<u8>,
            needle: Vec<u8>
        ) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = haystack_pre.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());

            if haystack.len() >= needle.len() &&
                searcher.find(haystack.as_slice()).is_some() {
                return TestResult::discard();
            }

            haystack.extend(needle.clone());

            // What if the tail of the haystack can start the
            // needle?
            let start = haystack_pre.len()
                .checked_sub(needle.len())
                .unwrap_or(0);
            for i in 0..(needle.len() - 1) {
                if searcher.find(&haystack[(i + start)..]).is_some() {
                    return TestResult::discard();
                }
            }

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == haystack_pre.len())
                    .unwrap_or(false))
        }

        // qc_equals_* is only testing the negative case as @burntsushi
        // pointed out in https://github.com/rust-lang/regex/issues/446.
        // This quickcheck prop represents an effort to force testing of
        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
        // already check some of the positive cases, but they don't cover
        // cases where the needle is in the middle of haystack. This prop
        // fills that hole.
        fn qc_bm_finds_subslice(
            haystack: Vec<u8>,
            needle_start: usize,
            needle_length: usize
        ) -> TestResult {
            if haystack.len() == 0 {
                return TestResult::discard();
            }

            let needle_start = needle_start % haystack.len();
            let needle_length = needle_length % (haystack.len() - needle_start);

            if needle_length == 0 {
                return TestResult::discard();
            }

            let needle = &haystack[needle_start..(needle_start + needle_length)];

            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());

            let start = naive_find(&needle, &haystack);
            match start {
                None => TestResult::from_bool(false),
                Some(nf_start) =>
                    TestResult::from_bool(
                        nf_start <= needle_start
                            && bm_searcher.find(&haystack) == start
                    )
            }
        }

        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = needle.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());
            haystack.extend(needle);

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == 0)
                    .unwrap_or(false))
        }
    }
}