vendor/regex-syntax-0.5.6/src/hir/literal/mod.rs

   1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 /*!
  12 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
  13 */
  14
  15 use std::cmp;
  16 use std::fmt;
  17 use std::iter;
  18 use std::mem;
  19 use std::ops;
  20
  21 use hir::{self, Hir, HirKind};
  22 use unicode;
  23
  24 /// A set of literal byte strings extracted from a regular expression.
  25 ///
  26 /// Every member of the set is a `Literal`, which is represented by a
  27 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
  28 /// said to be either *complete* or *cut*. A complete literal means that
  29 /// it extends until the beginning (or end) of the regular expression. In
  30 /// some circumstances, this can be used to indicate a match in the regular
  31 /// expression.
  32 ///
  33 /// A key aspect of literal extraction is knowing when to stop. It is not
  34 /// feasible to blindly extract all literals from a regular expression, even if
  35 /// there are finitely many. For example, the regular expression `[0-9]{10}`
  36 /// has `10^10` distinct literals. For this reason, literal extraction is
  37 /// bounded to some low number by default using heuristics, but the limits can
  38 /// be tweaked.
  39 ///
  40 /// **WARNING**: Literal extraction uses stack space proportional to the size
  41 /// of the `Hir` expression. At some point, this drawback will be eliminated.
  42 /// To protect yourself, set a reasonable
  43 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
  44 /// This is done for you by default.
  45 #[derive(Clone, Eq, PartialEq)]
  46 pub struct Literals {
  47     lits: Vec<Literal>,
  48     limit_size: usize,
  49     limit_class: usize,
  50 }
  51
  52 /// A single member of a set of literals extracted from a regular expression.
  53 ///
  54 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
  55 /// and `Vec` operations are available.
  56 #[derive(Clone, Eq, Ord)]
  57 pub struct Literal {
  58     v: Vec<u8>,
  59     cut: bool,
  60 }
  61
  62 impl Literals {
  63     /// Returns a new empty set of literals using default limits.
  64     pub fn empty() -> Literals {
  65         Literals {
  66             lits: vec![],
  67             limit_size: 250,
  68             limit_class: 10,
  69         }
  70     }
  71
  72     /// Returns a set of literal prefixes extracted from the given `Hir`.
  73     pub fn prefixes(expr: &Hir) -> Literals {
  74         let mut lits = Literals::empty();
  75         lits.union_prefixes(expr);
  76         lits
  77     }
  78
  79     /// Returns a set of literal suffixes extracted from the given `Hir`.
  80     pub fn suffixes(expr: &Hir) -> Literals {
  81         let mut lits = Literals::empty();
  82         lits.union_suffixes(expr);
  83         lits
  84     }
  85
  86     /// Get the approximate size limit (in bytes) of this set.
  87     pub fn limit_size(&self) -> usize {
  88         self.limit_size
  89     }
  90
  91     /// Set the approximate size limit (in bytes) of this set.
  92     ///
  93     /// If extracting a literal would put the set over this limit, then
  94     /// extraction stops.
  95     ///
  96     /// The new limits will only apply to additions to this set. Existing
  97     /// members remain unchanged, even if the set exceeds the new limit.
  98     pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
  99         self.limit_size = size;
 100         self
 101     }
 102
 103     /// Get the character class size limit for this set.
 104     pub fn limit_class(&self) -> usize {
 105         self.limit_class
 106     }
 107
 108     /// Limits the size of character(or byte) classes considered.
 109     ///
 110     /// A value of `0` prevents all character classes from being considered.
 111     ///
 112     /// This limit also applies to case insensitive literals, since each
 113     /// character in the case insensitive literal is converted to a class, and
 114     /// then case folded.
 115     ///
 116     /// The new limits will only apply to additions to this set. Existing
 117     /// members remain unchanged, even if the set exceeds the new limit.
 118     pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
 119         self.limit_class = size;
 120         self
 121     }
 122
 123     /// Returns the set of literals as a slice. Its order is unspecified.
 124     pub fn literals(&self) -> &[Literal] {
 125         &self.lits
 126     }
 127
 128     /// Returns the length of the smallest literal.
 129     ///
 130     /// Returns None is there are no literals in the set.
 131     pub fn min_len(&self) -> Option<usize> {
 132         let mut min = None;
 133         for lit in &self.lits {
 134             match min {
 135                 None => min = Some(lit.len()),
 136                 Some(m) if lit.len() < m => min = Some(lit.len()),
 137                 _ => {}
 138             }
 139         }
 140         min
 141     }
 142
 143     /// Returns true if all members in this set are complete.
 144     pub fn all_complete(&self) -> bool {
 145         !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
 146     }
 147
 148     /// Returns true if any member in this set is complete.
 149     pub fn any_complete(&self) -> bool {
 150         self.lits.iter().any(|lit| !lit.is_cut())
 151     }
 152
 153     /// Returns true if this set contains an empty literal.
 154     pub fn contains_empty(&self) -> bool {
 155         self.lits.iter().any(|lit| lit.is_empty())
 156     }
 157
 158     /// Returns true if this set is empty or if all of its members is empty.
 159     pub fn is_empty(&self) -> bool {
 160         self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
 161     }
 162
 163     /// Returns a new empty set of literals using this set's limits.
 164     pub fn to_empty(&self) -> Literals {
 165         let mut lits = Literals::empty();
 166         lits.set_limit_size(self.limit_size)
 167             .set_limit_class(self.limit_class);
 168         lits
 169     }
 170
 171     /// Returns the longest common prefix of all members in this set.
 172     pub fn longest_common_prefix(&self) -> &[u8] {
 173         if self.is_empty() {
 174             return &[];
 175         }
 176         let lit0 = &*self.lits[0];
 177         let mut len = lit0.len();
 178         for lit in &self.lits[1..] {
 179             len = cmp::min(
 180                 len,
 181                 lit.iter()
 182                    .zip(lit0)
 183                    .take_while(|&(a, b)| a == b)
 184                    .count());
 185         }
 186         &self.lits[0][..len]
 187     }
 188
 189     /// Returns the longest common suffix of all members in this set.
 190     pub fn longest_common_suffix(&self) -> &[u8] {
 191         if self.is_empty() {
 192             return &[];
 193         }
 194         let lit0 = &*self.lits[0];
 195         let mut len = lit0.len();
 196         for lit in &self.lits[1..] {
 197             len = cmp::min(
 198                 len,
 199                 lit.iter()
 200                    .rev()
 201                    .zip(lit0.iter().rev())
 202                    .take_while(|&(a, b)| a == b)
 203                    .count());
 204         }
 205         &self.lits[0][self.lits[0].len() - len..]
 206     }
 207
 208     /// Returns a new set of literals with the given number of bytes trimmed
 209     /// from the suffix of each literal.
 210     ///
 211     /// If any literal would be cut out completely by trimming, then None is
 212     /// returned.
 213     ///
 214     /// Any duplicates that are created as a result of this transformation are
 215     /// removed.
 216     pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
 217         if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
 218             return None;
 219         }
 220         let mut new = self.to_empty();
 221         for mut lit in self.lits.iter().cloned() {
 222             let new_len = lit.len() - num_bytes;
 223             lit.truncate(new_len);
 224             lit.cut();
 225             new.lits.push(lit);
 226         }
 227         new.lits.sort();
 228         new.lits.dedup();
 229         Some(new)
 230     }
 231
 232     /// Returns a new set of prefixes of this set of literals that are
 233     /// guaranteed to be unambiguous.
 234     ///
 235     /// Any substring match with a member of the set is returned is guaranteed
 236     /// to never overlap with a substring match of another member of the set
 237     /// at the same starting position.
 238     ///
 239     /// Given any two members of the returned set, neither is a substring of
 240     /// the other.
 241     pub fn unambiguous_prefixes(&self) -> Literals {
 242         if self.lits.is_empty() {
 243             return self.to_empty();
 244         }
 245         let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
 246         let mut new = self.to_empty();
 247     'OUTER:
 248         while let Some(mut candidate) = old.pop() {
 249             if candidate.is_empty() {
 250                 continue;
 251             }
 252             if new.lits.is_empty() {
 253                 new.lits.push(candidate);
 254                 continue;
 255             }
 256             for lit2 in &mut new.lits {
 257                 if lit2.is_empty() {
 258                     continue;
 259                 }
 260                 if &candidate == lit2 {
 261                     // If the literal is already in the set, then we can
 262                     // just drop it. But make sure that cut literals are
 263                     // infectious!
 264                     candidate.cut = candidate.cut || lit2.cut;
 265                     lit2.cut = candidate.cut;
 266                     continue 'OUTER;
 267                 }
 268                 if candidate.len() < lit2.len() {
 269                     if let Some(i) = position(&candidate, &lit2) {
 270                         candidate.cut();
 271                         let mut lit3 = lit2.clone();
 272                         lit3.truncate(i);
 273                         lit3.cut();
 274                         old.push(lit3);
 275                         lit2.clear();
 276                     }
 277                 } else {
 278                     if let Some(i) = position(&lit2, &candidate) {
 279                         lit2.cut();
 280                         let mut new_candidate = candidate.clone();
 281                         new_candidate.truncate(i);
 282                         new_candidate.cut();
 283                         old.push(new_candidate);
 284                         candidate.clear();
 285                     }
 286                 }
 287                 // Oops, the candidate is already represented in the set.
 288                 if candidate.is_empty() {
 289                     continue 'OUTER;
 290                 }
 291             }
 292             new.lits.push(candidate);
 293         }
 294         new.lits.retain(|lit| !lit.is_empty());
 295         new.lits.sort();
 296         new.lits.dedup();
 297         new
 298     }
 299
 300     /// Returns a new set of suffixes of this set of literals that are
 301     /// guaranteed to be unambiguous.
 302     ///
 303     /// Any substring match with a member of the set is returned is guaranteed
 304     /// to never overlap with a substring match of another member of the set
 305     /// at the same ending position.
 306     ///
 307     /// Given any two members of the returned set, neither is a substring of
 308     /// the other.
 309     pub fn unambiguous_suffixes(&self) -> Literals {
 310         // This is a touch wasteful...
 311         let mut lits = self.clone();
 312         lits.reverse();
 313         let mut unamb = lits.unambiguous_prefixes();
 314         unamb.reverse();
 315         unamb
 316     }
 317
 318     /// Unions the prefixes from the given expression to this set.
 319     ///
 320     /// If prefixes could not be added (for example, this set would exceed its
 321     /// size limits or the set of prefixes from `expr` includes the empty
 322     /// string), then false is returned.
 323     ///
 324     /// Note that prefix literals extracted from `expr` are said to be complete
 325     /// if and only if the literal extends from the beginning of `expr` to the
 326     /// end of `expr`.
 327     pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
 328         let mut lits = self.to_empty();
 329         prefixes(expr, &mut lits);
 330         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
 331     }
 332
 333     /// Unions the suffixes from the given expression to this set.
 334     ///
 335     /// If suffixes could not be added (for example, this set would exceed its
 336     /// size limits or the set of suffixes from `expr` includes the empty
 337     /// string), then false is returned.
 338     ///
 339     /// Note that prefix literals extracted from `expr` are said to be complete
 340     /// if and only if the literal extends from the end of `expr` to the
 341     /// beginning of `expr`.
 342     pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
 343         let mut lits = self.to_empty();
 344         suffixes(expr, &mut lits);
 345         lits.reverse();
 346         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
 347     }
 348
 349     /// Unions this set with another set.
 350     ///
 351     /// If the union would cause the set to exceed its limits, then the union
 352     /// is skipped and it returns false. Otherwise, if the union succeeds, it
 353     /// returns true.
 354     pub fn union(&mut self, lits: Literals) -> bool {
 355         if self.num_bytes() + lits.num_bytes() > self.limit_size {
 356             return false;
 357         }
 358         if lits.is_empty() {
 359             self.lits.push(Literal::empty());
 360         } else {
 361             self.lits.extend(lits.lits);
 362         }
 363         true
 364     }
 365
 366     /// Extends this set with another set.
 367     ///
 368     /// The set of literals is extended via a cross product.
 369     ///
 370     /// If a cross product would cause this set to exceed its limits, then the
 371     /// cross product is skipped and it returns false. Otherwise, if the cross
 372     /// product succeeds, it returns true.
 373     pub fn cross_product(&mut self, lits: &Literals) -> bool {
 374         if lits.is_empty() {
 375             return true;
 376         }
 377         // Check that we make sure we stay in our limits.
 378         let mut size_after;
 379         if self.is_empty() || !self.any_complete() {
 380             size_after = self.num_bytes();
 381             for lits_lit in lits.literals() {
 382                 size_after += lits_lit.len();
 383             }
 384         } else {
 385             size_after = self.lits.iter().fold(0, |accum, lit| {
 386                 accum + if lit.is_cut() { lit.len() } else { 0 }
 387             });
 388             for lits_lit in lits.literals() {
 389                 for self_lit in self.literals() {
 390                     if !self_lit.is_cut() {
 391                         size_after += self_lit.len() + lits_lit.len();
 392                     }
 393                 }
 394             }
 395         }
 396         if size_after > self.limit_size {
 397             return false;
 398         }
 399
 400         let mut base = self.remove_complete();
 401         if base.is_empty() {
 402             base = vec![Literal::empty()];
 403         }
 404         for lits_lit in lits.literals() {
 405             for mut self_lit in base.clone() {
 406                 self_lit.extend(&**lits_lit);
 407                 self_lit.cut = lits_lit.cut;
 408                 self.lits.push(self_lit);
 409             }
 410         }
 411         true
 412     }
 413
 414     /// Extends each literal in this set with the bytes given.
 415     ///
 416     /// If the set is empty, then the given literal is added to the set.
 417     ///
 418     /// If adding any number of bytes to all members of this set causes a limit
 419     /// to be exceeded, then no bytes are added and false is returned. If a
 420     /// prefix of `bytes` can be fit into this set, then it is used and all
 421     /// resulting literals are cut.
 422     pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
 423         // N.B. This could be implemented by simply calling cross_product with
 424         // a literal set containing just `bytes`, but we can be smarter about
 425         // taking shorter prefixes of `bytes` if they'll fit.
 426         if bytes.is_empty() {
 427             return true;
 428         }
 429         if self.lits.is_empty() {
 430             let i = cmp::min(self.limit_size, bytes.len());
 431             self.lits.push(Literal::new(bytes[..i].to_owned()));
 432             self.lits[0].cut = i < bytes.len();
 433             return !self.lits[0].is_cut();
 434         }
 435         let size = self.num_bytes();
 436         if size + self.lits.len() >= self.limit_size {
 437             return false;
 438         }
 439         let mut i = 1;
 440         while size + (i * self.lits.len()) <= self.limit_size
 441             && i < bytes.len() {
 442             i += 1;
 443         }
 444         for lit in &mut self.lits {
 445             if !lit.is_cut() {
 446                 lit.extend(&bytes[..i]);
 447                 if i < bytes.len() {
 448                     lit.cut();
 449                 }
 450             }
 451         }
 452         true
 453     }
 454
 455     /// Adds the given literal to this set.
 456     ///
 457     /// Returns false if adding this literal would cause the class to be too
 458     /// big.
 459     pub fn add(&mut self, lit: Literal) -> bool {
 460         if self.num_bytes() + lit.len() > self.limit_size {
 461             return false;
 462         }
 463         self.lits.push(lit);
 464         true
 465     }
 466
 467     /// Extends each literal in this set with the character class given.
 468     ///
 469     /// Returns false if the character class was too big to add.
 470     pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
 471         self._add_char_class(cls, false)
 472     }
 473
 474     /// Extends each literal in this set with the character class given,
 475     /// writing the bytes of each character in reverse.
 476     ///
 477     /// Returns false if the character class was too big to add.
 478     fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
 479         self._add_char_class(cls, true)
 480     }
 481
 482     fn _add_char_class(
 483         &mut self,
 484         cls: &hir::ClassUnicode,
 485         reverse: bool,
 486     ) -> bool {
 487         use std::char;
 488
 489         if self.class_exceeds_limits(cls_char_count(cls)) {
 490             return false;
 491         }
 492         let mut base = self.remove_complete();
 493         if base.is_empty() {
 494             base = vec![Literal::empty()];
 495         }
 496         for r in cls.iter() {
 497             let (s, e) = (r.start as u32, r.end as u32 + 1);
 498             for c in (s..e).filter_map(char::from_u32) {
 499                 for mut lit in base.clone() {
 500                     let mut bytes = c.to_string().into_bytes();
 501                     if reverse {
 502                         bytes.reverse();
 503                     }
 504                     lit.extend(&bytes);
 505                     self.lits.push(lit);
 506                 }
 507             }
 508         }
 509         true
 510     }
 511
 512     /// Extends each literal in this set with the byte class given.
 513     ///
 514     /// Returns false if the byte class was too big to add.
 515     pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
 516         if self.class_exceeds_limits(cls_byte_count(cls)) {
 517             return false;
 518         }
 519         let mut base = self.remove_complete();
 520         if base.is_empty() {
 521             base = vec![Literal::empty()];
 522         }
 523         for r in cls.iter() {
 524             let (s, e) = (r.start as u32, r.end as u32 + 1);
 525             for b in (s..e).map(|b| b as u8) {
 526                 for mut lit in base.clone() {
 527                     lit.push(b);
 528                     self.lits.push(lit);
 529                 }
 530             }
 531         }
 532         true
 533     }
 534
 535     /// Cuts every member of this set. When a member is cut, it can never
 536     /// be extended.
 537     pub fn cut(&mut self) {
 538         for lit in &mut self.lits {
 539             lit.cut();
 540         }
 541     }
 542
 543     /// Reverses all members in place.
 544     pub fn reverse(&mut self) {
 545         for lit in &mut self.lits {
 546             lit.reverse();
 547         }
 548     }
 549
 550     /// Clears this set of all members.
 551     pub fn clear(&mut self) {
 552         self.lits.clear();
 553     }
 554
 555     /// Pops all complete literals out of this set.
 556     fn remove_complete(&mut self) -> Vec<Literal> {
 557         let mut base = vec![];
 558         for lit in mem::replace(&mut self.lits, vec![]) {
 559             if lit.is_cut() {
 560                 self.lits.push(lit);
 561             } else {
 562                 base.push(lit);
 563             }
 564         }
 565         base
 566     }
 567
 568     /// Returns the total number of bytes in this set.
 569     fn num_bytes(&self) -> usize {
 570         self.lits.iter().fold(0, |accum, lit| accum + lit.len())
 571     }
 572
 573     /// Returns true if a character class with the given size would cause this
 574     /// set to exceed its limits.
 575     ///
 576     /// The size given should correspond to the number of items in the class.
 577     fn class_exceeds_limits(&self, size: usize) -> bool {
 578         if size > self.limit_class {
 579             return true;
 580         }
 581         // This is an approximation since codepoints in a char class can encode
 582         // to 1-4 bytes.
 583         let new_byte_count =
 584             if self.lits.is_empty() {
 585                 size
 586             } else {
 587                 self.lits
 588                     .iter()
 589                     .fold(0, |accum, lit| {
 590                         accum + if lit.is_cut() {
 591                             // If the literal is cut, then we'll never add
 592                             // anything to it, so don't count it.
 593                             0
 594                         } else {
 595                             (lit.len() + 1) * size
 596                         }
 597                     })
 598             };
 599         new_byte_count > self.limit_size
 600     }
 601 }
 602
 603 fn prefixes(expr: &Hir, lits: &mut Literals) {
 604     match *expr.kind() {
 605         HirKind::Literal(hir::Literal::Unicode(c)) => {
 606             let mut buf = [0u8; 4];
 607             let i = unicode::encode_utf8(c, &mut buf).unwrap();
 608             lits.cross_add(&buf[..i]);
 609         }
 610         HirKind::Literal(hir::Literal::Byte(b)) => {
 611             lits.cross_add(&[b]);
 612         }
 613         HirKind::Class(hir::Class::Unicode(ref cls)) => {
 614             if !lits.add_char_class(cls) {
 615                 lits.cut();
 616             }
 617         }
 618         HirKind::Class(hir::Class::Bytes(ref cls)) => {
 619             if !lits.add_byte_class(cls) {
 620                 lits.cut();
 621             }
 622         }
 623         HirKind::Group(hir::Group { ref hir, .. }) => {
 624             prefixes(&**hir, lits);
 625         }
 626         HirKind::Repetition(ref x) => {
 627             match x.kind {
 628                 hir::RepetitionKind::ZeroOrOne => {
 629                     repeat_zero_or_one_literals(&x.hir, lits, prefixes);
 630                 }
 631                 hir::RepetitionKind::ZeroOrMore => {
 632                     repeat_zero_or_more_literals(&x.hir, lits, prefixes);
 633                 }
 634                 hir::RepetitionKind::OneOrMore => {
 635                     repeat_one_or_more_literals(&x.hir, lits, prefixes);
 636                 }
 637                 hir::RepetitionKind::Range(ref rng) => {
 638                     let (min, max) = match *rng {
 639                         hir::RepetitionRange::Exactly(m) => {
 640                             (m, Some(m))
 641                         }
 642                         hir::RepetitionRange::AtLeast(m) => {
 643                             (m, None)
 644                         }
 645                         hir::RepetitionRange::Bounded(m, n) => {
 646                             (m, Some(n))
 647                         }
 648                     };
 649                     repeat_range_literals(
 650                         &x.hir, min, max, x.greedy, lits, prefixes)
 651                 }
 652             }
 653         }
 654         HirKind::Concat(ref es) if es.is_empty() => {}
 655         HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
 656         HirKind::Concat(ref es) => {
 657             for e in es {
 658                 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
 659                     if !lits.is_empty() {
 660                         lits.cut();
 661                         break;
 662                     }
 663                     lits.add(Literal::empty());
 664                     continue;
 665                 }
 666                 let mut lits2 = lits.to_empty();
 667                 prefixes(e, &mut lits2);
 668                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
 669                     // If this expression couldn't yield any literal that
 670                     // could be extended, then we need to quit. Since we're
 671                     // short-circuiting, we also need to freeze every member.
 672                     lits.cut();
 673                     break;
 674                 }
 675             }
 676         }
 677         HirKind::Alternation(ref es) => {
 678             alternate_literals(es, lits, prefixes);
 679         }
 680         _ => lits.cut(),
 681     }
 682 }
 683
 684 fn suffixes(expr: &Hir, lits: &mut Literals) {
 685     match *expr.kind() {
 686         HirKind::Literal(hir::Literal::Unicode(c)) => {
 687             let mut buf = [0u8; 4];
 688             let i = unicode::encode_utf8(c, &mut buf).unwrap();
 689             let mut buf = &mut buf[..i];
 690             buf.reverse();
 691             lits.cross_add(buf);
 692         }
 693         HirKind::Literal(hir::Literal::Byte(b)) => {
 694             lits.cross_add(&[b]);
 695         }
 696         HirKind::Class(hir::Class::Unicode(ref cls)) => {
 697             if !lits.add_char_class_reverse(cls) {
 698                 lits.cut();
 699             }
 700         }
 701         HirKind::Class(hir::Class::Bytes(ref cls)) => {
 702             if !lits.add_byte_class(cls) {
 703                 lits.cut();
 704             }
 705         }
 706         HirKind::Group(hir::Group { ref hir, .. }) => {
 707             suffixes(&**hir, lits);
 708         }
 709         HirKind::Repetition(ref x) => {
 710             match x.kind {
 711                 hir::RepetitionKind::ZeroOrOne => {
 712                     repeat_zero_or_one_literals(&x.hir, lits, suffixes);
 713                 }
 714                 hir::RepetitionKind::ZeroOrMore => {
 715                     repeat_zero_or_more_literals(&x.hir, lits, suffixes);
 716                 }
 717                 hir::RepetitionKind::OneOrMore => {
 718                     repeat_one_or_more_literals(&x.hir, lits, suffixes);
 719                 }
 720                 hir::RepetitionKind::Range(ref rng) => {
 721                     let (min, max) = match *rng {
 722                         hir::RepetitionRange::Exactly(m) => {
 723                             (m, Some(m))
 724                         }
 725                         hir::RepetitionRange::AtLeast(m) => {
 726                             (m, None)
 727                         }
 728                         hir::RepetitionRange::Bounded(m, n) => {
 729                             (m, Some(n))
 730                         }
 731                     };
 732                     repeat_range_literals(
 733                         &x.hir, min, max, x.greedy, lits, suffixes)
 734                 }
 735             }
 736         }
 737         HirKind::Concat(ref es) if es.is_empty() => {}
 738         HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
 739         HirKind::Concat(ref es) => {
 740             for e in es.iter().rev() {
 741                 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
 742                     if !lits.is_empty() {
 743                         lits.cut();
 744                         break;
 745                     }
 746                     lits.add(Literal::empty());
 747                     continue;
 748                 }
 749                 let mut lits2 = lits.to_empty();
 750                 suffixes(e, &mut lits2);
 751                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
 752                     // If this expression couldn't yield any literal that
 753                     // could be extended, then we need to quit. Since we're
 754                     // short-circuiting, we also need to freeze every member.
 755                     lits.cut();
 756                     break;
 757                 }
 758             }
 759         }
 760         HirKind::Alternation(ref es) => {
 761             alternate_literals(es, lits, suffixes);
 762         }
 763         _ => lits.cut(),
 764     }
 765 }
 766
 767 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
 768     e: &Hir,
 769     lits: &mut Literals,
 770     mut f: F,
 771 ) {
 772     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
 773     lits3.set_limit_size(lits.limit_size() / 2);
 774     f(e, &mut lits3);
 775
 776     if lits3.is_empty() || !lits2.cross_product(&lits3) {
 777         lits.cut();
 778         return;
 779     }
 780     lits2.add(Literal::empty());
 781     if !lits.union(lits2) {
 782         lits.cut();
 783     }
 784 }
 785
 786 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
 787     e: &Hir,
 788     lits: &mut Literals,
 789     mut f: F,
 790 ) {
 791     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
 792     lits3.set_limit_size(lits.limit_size() / 2);
 793     f(e, &mut lits3);
 794
 795     if lits3.is_empty() || !lits2.cross_product(&lits3) {
 796         lits.cut();
 797         return;
 798     }
 799     lits2.cut();
 800     lits2.add(Literal::empty());
 801     if !lits.union(lits2) {
 802         lits.cut();
 803     }
 804 }
 805
 806 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
 807     e: &Hir,
 808     lits: &mut Literals,
 809     mut f: F,
 810 ) {
 811     f(e, lits);
 812     lits.cut();
 813 }
 814
 815 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
 816     e: &Hir,
 817     min: u32,
 818     max: Option<u32>,
 819     greedy: bool,
 820     lits: &mut Literals,
 821     mut f: F,
 822 ) {
 823     if min == 0 {
 824         // This is a bit conservative. If `max` is set, then we could
 825         // treat this as a finite set of alternations. For now, we
 826         // just treat it as `e*`.
 827         f(&Hir::repetition(hir::Repetition {
 828             kind: hir::RepetitionKind::ZeroOrMore,
 829             greedy: greedy,
 830             hir: Box::new(e.clone()),
 831         }), lits);
 832     } else {
 833         if min > 0 {
 834             let n = cmp::min(lits.limit_size, min as usize);
 835             let es = iter::repeat(e.clone()).take(n).collect();
 836             f(&Hir::concat(es), lits);
 837             if n < min as usize || lits.contains_empty() {
 838                 lits.cut();
 839             }
 840         }
 841         if max.map_or(true, |max| min < max) {
 842             lits.cut();
 843         }
 844     }
 845 }
 846
 847 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
 848     es: &[Hir],
 849     lits: &mut Literals,
 850     mut f: F,
 851 ) {
 852     let mut lits2 = lits.to_empty();
 853     for e in es {
 854         let mut lits3 = lits.to_empty();
 855         lits3.set_limit_size(lits.limit_size() / 5);
 856         f(e, &mut lits3);
 857         if lits3.is_empty() || !lits2.union(lits3) {
 858             // If we couldn't find suffixes for *any* of the
 859             // alternates, then the entire alternation has to be thrown
 860             // away and any existing members must be frozen. Similarly,
 861             // if the union couldn't complete, stop and freeze.
 862             lits.cut();
 863             return;
 864         }
 865     }
 866     if !lits.cross_product(&lits2) {
 867         lits.cut();
 868     }
 869 }
 870
 871 impl fmt::Debug for Literals {
 872     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 873         f.debug_struct("Literals")
 874          .field("lits", &self.lits)
 875          .field("limit_size", &self.limit_size)
 876          .field("limit_class", &self.limit_class)
 877          .finish()
 878     }
 879 }
 880
 881 impl Literal {
 882     /// Returns a new complete literal with the bytes given.
 883     pub fn new(bytes: Vec<u8>) -> Literal {
 884         Literal { v: bytes, cut: false }
 885     }
 886
 887     /// Returns a new complete empty literal.
 888     pub fn empty() -> Literal {
 889         Literal { v: vec![], cut: false }
 890     }
 891
 892     /// Returns true if this literal was "cut."
 893     pub fn is_cut(&self) -> bool {
 894         self.cut
 895     }
 896
 897     /// Cuts this literal.
 898     pub fn cut(&mut self) {
 899         self.cut = true;
 900     }
 901 }
 902
 903 impl PartialEq for Literal {
 904     fn eq(&self, other: &Literal) -> bool {
 905         self.v == other.v
 906     }
 907 }
 908
 909 impl PartialOrd for Literal {
 910     fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
 911         self.v.partial_cmp(&other.v)
 912     }
 913 }
 914
 915 impl fmt::Debug for Literal {
 916     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 917         if self.is_cut() {
 918             write!(f, "Cut({})", escape_unicode(&self.v))
 919         } else {
 920             write!(f, "Complete({})", escape_unicode(&self.v))
 921         }
 922     }
 923 }
 924
 925 impl AsRef<[u8]> for Literal {
 926     fn as_ref(&self) -> &[u8] { &self.v }
 927 }
 928
 929 impl ops::Deref for Literal {
 930     type Target = Vec<u8>;
 931     fn deref(&self) -> &Vec<u8> { &self.v }
 932 }
 933
 934 impl ops::DerefMut for Literal {
 935     fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
 936 }
 937
 938 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
 939     let mut i = 0;
 940     while haystack.len() >= needle.len() {
 941         if needle == &haystack[..needle.len()] {
 942             return Some(i);
 943         }
 944         i += 1;
 945         haystack = &haystack[1..];
 946     }
 947     None
 948 }
 949
 950 fn escape_unicode(bytes: &[u8]) -> String {
 951     let show = match ::std::str::from_utf8(bytes) {
 952         Ok(v) => v.to_string(),
 953         Err(_) => escape_bytes(bytes),
 954     };
 955     let mut space_escaped = String::new();
 956     for c in show.chars() {
 957         if c.is_whitespace() {
 958             let escaped = if c as u32 <= 0x7F {
 959                 escape_byte(c as u8)
 960             } else {
 961                 if c as u32 <= 0xFFFF {
 962                     format!(r"\u{{{:04x}}}", c as u32)
 963                 } else {
 964                     format!(r"\U{{{:08x}}}", c as u32)
 965                 }
 966             };
 967             space_escaped.push_str(&escaped);
 968         } else {
 969             space_escaped.push(c);
 970         }
 971     }
 972     space_escaped
 973 }
 974
 975 fn escape_bytes(bytes: &[u8]) -> String {
 976     let mut s = String::new();
 977     for &b in bytes {
 978         s.push_str(&escape_byte(b));
 979     }
 980     s
 981 }
 982
 983 fn escape_byte(byte: u8) -> String {
 984     use std::ascii::escape_default;
 985
 986     let escaped: Vec<u8> = escape_default(byte).collect();
 987     String::from_utf8_lossy(&escaped).into_owned()
 988 }
 989
 990 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
 991     cls.iter()
 992         .map(|&r| 1 + (r.end as u32) - (r.start as u32))
 993         .sum::<u32>() as usize
 994 }
 995
 996 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
 997     cls.iter()
 998         .map(|&r| 1 + (r.end as u32) - (r.start as u32))
 999         .sum::<u32>() as usize
1000 }
1001
1002 #[cfg(test)]
1003 mod tests {
1004     use std::fmt;
1005
1006     use ParserBuilder;
1007     use hir::Hir;
1008     use super::{Literals, Literal, escape_bytes};
1009
1010     // To make test failures easier to read.
1011     #[derive(Debug, Eq, PartialEq)]
1012     struct Bytes(Vec<ULiteral>);
1013     #[derive(Debug, Eq, PartialEq)]
1014     struct Unicode(Vec<ULiteral>);
1015
1016     fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
1017         let mut ulits = vec![];
1018         for blit in blits {
1019             ulits.push(ULiteral {
1020                 v: escape_bytes(&blit),
1021                 cut: blit.is_cut(),
1022             });
1023         }
1024         ulits
1025     }
1026
1027     fn create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals {
1028         Literals {
1029             lits: it.into_iter().collect(),
1030             limit_size: 0,
1031             limit_class: 0,
1032         }
1033     }
1034
1035     // Needs to be pub for 1.3?
1036     #[derive(Clone, Eq, PartialEq)]
1037     pub struct ULiteral {
1038         v: String,
1039         cut: bool,
1040     }
1041
1042     impl ULiteral {
1043         fn is_cut(&self) -> bool { self.cut }
1044     }
1045
1046     impl fmt::Debug for ULiteral {
1047         fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1048             if self.is_cut() {
1049                 write!(f, "Cut({})", self.v)
1050             } else {
1051                 write!(f, "Complete({})", self.v)
1052             }
1053         }
1054     }
1055
1056     impl PartialEq<Literal> for ULiteral {
1057         fn eq(&self, other: &Literal) -> bool {
1058             self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1059         }
1060     }
1061
1062     impl PartialEq<ULiteral> for Literal {
1063         fn eq(&self, other: &ULiteral) -> bool {
1064             &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1065         }
1066     }
1067
1068     #[allow(non_snake_case)]
1069     fn C(s: &'static str) -> ULiteral {
1070         ULiteral { v: s.to_owned(), cut: true }
1071     }
1072     #[allow(non_snake_case)]
1073     fn M(s: &'static str) -> ULiteral {
1074         ULiteral { v: s.to_owned(), cut: false }
1075     }
1076
1077     fn prefixes(lits: &mut Literals, expr: &Hir) {
1078         lits.union_prefixes(expr);
1079     }
1080
1081     fn suffixes(lits: &mut Literals, expr: &Hir) {
1082         lits.union_suffixes(expr);
1083     }
1084
1085     macro_rules! assert_lit_eq {
1086         ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1087             let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1088             let lits = $got_lits;
1089             assert_eq!(
1090                 $which(expected.clone()),
1091                 $which(escape_lits(lits.literals())));
1092             assert_eq!(
1093                 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1094                 lits.all_complete());
1095             assert_eq!(
1096                 expected.iter().any(|l| !l.is_cut()),
1097                 lits.any_complete());
1098         }};
1099     }
1100
1101     macro_rules! test_lit {
1102         ($name:ident, $which:ident, $re:expr) => {
1103             test_lit!($name, $which, $re,);
1104         };
1105         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1106             #[test]
1107             fn $name() {
1108                 let expr = ParserBuilder::new()
1109                     .build()
1110                     .parse($re)
1111                     .unwrap();
1112                 let lits = Literals::$which(&expr);
1113                 assert_lit_eq!(Unicode, lits, $($lit),*);
1114
1115                 let expr = ParserBuilder::new()
1116                     .allow_invalid_utf8(true)
1117                     .unicode(false)
1118                     .build()
1119                     .parse($re)
1120                     .unwrap();
1121                 let lits = Literals::$which(&expr);
1122                 assert_lit_eq!(Bytes, lits, $($lit),*);
1123             }
1124         };
1125     }
1126
1127     // ************************************************************************
1128     // Tests for prefix literal extraction.
1129     // ************************************************************************
1130
1131     // Elementary tests.
1132     test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1133     test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1134     test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1135     test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1136     test_lit!(pfx_class1, prefixes, "[1-4]",
1137               M("1"), M("2"), M("3"), M("4"));
1138     test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
1139               M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1140     test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
1141               M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1142               M("\\xe2\\x98\\x83"));
1143     test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
1144               M("A"), M("a"));
1145     test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
1146               M("ABC"), M("aBC"), M("AbC"), M("abC"),
1147               M("ABc"), M("aBc"), M("Abc"), M("abc"));
1148     test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1149     test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1150     test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1151     test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1152     test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1153     test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1154     test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1155     test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1156     test_lit!(pfx_rep_range1, prefixes, "a{0}");
1157     test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1158     test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1159     test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1160     test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1161     test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1162     test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1163
1164     // Test regexes with concatenations.
1165     test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1166     test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1167     test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
1168               M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1169               M("Az"), M("Bz"), M("az"), M("bz"));
1170     test_lit!(pfx_cat4, prefixes, "[ab][yz]",
1171               M("ay"), M("by"), M("az"), M("bz"));
1172     test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1173     test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1174     test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1175     test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1176     test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1177     test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1178     test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1179     test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1180     test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1181     test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1182     test_lit!(pfx_cat15, prefixes, "$a");
1183     test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1184     test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1185     test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1186     test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1187
1188     // Test regexes with alternations.
1189     test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1190     test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1191     test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1192     test_lit!(pfx_alt4, prefixes, "a|b*");
1193     test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1194     test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1195     test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
1196               C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1197     test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1198
1199     // Test regexes with empty assertions.
1200     test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1201     test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1202     test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1203     test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1204
1205     // Make sure some curious regexes have no prefixes.
1206     test_lit!(pfx_nothing1, prefixes, ".");
1207     test_lit!(pfx_nothing2, prefixes, "(?s).");
1208     test_lit!(pfx_nothing3, prefixes, "^");
1209     test_lit!(pfx_nothing4, prefixes, "$");
1210     test_lit!(pfx_nothing6, prefixes, "(?m)$");
1211     test_lit!(pfx_nothing7, prefixes, r"\b");
1212     test_lit!(pfx_nothing8, prefixes, r"\B");
1213
1214     // Test a few regexes that defeat any prefix literal detection.
1215     test_lit!(pfx_defeated1, prefixes, ".a");
1216     test_lit!(pfx_defeated2, prefixes, "(?s).a");
1217     test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1218     test_lit!(pfx_defeated4, prefixes, "a|.");
1219     test_lit!(pfx_defeated5, prefixes, ".|a");
1220     test_lit!(pfx_defeated6, prefixes, "a|^");
1221     test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1222     test_lit!(pfx_defeated8, prefixes, "$a");
1223     test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1224     test_lit!(pfx_defeated10, prefixes, r"\ba");
1225     test_lit!(pfx_defeated11, prefixes, r"\Ba");
1226     test_lit!(pfx_defeated12, prefixes, "^*a");
1227     test_lit!(pfx_defeated13, prefixes, "^+a");
1228
1229     test_lit!(
1230         pfx_crazy1,
1231         prefixes,
1232         r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1233         C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1234
1235     // ************************************************************************
1236     // Tests for quiting prefix literal search.
1237     // ************************************************************************
1238
1239     macro_rules! test_exhausted {
1240         ($name:ident, $which:ident, $re:expr) => {
1241             test_exhausted!($name, $which, $re,);
1242         };
1243         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1244             #[test]
1245             fn $name() {
1246                 let expr = ParserBuilder::new()
1247                     .build()
1248                     .parse($re)
1249                     .unwrap();
1250                 let mut lits = Literals::empty();
1251                 lits.set_limit_size(20).set_limit_class(10);
1252                 $which(&mut lits, &expr);
1253                 assert_lit_eq!(Unicode, lits, $($lit),*);
1254
1255                 let expr = ParserBuilder::new()
1256                     .allow_invalid_utf8(true)
1257                     .unicode(false)
1258                     .build()
1259                     .parse($re)
1260                     .unwrap();
1261                 let mut lits = Literals::empty();
1262                 lits.set_limit_size(20).set_limit_class(10);
1263                 $which(&mut lits, &expr);
1264                 assert_lit_eq!(Bytes, lits, $($lit),*);
1265             }
1266         };
1267     }
1268
1269     // These test use a much lower limit than the default so that we can
1270     // write test cases of reasonable size.
1271     test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1272     test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1273     test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1274     test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
1275                     C("FO"), C("fO"), C("Fo"), C("fo"));
1276     test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
1277                     C("abababababababababab"));
1278     test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
1279                     C("ababababab"), M("cd"));
1280     test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
1281                     C("zababababab"), M("zcd"));
1282     test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
1283                     C("aaaaaaaaaaaaaaaaaaaa"));
1284
1285     // ************************************************************************
1286     // Tests for suffix literal extraction.
1287     // ************************************************************************
1288
1289     // Elementary tests.
1290     test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1291     test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1292     test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1293     test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1294     test_lit!(sfx_class1, suffixes, "[1-4]",
1295               M("1"), M("2"), M("3"), M("4"));
1296     test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
1297               M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1298     test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
1299               M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1300               M("\\xe2\\x98\\x83"));
1301     test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
1302               M("A"), M("a"));
1303     test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
1304               M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1305               M("aBC"), M("aBc"), M("abC"), M("abc"));
1306     test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1307     test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1308     test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1309     test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1310     test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1311     test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1312     test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1313     test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1314     test_lit!(sfx_rep_range1, suffixes, "a{0}");
1315     test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1316     test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1317     test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1318     test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1319     test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1320     test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1321
1322     // Test regexes with concatenations.
1323     test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1324     test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1325     test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
1326               M("AZ"), M("Az"), M("BZ"), M("Bz"),
1327               M("aZ"), M("az"), M("bZ"), M("bz"));
1328     test_lit!(sfx_cat4, suffixes, "[ab][yz]",
1329               M("ay"), M("az"), M("by"), M("bz"));
1330     test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1331     test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1332     test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1333     test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1334     test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1335     test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1336     test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1337     test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1338     test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1339     test_lit!(sfx_cat14, suffixes, "a^");
1340     test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1341     test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1342     test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1343     test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1344     test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1345
1346     // Test regexes with alternations.
1347     test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1348     test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1349     test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1350     test_lit!(sfx_alt4, suffixes, "a|b*");
1351     test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1352     test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1353     test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
1354               C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1355     test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1356
1357     // Test regexes with empty assertions.
1358     test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1359     test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1360
1361     // Make sure some curious regexes have no suffixes.
1362     test_lit!(sfx_nothing1, suffixes, ".");
1363     test_lit!(sfx_nothing2, suffixes, "(?s).");
1364     test_lit!(sfx_nothing3, suffixes, "^");
1365     test_lit!(sfx_nothing4, suffixes, "$");
1366     test_lit!(sfx_nothing6, suffixes, "(?m)$");
1367     test_lit!(sfx_nothing7, suffixes, r"\b");
1368     test_lit!(sfx_nothing8, suffixes, r"\B");
1369
1370     // Test a few regexes that defeat any suffix literal detection.
1371     test_lit!(sfx_defeated1, suffixes, "a.");
1372     test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1373     test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1374     test_lit!(sfx_defeated4, suffixes, "a|.");
1375     test_lit!(sfx_defeated5, suffixes, ".|a");
1376     test_lit!(sfx_defeated6, suffixes, "a|^");
1377     test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1378     test_lit!(sfx_defeated8, suffixes, "a^");
1379     test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1380     test_lit!(sfx_defeated10, suffixes, r"a\b");
1381     test_lit!(sfx_defeated11, suffixes, r"a\B");
1382     test_lit!(sfx_defeated12, suffixes, "a^*");
1383     test_lit!(sfx_defeated13, suffixes, "a^+");
1384
1385     // These test use a much lower limit than the default so that we can
1386     // write test cases of reasonable size.
1387     test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1388     test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1389     test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1390     test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
1391                     C("AR"), C("Ar"), C("aR"), C("ar"));
1392     test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
1393                     C("abababababababababab"));
1394     test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
1395                     C("ababababab"), M("cd"));
1396     test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
1397                     C("abababababz"), M("cdz"));
1398     test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
1399                     C("aaaaaaaaaaaaaaaaaaaa"));
1400
1401     // ************************************************************************
1402     // Tests for generating unambiguous literal sets.
1403     // ************************************************************************
1404
1405     macro_rules! test_unamb {
1406         ($name:ident, $given:expr, $expected:expr) => {
1407             #[test]
1408             fn $name() {
1409                 let given: Vec<Literal> =
1410                     $given
1411                     .into_iter()
1412                     .map(|ul| {
1413                         let cut = ul.is_cut();
1414                         Literal { v: ul.v.into_bytes(), cut: cut }
1415                     })
1416                     .collect();
1417                 let lits = create_lits(given);
1418                 let got = lits.unambiguous_prefixes();
1419                 assert_eq!($expected, escape_lits(got.literals()));
1420             }
1421         };
1422     }
1423
1424     test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1425     test_unamb!(unambiguous2,
1426                 vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
1427     test_unamb!(unambiguous3,
1428                 vec![M("Sherlock"), M("Watson")],
1429                 vec![M("Sherlock"), M("Watson")]);
1430     test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1431     test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1432     test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1433     test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1434     test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1435     test_unamb!(unambiguous9,
1436                 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1437                 vec![C("a"), C("b"), C("c")]);
1438     test_unamb!(unambiguous10,
1439                 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1440                 vec![C("Mo"), C("Mu")]);
1441     test_unamb!(unambiguous11,
1442                 vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
1443     test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1444     test_unamb!(unambiguous13,
1445                 vec![M("ABCX"), M("CDAX"), M("BCX")],
1446                 vec![C("A"), C("BCX"), C("CD")]);
1447     test_unamb!(unambiguous14,
1448                 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1449                 vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
1450     test_unamb!(unambiguous15,
1451                 vec![M("IMG_"), M("MG_"), M("CIMG")],
1452                 vec![C("C"), C("I"), C("MG_")]);
1453
1454
1455     // ************************************************************************
1456     // Tests for suffix trimming.
1457     // ************************************************************************
1458     macro_rules! test_trim {
1459         ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1460             #[test]
1461             fn $name() {
1462                 let given: Vec<Literal> =
1463                     $given
1464                     .into_iter()
1465                     .map(|ul| {
1466                         let cut = ul.is_cut();
1467                         Literal { v: ul.v.into_bytes(), cut: cut }
1468                     })
1469                     .collect();
1470                 let lits = create_lits(given);
1471                 let got = lits.trim_suffix($trim).unwrap();
1472                 assert_eq!($expected, escape_lits(got.literals()));
1473             }
1474         }
1475     }
1476
1477     test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1478     test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1479     test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1480     test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1481
1482     // ************************************************************************
1483     // Tests for longest common prefix.
1484     // ************************************************************************
1485
1486     macro_rules! test_lcp {
1487         ($name:ident, $given:expr, $expected:expr) => {
1488             #[test]
1489             fn $name() {
1490                 let given: Vec<Literal> =
1491                     $given
1492                     .into_iter()
1493                     .map(|s: &str| Literal {
1494                         v: s.to_owned().into_bytes(),
1495                         cut: false,
1496                     })
1497                     .collect();
1498                 let lits = create_lits(given);
1499                 let got = lits.longest_common_prefix();
1500                 assert_eq!($expected, escape_bytes(got));
1501             }
1502         };
1503     }
1504
1505     test_lcp!(lcp1, vec!["a"], "a");
1506     test_lcp!(lcp2, vec![], "");
1507     test_lcp!(lcp3, vec!["a", "b"], "");
1508     test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1509     test_lcp!(lcp5, vec!["ab", "a"], "a");
1510     test_lcp!(lcp6, vec!["a", "ab"], "a");
1511     test_lcp!(lcp7, vec!["ab", "b"], "");
1512     test_lcp!(lcp8, vec!["b", "ab"], "");
1513     test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1514     test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1515     test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1516     test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1517
1518     // ************************************************************************
1519     // Tests for longest common suffix.
1520     // ************************************************************************
1521
1522     macro_rules! test_lcs {
1523         ($name:ident, $given:expr, $expected:expr) => {
1524             #[test]
1525             fn $name() {
1526                 let given: Vec<Literal> =
1527                     $given
1528                     .into_iter()
1529                     .map(|s: &str| Literal {
1530                         v: s.to_owned().into_bytes(),
1531                         cut: false,
1532                     })
1533                     .collect();
1534                 let lits = create_lits(given);
1535                 let got = lits.longest_common_suffix();
1536                 assert_eq!($expected, escape_bytes(got));
1537             }
1538         };
1539     }
1540
1541     test_lcs!(lcs1, vec!["a"], "a");
1542     test_lcs!(lcs2, vec![], "");
1543     test_lcs!(lcs3, vec!["a", "b"], "");
1544     test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1545     test_lcs!(lcs5, vec!["ab", "a"], "");
1546     test_lcs!(lcs6, vec!["a", "ab"], "");
1547     test_lcs!(lcs7, vec!["ab", "b"], "b");
1548     test_lcs!(lcs8, vec!["b", "ab"], "b");
1549     test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1550     test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1551     test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1552     test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1553 }