vendor/regex-syntax/src/hir/literal/mod.rs

   1 /*!
   2 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
   3 */
   4
   5 use std::cmp;
   6 use std::fmt;
   7 use std::iter;
   8 use std::mem;
   9 use std::ops;
  10
  11 use crate::hir::{self, Hir, HirKind};
  12
  13 /// A set of literal byte strings extracted from a regular expression.
  14 ///
  15 /// Every member of the set is a `Literal`, which is represented by a
  16 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
  17 /// said to be either *complete* or *cut*. A complete literal means that
  18 /// it extends until the beginning (or end) of the regular expression. In
  19 /// some circumstances, this can be used to indicate a match in the regular
  20 /// expression.
  21 ///
  22 /// A key aspect of literal extraction is knowing when to stop. It is not
  23 /// feasible to blindly extract all literals from a regular expression, even if
  24 /// there are finitely many. For example, the regular expression `[0-9]{10}`
  25 /// has `10^10` distinct literals. For this reason, literal extraction is
  26 /// bounded to some low number by default using heuristics, but the limits can
  27 /// be tweaked.
  28 ///
  29 /// **WARNING**: Literal extraction uses stack space proportional to the size
  30 /// of the `Hir` expression. At some point, this drawback will be eliminated.
  31 /// To protect yourself, set a reasonable
  32 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
  33 /// This is done for you by default.
  34 #[derive(Clone, Eq, PartialEq)]
  35 pub struct Literals {
  36     lits: Vec<Literal>,
  37     limit_size: usize,
  38     limit_class: usize,
  39 }
  40
  41 /// A single member of a set of literals extracted from a regular expression.
  42 ///
  43 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
  44 /// and `Vec` operations are available.
  45 #[derive(Clone, Eq, Ord)]
  46 pub struct Literal {
  47     v: Vec<u8>,
  48     cut: bool,
  49 }
  50
  51 impl Literals {
  52     /// Returns a new empty set of literals using default limits.
  53     pub fn empty() -> Literals {
  54         Literals { lits: vec![], limit_size: 250, limit_class: 10 }
  55     }
  56
  57     /// Returns a set of literal prefixes extracted from the given `Hir`.
  58     pub fn prefixes(expr: &Hir) -> Literals {
  59         let mut lits = Literals::empty();
  60         lits.union_prefixes(expr);
  61         lits
  62     }
  63
  64     /// Returns a set of literal suffixes extracted from the given `Hir`.
  65     pub fn suffixes(expr: &Hir) -> Literals {
  66         let mut lits = Literals::empty();
  67         lits.union_suffixes(expr);
  68         lits
  69     }
  70
  71     /// Get the approximate size limit (in bytes) of this set.
  72     pub fn limit_size(&self) -> usize {
  73         self.limit_size
  74     }
  75
  76     /// Set the approximate size limit (in bytes) of this set.
  77     ///
  78     /// If extracting a literal would put the set over this limit, then
  79     /// extraction stops.
  80     ///
  81     /// The new limits will only apply to additions to this set. Existing
  82     /// members remain unchanged, even if the set exceeds the new limit.
  83     pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
  84         self.limit_size = size;
  85         self
  86     }
  87
  88     /// Get the character class size limit for this set.
  89     pub fn limit_class(&self) -> usize {
  90         self.limit_class
  91     }
  92
  93     /// Limits the size of character(or byte) classes considered.
  94     ///
  95     /// A value of `0` prevents all character classes from being considered.
  96     ///
  97     /// This limit also applies to case insensitive literals, since each
  98     /// character in the case insensitive literal is converted to a class, and
  99     /// then case folded.
 100     ///
 101     /// The new limits will only apply to additions to this set. Existing
 102     /// members remain unchanged, even if the set exceeds the new limit.
 103     pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
 104         self.limit_class = size;
 105         self
 106     }
 107
 108     /// Returns the set of literals as a slice. Its order is unspecified.
 109     pub fn literals(&self) -> &[Literal] {
 110         &self.lits
 111     }
 112
 113     /// Returns the length of the smallest literal.
 114     ///
 115     /// Returns None is there are no literals in the set.
 116     pub fn min_len(&self) -> Option<usize> {
 117         let mut min = None;
 118         for lit in &self.lits {
 119             match min {
 120                 None => min = Some(lit.len()),
 121                 Some(m) if lit.len() < m => min = Some(lit.len()),
 122                 _ => {}
 123             }
 124         }
 125         min
 126     }
 127
 128     /// Returns true if all members in this set are complete.
 129     pub fn all_complete(&self) -> bool {
 130         !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
 131     }
 132
 133     /// Returns true if any member in this set is complete.
 134     pub fn any_complete(&self) -> bool {
 135         self.lits.iter().any(|lit| !lit.is_cut())
 136     }
 137
 138     /// Returns true if this set contains an empty literal.
 139     pub fn contains_empty(&self) -> bool {
 140         self.lits.iter().any(|lit| lit.is_empty())
 141     }
 142
 143     /// Returns true if this set is empty or if all of its members is empty.
 144     pub fn is_empty(&self) -> bool {
 145         self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
 146     }
 147
 148     /// Returns a new empty set of literals using this set's limits.
 149     pub fn to_empty(&self) -> Literals {
 150         let mut lits = Literals::empty();
 151         lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
 152         lits
 153     }
 154
 155     /// Returns the longest common prefix of all members in this set.
 156     pub fn longest_common_prefix(&self) -> &[u8] {
 157         if self.is_empty() {
 158             return &[];
 159         }
 160         let lit0 = &*self.lits[0];
 161         let mut len = lit0.len();
 162         for lit in &self.lits[1..] {
 163             len = cmp::min(
 164                 len,
 165                 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
 166             );
 167         }
 168         &self.lits[0][..len]
 169     }
 170
 171     /// Returns the longest common suffix of all members in this set.
 172     pub fn longest_common_suffix(&self) -> &[u8] {
 173         if self.is_empty() {
 174             return &[];
 175         }
 176         let lit0 = &*self.lits[0];
 177         let mut len = lit0.len();
 178         for lit in &self.lits[1..] {
 179             len = cmp::min(
 180                 len,
 181                 lit.iter()
 182                     .rev()
 183                     .zip(lit0.iter().rev())
 184                     .take_while(|&(a, b)| a == b)
 185                     .count(),
 186             );
 187         }
 188         &self.lits[0][self.lits[0].len() - len..]
 189     }
 190
 191     /// Returns a new set of literals with the given number of bytes trimmed
 192     /// from the suffix of each literal.
 193     ///
 194     /// If any literal would be cut out completely by trimming, then None is
 195     /// returned.
 196     ///
 197     /// Any duplicates that are created as a result of this transformation are
 198     /// removed.
 199     pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
 200         if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
 201             return None;
 202         }
 203         let mut new = self.to_empty();
 204         for mut lit in self.lits.iter().cloned() {
 205             let new_len = lit.len() - num_bytes;
 206             lit.truncate(new_len);
 207             lit.cut();
 208             new.lits.push(lit);
 209         }
 210         new.lits.sort();
 211         new.lits.dedup();
 212         Some(new)
 213     }
 214
 215     /// Returns a new set of prefixes of this set of literals that are
 216     /// guaranteed to be unambiguous.
 217     ///
 218     /// Any substring match with a member of the set is returned is guaranteed
 219     /// to never overlap with a substring match of another member of the set
 220     /// at the same starting position.
 221     ///
 222     /// Given any two members of the returned set, neither is a substring of
 223     /// the other.
 224     pub fn unambiguous_prefixes(&self) -> Literals {
 225         if self.lits.is_empty() {
 226             return self.to_empty();
 227         }
 228         let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
 229         let mut new = self.to_empty();
 230         'OUTER: while let Some(mut candidate) = old.pop() {
 231             if candidate.is_empty() {
 232                 continue;
 233             }
 234             if new.lits.is_empty() {
 235                 new.lits.push(candidate);
 236                 continue;
 237             }
 238             for lit2 in &mut new.lits {
 239                 if lit2.is_empty() {
 240                     continue;
 241                 }
 242                 if &candidate == lit2 {
 243                     // If the literal is already in the set, then we can
 244                     // just drop it. But make sure that cut literals are
 245                     // infectious!
 246                     candidate.cut = candidate.cut || lit2.cut;
 247                     lit2.cut = candidate.cut;
 248                     continue 'OUTER;
 249                 }
 250                 if candidate.len() < lit2.len() {
 251                     if let Some(i) = position(&candidate, &lit2) {
 252                         candidate.cut();
 253                         let mut lit3 = lit2.clone();
 254                         lit3.truncate(i);
 255                         lit3.cut();
 256                         old.push(lit3);
 257                         lit2.clear();
 258                     }
 259                 } else {
 260                     if let Some(i) = position(&lit2, &candidate) {
 261                         lit2.cut();
 262                         let mut new_candidate = candidate.clone();
 263                         new_candidate.truncate(i);
 264                         new_candidate.cut();
 265                         old.push(new_candidate);
 266                         candidate.clear();
 267                     }
 268                 }
 269                 // Oops, the candidate is already represented in the set.
 270                 if candidate.is_empty() {
 271                     continue 'OUTER;
 272                 }
 273             }
 274             new.lits.push(candidate);
 275         }
 276         new.lits.retain(|lit| !lit.is_empty());
 277         new.lits.sort();
 278         new.lits.dedup();
 279         new
 280     }
 281
 282     /// Returns a new set of suffixes of this set of literals that are
 283     /// guaranteed to be unambiguous.
 284     ///
 285     /// Any substring match with a member of the set is returned is guaranteed
 286     /// to never overlap with a substring match of another member of the set
 287     /// at the same ending position.
 288     ///
 289     /// Given any two members of the returned set, neither is a substring of
 290     /// the other.
 291     pub fn unambiguous_suffixes(&self) -> Literals {
 292         // This is a touch wasteful...
 293         let mut lits = self.clone();
 294         lits.reverse();
 295         let mut unamb = lits.unambiguous_prefixes();
 296         unamb.reverse();
 297         unamb
 298     }
 299
 300     /// Unions the prefixes from the given expression to this set.
 301     ///
 302     /// If prefixes could not be added (for example, this set would exceed its
 303     /// size limits or the set of prefixes from `expr` includes the empty
 304     /// string), then false is returned.
 305     ///
 306     /// Note that prefix literals extracted from `expr` are said to be complete
 307     /// if and only if the literal extends from the beginning of `expr` to the
 308     /// end of `expr`.
 309     pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
 310         let mut lits = self.to_empty();
 311         prefixes(expr, &mut lits);
 312         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
 313     }
 314
 315     /// Unions the suffixes from the given expression to this set.
 316     ///
 317     /// If suffixes could not be added (for example, this set would exceed its
 318     /// size limits or the set of suffixes from `expr` includes the empty
 319     /// string), then false is returned.
 320     ///
 321     /// Note that prefix literals extracted from `expr` are said to be complete
 322     /// if and only if the literal extends from the end of `expr` to the
 323     /// beginning of `expr`.
 324     pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
 325         let mut lits = self.to_empty();
 326         suffixes(expr, &mut lits);
 327         lits.reverse();
 328         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
 329     }
 330
 331     /// Unions this set with another set.
 332     ///
 333     /// If the union would cause the set to exceed its limits, then the union
 334     /// is skipped and it returns false. Otherwise, if the union succeeds, it
 335     /// returns true.
 336     pub fn union(&mut self, lits: Literals) -> bool {
 337         if self.num_bytes() + lits.num_bytes() > self.limit_size {
 338             return false;
 339         }
 340         if lits.is_empty() {
 341             self.lits.push(Literal::empty());
 342         } else {
 343             self.lits.extend(lits.lits);
 344         }
 345         true
 346     }
 347
 348     /// Extends this set with another set.
 349     ///
 350     /// The set of literals is extended via a cross product.
 351     ///
 352     /// If a cross product would cause this set to exceed its limits, then the
 353     /// cross product is skipped and it returns false. Otherwise, if the cross
 354     /// product succeeds, it returns true.
 355     pub fn cross_product(&mut self, lits: &Literals) -> bool {
 356         if lits.is_empty() {
 357             return true;
 358         }
 359         // Check that we make sure we stay in our limits.
 360         let mut size_after;
 361         if self.is_empty() || !self.any_complete() {
 362             size_after = self.num_bytes();
 363             for lits_lit in lits.literals() {
 364                 size_after += lits_lit.len();
 365             }
 366         } else {
 367             size_after = self.lits.iter().fold(0, |accum, lit| {
 368                 accum + if lit.is_cut() { lit.len() } else { 0 }
 369             });
 370             for lits_lit in lits.literals() {
 371                 for self_lit in self.literals() {
 372                     if !self_lit.is_cut() {
 373                         size_after += self_lit.len() + lits_lit.len();
 374                     }
 375                 }
 376             }
 377         }
 378         if size_after > self.limit_size {
 379             return false;
 380         }
 381
 382         let mut base = self.remove_complete();
 383         if base.is_empty() {
 384             base = vec![Literal::empty()];
 385         }
 386         for lits_lit in lits.literals() {
 387             for mut self_lit in base.clone() {
 388                 self_lit.extend(&**lits_lit);
 389                 self_lit.cut = lits_lit.cut;
 390                 self.lits.push(self_lit);
 391             }
 392         }
 393         true
 394     }
 395
 396     /// Extends each literal in this set with the bytes given.
 397     ///
 398     /// If the set is empty, then the given literal is added to the set.
 399     ///
 400     /// If adding any number of bytes to all members of this set causes a limit
 401     /// to be exceeded, then no bytes are added and false is returned. If a
 402     /// prefix of `bytes` can be fit into this set, then it is used and all
 403     /// resulting literals are cut.
 404     pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
 405         // N.B. This could be implemented by simply calling cross_product with
 406         // a literal set containing just `bytes`, but we can be smarter about
 407         // taking shorter prefixes of `bytes` if they'll fit.
 408         if bytes.is_empty() {
 409             return true;
 410         }
 411         if self.lits.is_empty() {
 412             let i = cmp::min(self.limit_size, bytes.len());
 413             self.lits.push(Literal::new(bytes[..i].to_owned()));
 414             self.lits[0].cut = i < bytes.len();
 415             return !self.lits[0].is_cut();
 416         }
 417         let size = self.num_bytes();
 418         if size + self.lits.len() >= self.limit_size {
 419             return false;
 420         }
 421         let mut i = 1;
 422         while size + (i * self.lits.len()) <= self.limit_size
 423             && i < bytes.len()
 424         {
 425             i += 1;
 426         }
 427         for lit in &mut self.lits {
 428             if !lit.is_cut() {
 429                 lit.extend(&bytes[..i]);
 430                 if i < bytes.len() {
 431                     lit.cut();
 432                 }
 433             }
 434         }
 435         true
 436     }
 437
 438     /// Adds the given literal to this set.
 439     ///
 440     /// Returns false if adding this literal would cause the class to be too
 441     /// big.
 442     pub fn add(&mut self, lit: Literal) -> bool {
 443         if self.num_bytes() + lit.len() > self.limit_size {
 444             return false;
 445         }
 446         self.lits.push(lit);
 447         true
 448     }
 449
 450     /// Extends each literal in this set with the character class given.
 451     ///
 452     /// Returns false if the character class was too big to add.
 453     pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
 454         self._add_char_class(cls, false)
 455     }
 456
 457     /// Extends each literal in this set with the character class given,
 458     /// writing the bytes of each character in reverse.
 459     ///
 460     /// Returns false if the character class was too big to add.
 461     fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
 462         self._add_char_class(cls, true)
 463     }
 464
 465     fn _add_char_class(
 466         &mut self,
 467         cls: &hir::ClassUnicode,
 468         reverse: bool,
 469     ) -> bool {
 470         use std::char;
 471
 472         if self.class_exceeds_limits(cls_char_count(cls)) {
 473             return false;
 474         }
 475         let mut base = self.remove_complete();
 476         if base.is_empty() {
 477             base = vec![Literal::empty()];
 478         }
 479         for r in cls.iter() {
 480             let (s, e) = (r.start as u32, r.end as u32 + 1);
 481             for c in (s..e).filter_map(char::from_u32) {
 482                 for mut lit in base.clone() {
 483                     let mut bytes = c.to_string().into_bytes();
 484                     if reverse {
 485                         bytes.reverse();
 486                     }
 487                     lit.extend(&bytes);
 488                     self.lits.push(lit);
 489                 }
 490             }
 491         }
 492         true
 493     }
 494
 495     /// Extends each literal in this set with the byte class given.
 496     ///
 497     /// Returns false if the byte class was too big to add.
 498     pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
 499         if self.class_exceeds_limits(cls_byte_count(cls)) {
 500             return false;
 501         }
 502         let mut base = self.remove_complete();
 503         if base.is_empty() {
 504             base = vec![Literal::empty()];
 505         }
 506         for r in cls.iter() {
 507             let (s, e) = (r.start as u32, r.end as u32 + 1);
 508             for b in (s..e).map(|b| b as u8) {
 509                 for mut lit in base.clone() {
 510                     lit.push(b);
 511                     self.lits.push(lit);
 512                 }
 513             }
 514         }
 515         true
 516     }
 517
 518     /// Cuts every member of this set. When a member is cut, it can never
 519     /// be extended.
 520     pub fn cut(&mut self) {
 521         for lit in &mut self.lits {
 522             lit.cut();
 523         }
 524     }
 525
 526     /// Reverses all members in place.
 527     pub fn reverse(&mut self) {
 528         for lit in &mut self.lits {
 529             lit.reverse();
 530         }
 531     }
 532
 533     /// Clears this set of all members.
 534     pub fn clear(&mut self) {
 535         self.lits.clear();
 536     }
 537
 538     /// Pops all complete literals out of this set.
 539     fn remove_complete(&mut self) -> Vec<Literal> {
 540         let mut base = vec![];
 541         for lit in mem::replace(&mut self.lits, vec![]) {
 542             if lit.is_cut() {
 543                 self.lits.push(lit);
 544             } else {
 545                 base.push(lit);
 546             }
 547         }
 548         base
 549     }
 550
 551     /// Returns the total number of bytes in this set.
 552     fn num_bytes(&self) -> usize {
 553         self.lits.iter().fold(0, |accum, lit| accum + lit.len())
 554     }
 555
 556     /// Returns true if a character class with the given size would cause this
 557     /// set to exceed its limits.
 558     ///
 559     /// The size given should correspond to the number of items in the class.
 560     fn class_exceeds_limits(&self, size: usize) -> bool {
 561         if size > self.limit_class {
 562             return true;
 563         }
 564         // This is an approximation since codepoints in a char class can encode
 565         // to 1-4 bytes.
 566         let new_byte_count = if self.lits.is_empty() {
 567             size
 568         } else {
 569             self.lits.iter().fold(0, |accum, lit| {
 570                 accum
 571                     + if lit.is_cut() {
 572                         // If the literal is cut, then we'll never add
 573                         // anything to it, so don't count it.
 574                         0
 575                     } else {
 576                         (lit.len() + 1) * size
 577                     }
 578             })
 579         };
 580         new_byte_count > self.limit_size
 581     }
 582 }
 583
 584 fn prefixes(expr: &Hir, lits: &mut Literals) {
 585     match *expr.kind() {
 586         HirKind::Literal(hir::Literal::Unicode(c)) => {
 587             let mut buf = [0; 4];
 588             lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
 589         }
 590         HirKind::Literal(hir::Literal::Byte(b)) => {
 591             lits.cross_add(&[b]);
 592         }
 593         HirKind::Class(hir::Class::Unicode(ref cls)) => {
 594             if !lits.add_char_class(cls) {
 595                 lits.cut();
 596             }
 597         }
 598         HirKind::Class(hir::Class::Bytes(ref cls)) => {
 599             if !lits.add_byte_class(cls) {
 600                 lits.cut();
 601             }
 602         }
 603         HirKind::Group(hir::Group { ref hir, .. }) => {
 604             prefixes(&**hir, lits);
 605         }
 606         HirKind::Repetition(ref x) => match x.kind {
 607             hir::RepetitionKind::ZeroOrOne => {
 608                 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
 609             }
 610             hir::RepetitionKind::ZeroOrMore => {
 611                 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
 612             }
 613             hir::RepetitionKind::OneOrMore => {
 614                 repeat_one_or_more_literals(&x.hir, lits, prefixes);
 615             }
 616             hir::RepetitionKind::Range(ref rng) => {
 617                 let (min, max) = match *rng {
 618                     hir::RepetitionRange::Exactly(m) => (m, Some(m)),
 619                     hir::RepetitionRange::AtLeast(m) => (m, None),
 620                     hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
 621                 };
 622                 repeat_range_literals(
 623                     &x.hir, min, max, x.greedy, lits, prefixes,
 624                 )
 625             }
 626         },
 627         HirKind::Concat(ref es) if es.is_empty() => {}
 628         HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
 629         HirKind::Concat(ref es) => {
 630             for e in es {
 631                 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
 632                     if !lits.is_empty() {
 633                         lits.cut();
 634                         break;
 635                     }
 636                     lits.add(Literal::empty());
 637                     continue;
 638                 }
 639                 let mut lits2 = lits.to_empty();
 640                 prefixes(e, &mut lits2);
 641                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
 642                     // If this expression couldn't yield any literal that
 643                     // could be extended, then we need to quit. Since we're
 644                     // short-circuiting, we also need to freeze every member.
 645                     lits.cut();
 646                     break;
 647                 }
 648             }
 649         }
 650         HirKind::Alternation(ref es) => {
 651             alternate_literals(es, lits, prefixes);
 652         }
 653         _ => lits.cut(),
 654     }
 655 }
 656
 657 fn suffixes(expr: &Hir, lits: &mut Literals) {
 658     match *expr.kind() {
 659         HirKind::Literal(hir::Literal::Unicode(c)) => {
 660             let mut buf = [0u8; 4];
 661             let i = c.encode_utf8(&mut buf).len();
 662             let buf = &mut buf[..i];
 663             buf.reverse();
 664             lits.cross_add(buf);
 665         }
 666         HirKind::Literal(hir::Literal::Byte(b)) => {
 667             lits.cross_add(&[b]);
 668         }
 669         HirKind::Class(hir::Class::Unicode(ref cls)) => {
 670             if !lits.add_char_class_reverse(cls) {
 671                 lits.cut();
 672             }
 673         }
 674         HirKind::Class(hir::Class::Bytes(ref cls)) => {
 675             if !lits.add_byte_class(cls) {
 676                 lits.cut();
 677             }
 678         }
 679         HirKind::Group(hir::Group { ref hir, .. }) => {
 680             suffixes(&**hir, lits);
 681         }
 682         HirKind::Repetition(ref x) => match x.kind {
 683             hir::RepetitionKind::ZeroOrOne => {
 684                 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
 685             }
 686             hir::RepetitionKind::ZeroOrMore => {
 687                 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
 688             }
 689             hir::RepetitionKind::OneOrMore => {
 690                 repeat_one_or_more_literals(&x.hir, lits, suffixes);
 691             }
 692             hir::RepetitionKind::Range(ref rng) => {
 693                 let (min, max) = match *rng {
 694                     hir::RepetitionRange::Exactly(m) => (m, Some(m)),
 695                     hir::RepetitionRange::AtLeast(m) => (m, None),
 696                     hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
 697                 };
 698                 repeat_range_literals(
 699                     &x.hir, min, max, x.greedy, lits, suffixes,
 700                 )
 701             }
 702         },
 703         HirKind::Concat(ref es) if es.is_empty() => {}
 704         HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
 705         HirKind::Concat(ref es) => {
 706             for e in es.iter().rev() {
 707                 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
 708                     if !lits.is_empty() {
 709                         lits.cut();
 710                         break;
 711                     }
 712                     lits.add(Literal::empty());
 713                     continue;
 714                 }
 715                 let mut lits2 = lits.to_empty();
 716                 suffixes(e, &mut lits2);
 717                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
 718                     // If this expression couldn't yield any literal that
 719                     // could be extended, then we need to quit. Since we're
 720                     // short-circuiting, we also need to freeze every member.
 721                     lits.cut();
 722                     break;
 723                 }
 724             }
 725         }
 726         HirKind::Alternation(ref es) => {
 727             alternate_literals(es, lits, suffixes);
 728         }
 729         _ => lits.cut(),
 730     }
 731 }
 732
 733 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
 734     e: &Hir,
 735     lits: &mut Literals,
 736     mut f: F,
 737 ) {
 738     f(
 739         &Hir::repetition(hir::Repetition {
 740             kind: hir::RepetitionKind::ZeroOrMore,
 741             // FIXME: Our literal extraction doesn't care about greediness.
 742             // Which is partially why we're treating 'e?' as 'e*'. Namely,
 743             // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
 744             // [Complete(a), Complete(ab)] because of the non-greediness.
 745             greedy: true,
 746             hir: Box::new(e.clone()),
 747         }),
 748         lits,
 749     );
 750 }
 751
 752 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
 753     e: &Hir,
 754     lits: &mut Literals,
 755     mut f: F,
 756 ) {
 757     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
 758     lits3.set_limit_size(lits.limit_size() / 2);
 759     f(e, &mut lits3);
 760
 761     if lits3.is_empty() || !lits2.cross_product(&lits3) {
 762         lits.cut();
 763         return;
 764     }
 765     lits2.cut();
 766     lits2.add(Literal::empty());
 767     if !lits.union(lits2) {
 768         lits.cut();
 769     }
 770 }
 771
 772 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
 773     e: &Hir,
 774     lits: &mut Literals,
 775     mut f: F,
 776 ) {
 777     f(e, lits);
 778     lits.cut();
 779 }
 780
 781 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
 782     e: &Hir,
 783     min: u32,
 784     max: Option<u32>,
 785     greedy: bool,
 786     lits: &mut Literals,
 787     mut f: F,
 788 ) {
 789     if min == 0 {
 790         // This is a bit conservative. If `max` is set, then we could
 791         // treat this as a finite set of alternations. For now, we
 792         // just treat it as `e*`.
 793         f(
 794             &Hir::repetition(hir::Repetition {
 795                 kind: hir::RepetitionKind::ZeroOrMore,
 796                 greedy: greedy,
 797                 hir: Box::new(e.clone()),
 798             }),
 799             lits,
 800         );
 801     } else {
 802         if min > 0 {
 803             let n = cmp::min(lits.limit_size, min as usize);
 804             let es = iter::repeat(e.clone()).take(n).collect();
 805             f(&Hir::concat(es), lits);
 806             if n < min as usize || lits.contains_empty() {
 807                 lits.cut();
 808             }
 809         }
 810         if max.map_or(true, |max| min < max) {
 811             lits.cut();
 812         }
 813     }
 814 }
 815
 816 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
 817     es: &[Hir],
 818     lits: &mut Literals,
 819     mut f: F,
 820 ) {
 821     let mut lits2 = lits.to_empty();
 822     for e in es {
 823         let mut lits3 = lits.to_empty();
 824         lits3.set_limit_size(lits.limit_size() / 5);
 825         f(e, &mut lits3);
 826         if lits3.is_empty() || !lits2.union(lits3) {
 827             // If we couldn't find suffixes for *any* of the
 828             // alternates, then the entire alternation has to be thrown
 829             // away and any existing members must be frozen. Similarly,
 830             // if the union couldn't complete, stop and freeze.
 831             lits.cut();
 832             return;
 833         }
 834     }
 835     if !lits.cross_product(&lits2) {
 836         lits.cut();
 837     }
 838 }
 839
 840 impl fmt::Debug for Literals {
 841     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 842         f.debug_struct("Literals")
 843             .field("lits", &self.lits)
 844             .field("limit_size", &self.limit_size)
 845             .field("limit_class", &self.limit_class)
 846             .finish()
 847     }
 848 }
 849
 850 impl Literal {
 851     /// Returns a new complete literal with the bytes given.
 852     pub fn new(bytes: Vec<u8>) -> Literal {
 853         Literal { v: bytes, cut: false }
 854     }
 855
 856     /// Returns a new complete empty literal.
 857     pub fn empty() -> Literal {
 858         Literal { v: vec![], cut: false }
 859     }
 860
 861     /// Returns true if this literal was "cut."
 862     pub fn is_cut(&self) -> bool {
 863         self.cut
 864     }
 865
 866     /// Cuts this literal.
 867     pub fn cut(&mut self) {
 868         self.cut = true;
 869     }
 870 }
 871
 872 impl PartialEq for Literal {
 873     fn eq(&self, other: &Literal) -> bool {
 874         self.v == other.v
 875     }
 876 }
 877
 878 impl PartialOrd for Literal {
 879     fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
 880         self.v.partial_cmp(&other.v)
 881     }
 882 }
 883
 884 impl fmt::Debug for Literal {
 885     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 886         if self.is_cut() {
 887             write!(f, "Cut({})", escape_unicode(&self.v))
 888         } else {
 889             write!(f, "Complete({})", escape_unicode(&self.v))
 890         }
 891     }
 892 }
 893
 894 impl AsRef<[u8]> for Literal {
 895     fn as_ref(&self) -> &[u8] {
 896         &self.v
 897     }
 898 }
 899
 900 impl ops::Deref for Literal {
 901     type Target = Vec<u8>;
 902     fn deref(&self) -> &Vec<u8> {
 903         &self.v
 904     }
 905 }
 906
 907 impl ops::DerefMut for Literal {
 908     fn deref_mut(&mut self) -> &mut Vec<u8> {
 909         &mut self.v
 910     }
 911 }
 912
 913 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
 914     let mut i = 0;
 915     while haystack.len() >= needle.len() {
 916         if needle == &haystack[..needle.len()] {
 917             return Some(i);
 918         }
 919         i += 1;
 920         haystack = &haystack[1..];
 921     }
 922     None
 923 }
 924
 925 fn escape_unicode(bytes: &[u8]) -> String {
 926     let show = match ::std::str::from_utf8(bytes) {
 927         Ok(v) => v.to_string(),
 928         Err(_) => escape_bytes(bytes),
 929     };
 930     let mut space_escaped = String::new();
 931     for c in show.chars() {
 932         if c.is_whitespace() {
 933             let escaped = if c as u32 <= 0x7F {
 934                 escape_byte(c as u8)
 935             } else {
 936                 if c as u32 <= 0xFFFF {
 937                     format!(r"\u{{{:04x}}}", c as u32)
 938                 } else {
 939                     format!(r"\U{{{:08x}}}", c as u32)
 940                 }
 941             };
 942             space_escaped.push_str(&escaped);
 943         } else {
 944             space_escaped.push(c);
 945         }
 946     }
 947     space_escaped
 948 }
 949
 950 fn escape_bytes(bytes: &[u8]) -> String {
 951     let mut s = String::new();
 952     for &b in bytes {
 953         s.push_str(&escape_byte(b));
 954     }
 955     s
 956 }
 957
 958 fn escape_byte(byte: u8) -> String {
 959     use std::ascii::escape_default;
 960
 961     let escaped: Vec<u8> = escape_default(byte).collect();
 962     String::from_utf8_lossy(&escaped).into_owned()
 963 }
 964
 965 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
 966     cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
 967         as usize
 968 }
 969
 970 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
 971     cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
 972         as usize
 973 }
 974
 975 #[cfg(test)]
 976 mod tests {
 977     use std::fmt;
 978
 979     use super::{escape_bytes, Literal, Literals};
 980     use crate::hir::Hir;
 981     use crate::ParserBuilder;
 982
 983     // To make test failures easier to read.
 984     #[derive(Debug, Eq, PartialEq)]
 985     struct Bytes(Vec<ULiteral>);
 986     #[derive(Debug, Eq, PartialEq)]
 987     struct Unicode(Vec<ULiteral>);
 988
 989     fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
 990         let mut ulits = vec![];
 991         for blit in blits {
 992             ulits
 993                 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
 994         }
 995         ulits
 996     }
 997
 998     fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
 999         Literals {
1000             lits: it.into_iter().collect(),
1001             limit_size: 0,
1002             limit_class: 0,
1003         }
1004     }
1005
1006     // Needs to be pub for 1.3?
1007     #[derive(Clone, Eq, PartialEq)]
1008     pub struct ULiteral {
1009         v: String,
1010         cut: bool,
1011     }
1012
1013     impl ULiteral {
1014         fn is_cut(&self) -> bool {
1015             self.cut
1016         }
1017     }
1018
1019     impl fmt::Debug for ULiteral {
1020         fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1021             if self.is_cut() {
1022                 write!(f, "Cut({})", self.v)
1023             } else {
1024                 write!(f, "Complete({})", self.v)
1025             }
1026         }
1027     }
1028
1029     impl PartialEq<Literal> for ULiteral {
1030         fn eq(&self, other: &Literal) -> bool {
1031             self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1032         }
1033     }
1034
1035     impl PartialEq<ULiteral> for Literal {
1036         fn eq(&self, other: &ULiteral) -> bool {
1037             &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1038         }
1039     }
1040
1041     #[allow(non_snake_case)]
1042     fn C(s: &'static str) -> ULiteral {
1043         ULiteral { v: s.to_owned(), cut: true }
1044     }
1045     #[allow(non_snake_case)]
1046     fn M(s: &'static str) -> ULiteral {
1047         ULiteral { v: s.to_owned(), cut: false }
1048     }
1049
1050     fn prefixes(lits: &mut Literals, expr: &Hir) {
1051         lits.union_prefixes(expr);
1052     }
1053
1054     fn suffixes(lits: &mut Literals, expr: &Hir) {
1055         lits.union_suffixes(expr);
1056     }
1057
1058     macro_rules! assert_lit_eq {
1059         ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1060             let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1061             let lits = $got_lits;
1062             assert_eq!(
1063                 $which(expected.clone()),
1064                 $which(escape_lits(lits.literals())));
1065             assert_eq!(
1066                 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1067                 lits.all_complete());
1068             assert_eq!(
1069                 expected.iter().any(|l| !l.is_cut()),
1070                 lits.any_complete());
1071         }};
1072     }
1073
1074     macro_rules! test_lit {
1075         ($name:ident, $which:ident, $re:expr) => {
1076             test_lit!($name, $which, $re,);
1077         };
1078         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1079             #[test]
1080             fn $name() {
1081                 let expr = ParserBuilder::new()
1082                     .build()
1083                     .parse($re)
1084                     .unwrap();
1085                 let lits = Literals::$which(&expr);
1086                 assert_lit_eq!(Unicode, lits, $($lit),*);
1087
1088                 let expr = ParserBuilder::new()
1089                     .allow_invalid_utf8(true)
1090                     .unicode(false)
1091                     .build()
1092                     .parse($re)
1093                     .unwrap();
1094                 let lits = Literals::$which(&expr);
1095                 assert_lit_eq!(Bytes, lits, $($lit),*);
1096             }
1097         };
1098     }
1099
1100     // ************************************************************************
1101     // Tests for prefix literal extraction.
1102     // ************************************************************************
1103
1104     // Elementary tests.
1105     test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1106     test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1107     test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1108     #[cfg(feature = "unicode-case")]
1109     test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1110     test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1111     test_lit!(
1112         pfx_class2,
1113         prefixes,
1114         "(?u)[☃Ⅰ]",
1115         M("\\xe2\\x85\\xa0"),
1116         M("\\xe2\\x98\\x83")
1117     );
1118     #[cfg(feature = "unicode-case")]
1119     test_lit!(
1120         pfx_class3,
1121         prefixes,
1122         "(?ui)[☃Ⅰ]",
1123         M("\\xe2\\x85\\xa0"),
1124         M("\\xe2\\x85\\xb0"),
1125         M("\\xe2\\x98\\x83")
1126     );
1127     test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1128     test_lit!(
1129         pfx_one_lit_casei2,
1130         prefixes,
1131         "(?i-u)abc",
1132         M("ABC"),
1133         M("aBC"),
1134         M("AbC"),
1135         M("abC"),
1136         M("ABc"),
1137         M("aBc"),
1138         M("Abc"),
1139         M("abc")
1140     );
1141     test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1142     test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1143     test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1144     test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1145     // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1146     // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1147     // a cut literal.
1148     test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1149     test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1150     test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1151     test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1152     test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1153     test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1154     test_lit!(pfx_rep_range1, prefixes, "a{0}");
1155     test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1156     test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1157     test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1158     test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1159     test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1160     test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1161
1162     // Test regexes with concatenations.
1163     test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1164     test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1165     test_lit!(
1166         pfx_cat3,
1167         prefixes,
1168         "(?i-u)[ab]z",
1169         M("AZ"),
1170         M("BZ"),
1171         M("aZ"),
1172         M("bZ"),
1173         M("Az"),
1174         M("Bz"),
1175         M("az"),
1176         M("bz")
1177     );
1178     test_lit!(
1179         pfx_cat4,
1180         prefixes,
1181         "[ab][yz]",
1182         M("ay"),
1183         M("by"),
1184         M("az"),
1185         M("bz")
1186     );
1187     test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1188     test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1189     test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1190     test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1191     test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1192     test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1193     test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1194     test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1195     test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1196     test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1197     test_lit!(pfx_cat15, prefixes, "$a");
1198     test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1199     test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1200     test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1201     test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1202
1203     // Test regexes with alternations.
1204     test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1205     test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1206     test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1207     test_lit!(pfx_alt4, prefixes, "a|b*");
1208     test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1209     test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1210     test_lit!(
1211         pfx_alt7,
1212         prefixes,
1213         "(a|b)*c|(a|ab)*c",
1214         C("a"),
1215         C("b"),
1216         M("c"),
1217         C("a"),
1218         C("ab"),
1219         M("c")
1220     );
1221     test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1222
1223     // Test regexes with empty assertions.
1224     test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1225     test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1226     test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1227     test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1228
1229     // Make sure some curious regexes have no prefixes.
1230     test_lit!(pfx_nothing1, prefixes, ".");
1231     test_lit!(pfx_nothing2, prefixes, "(?s).");
1232     test_lit!(pfx_nothing3, prefixes, "^");
1233     test_lit!(pfx_nothing4, prefixes, "$");
1234     test_lit!(pfx_nothing6, prefixes, "(?m)$");
1235     test_lit!(pfx_nothing7, prefixes, r"\b");
1236     test_lit!(pfx_nothing8, prefixes, r"\B");
1237
1238     // Test a few regexes that defeat any prefix literal detection.
1239     test_lit!(pfx_defeated1, prefixes, ".a");
1240     test_lit!(pfx_defeated2, prefixes, "(?s).a");
1241     test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1242     test_lit!(pfx_defeated4, prefixes, "a|.");
1243     test_lit!(pfx_defeated5, prefixes, ".|a");
1244     test_lit!(pfx_defeated6, prefixes, "a|^");
1245     test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1246     test_lit!(pfx_defeated8, prefixes, "$a");
1247     test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1248     test_lit!(pfx_defeated10, prefixes, r"\ba");
1249     test_lit!(pfx_defeated11, prefixes, r"\Ba");
1250     test_lit!(pfx_defeated12, prefixes, "^*a");
1251     test_lit!(pfx_defeated13, prefixes, "^+a");
1252
1253     test_lit!(
1254         pfx_crazy1,
1255         prefixes,
1256         r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1257         C("Mo\\'"),
1258         C("Mu\\'"),
1259         C("Moam"),
1260         C("Muam")
1261     );
1262
1263     // ************************************************************************
1264     // Tests for quiting prefix literal search.
1265     // ************************************************************************
1266
1267     macro_rules! test_exhausted {
1268         ($name:ident, $which:ident, $re:expr) => {
1269             test_exhausted!($name, $which, $re,);
1270         };
1271         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1272             #[test]
1273             fn $name() {
1274                 let expr = ParserBuilder::new()
1275                     .build()
1276                     .parse($re)
1277                     .unwrap();
1278                 let mut lits = Literals::empty();
1279                 lits.set_limit_size(20).set_limit_class(10);
1280                 $which(&mut lits, &expr);
1281                 assert_lit_eq!(Unicode, lits, $($lit),*);
1282
1283                 let expr = ParserBuilder::new()
1284                     .allow_invalid_utf8(true)
1285                     .unicode(false)
1286                     .build()
1287                     .parse($re)
1288                     .unwrap();
1289                 let mut lits = Literals::empty();
1290                 lits.set_limit_size(20).set_limit_class(10);
1291                 $which(&mut lits, &expr);
1292                 assert_lit_eq!(Bytes, lits, $($lit),*);
1293             }
1294         };
1295     }
1296
1297     // These test use a much lower limit than the default so that we can
1298     // write test cases of reasonable size.
1299     test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1300     test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1301     test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1302     test_exhausted!(
1303         pfx_exhausted4,
1304         prefixes,
1305         "(?i-u)foobar",
1306         C("FO"),
1307         C("fO"),
1308         C("Fo"),
1309         C("fo")
1310     );
1311     test_exhausted!(
1312         pfx_exhausted5,
1313         prefixes,
1314         "(?:ab){100}",
1315         C("abababababababababab")
1316     );
1317     test_exhausted!(
1318         pfx_exhausted6,
1319         prefixes,
1320         "(?:(?:ab){100})*cd",
1321         C("ababababab"),
1322         M("cd")
1323     );
1324     test_exhausted!(
1325         pfx_exhausted7,
1326         prefixes,
1327         "z(?:(?:ab){100})*cd",
1328         C("zababababab"),
1329         M("zcd")
1330     );
1331     test_exhausted!(
1332         pfx_exhausted8,
1333         prefixes,
1334         "aaaaaaaaaaaaaaaaaaaaz",
1335         C("aaaaaaaaaaaaaaaaaaaa")
1336     );
1337
1338     // ************************************************************************
1339     // Tests for suffix literal extraction.
1340     // ************************************************************************
1341
1342     // Elementary tests.
1343     test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1344     test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1345     test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1346     #[cfg(feature = "unicode-case")]
1347     test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1348     test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1349     test_lit!(
1350         sfx_class2,
1351         suffixes,
1352         "(?u)[☃Ⅰ]",
1353         M("\\xe2\\x85\\xa0"),
1354         M("\\xe2\\x98\\x83")
1355     );
1356     #[cfg(feature = "unicode-case")]
1357     test_lit!(
1358         sfx_class3,
1359         suffixes,
1360         "(?ui)[☃Ⅰ]",
1361         M("\\xe2\\x85\\xa0"),
1362         M("\\xe2\\x85\\xb0"),
1363         M("\\xe2\\x98\\x83")
1364     );
1365     test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1366     test_lit!(
1367         sfx_one_lit_casei2,
1368         suffixes,
1369         "(?i-u)abc",
1370         M("ABC"),
1371         M("ABc"),
1372         M("AbC"),
1373         M("Abc"),
1374         M("aBC"),
1375         M("aBc"),
1376         M("abC"),
1377         M("abc")
1378     );
1379     test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1380     test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1381     test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1382     test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1383     test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1384     test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1385     test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1386     test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1387     test_lit!(sfx_rep_range1, suffixes, "a{0}");
1388     test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1389     test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1390     test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1391     test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1392     test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1393     test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1394
1395     // Test regexes with concatenations.
1396     test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1397     test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1398     test_lit!(
1399         sfx_cat3,
1400         suffixes,
1401         "(?i-u)[ab]z",
1402         M("AZ"),
1403         M("Az"),
1404         M("BZ"),
1405         M("Bz"),
1406         M("aZ"),
1407         M("az"),
1408         M("bZ"),
1409         M("bz")
1410     );
1411     test_lit!(
1412         sfx_cat4,
1413         suffixes,
1414         "[ab][yz]",
1415         M("ay"),
1416         M("az"),
1417         M("by"),
1418         M("bz")
1419     );
1420     test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1421     test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1422     test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1423     test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1424     test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1425     test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1426     test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1427     test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1428     test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1429     test_lit!(sfx_cat14, suffixes, "a^");
1430     test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1431     test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1432     test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1433     test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1434     test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1435
1436     // Test regexes with alternations.
1437     test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1438     test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1439     test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1440     test_lit!(sfx_alt4, suffixes, "a|b*");
1441     test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1442     test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1443     test_lit!(
1444         sfx_alt7,
1445         suffixes,
1446         "(a|b)*c|(a|ab)*c",
1447         C("ac"),
1448         C("bc"),
1449         M("c"),
1450         C("ac"),
1451         C("abc"),
1452         M("c")
1453     );
1454     test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1455
1456     // Test regexes with empty assertions.
1457     test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1458     test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1459
1460     // Make sure some curious regexes have no suffixes.
1461     test_lit!(sfx_nothing1, suffixes, ".");
1462     test_lit!(sfx_nothing2, suffixes, "(?s).");
1463     test_lit!(sfx_nothing3, suffixes, "^");
1464     test_lit!(sfx_nothing4, suffixes, "$");
1465     test_lit!(sfx_nothing6, suffixes, "(?m)$");
1466     test_lit!(sfx_nothing7, suffixes, r"\b");
1467     test_lit!(sfx_nothing8, suffixes, r"\B");
1468
1469     // Test a few regexes that defeat any suffix literal detection.
1470     test_lit!(sfx_defeated1, suffixes, "a.");
1471     test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1472     test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1473     test_lit!(sfx_defeated4, suffixes, "a|.");
1474     test_lit!(sfx_defeated5, suffixes, ".|a");
1475     test_lit!(sfx_defeated6, suffixes, "a|^");
1476     test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1477     test_lit!(sfx_defeated8, suffixes, "a^");
1478     test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1479     test_lit!(sfx_defeated10, suffixes, r"a\b");
1480     test_lit!(sfx_defeated11, suffixes, r"a\B");
1481     test_lit!(sfx_defeated12, suffixes, "a^*");
1482     test_lit!(sfx_defeated13, suffixes, "a^+");
1483
1484     // These test use a much lower limit than the default so that we can
1485     // write test cases of reasonable size.
1486     test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1487     test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1488     test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1489     test_exhausted!(
1490         sfx_exhausted4,
1491         suffixes,
1492         "(?i-u)foobar",
1493         C("AR"),
1494         C("Ar"),
1495         C("aR"),
1496         C("ar")
1497     );
1498     test_exhausted!(
1499         sfx_exhausted5,
1500         suffixes,
1501         "(?:ab){100}",
1502         C("abababababababababab")
1503     );
1504     test_exhausted!(
1505         sfx_exhausted6,
1506         suffixes,
1507         "cd(?:(?:ab){100})*",
1508         C("ababababab"),
1509         M("cd")
1510     );
1511     test_exhausted!(
1512         sfx_exhausted7,
1513         suffixes,
1514         "cd(?:(?:ab){100})*z",
1515         C("abababababz"),
1516         M("cdz")
1517     );
1518     test_exhausted!(
1519         sfx_exhausted8,
1520         suffixes,
1521         "zaaaaaaaaaaaaaaaaaaaa",
1522         C("aaaaaaaaaaaaaaaaaaaa")
1523     );
1524
1525     // ************************************************************************
1526     // Tests for generating unambiguous literal sets.
1527     // ************************************************************************
1528
1529     macro_rules! test_unamb {
1530         ($name:ident, $given:expr, $expected:expr) => {
1531             #[test]
1532             fn $name() {
1533                 let given: Vec<Literal> = $given
1534                     .into_iter()
1535                     .map(|ul| {
1536                         let cut = ul.is_cut();
1537                         Literal { v: ul.v.into_bytes(), cut: cut }
1538                     })
1539                     .collect();
1540                 let lits = create_lits(given);
1541                 let got = lits.unambiguous_prefixes();
1542                 assert_eq!($expected, escape_lits(got.literals()));
1543             }
1544         };
1545     }
1546
1547     test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1548     test_unamb!(
1549         unambiguous2,
1550         vec![M("zaaaaaa"), M("aa")],
1551         vec![C("aa"), C("z")]
1552     );
1553     test_unamb!(
1554         unambiguous3,
1555         vec![M("Sherlock"), M("Watson")],
1556         vec![M("Sherlock"), M("Watson")]
1557     );
1558     test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1559     test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1560     test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1561     test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1562     test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1563     test_unamb!(
1564         unambiguous9,
1565         vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1566         vec![C("a"), C("b"), C("c")]
1567     );
1568     test_unamb!(
1569         unambiguous10,
1570         vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1571         vec![C("Mo"), C("Mu")]
1572     );
1573     test_unamb!(
1574         unambiguous11,
1575         vec![M("zazb"), M("azb")],
1576         vec![C("a"), C("z")]
1577     );
1578     test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1579     test_unamb!(
1580         unambiguous13,
1581         vec![M("ABCX"), M("CDAX"), M("BCX")],
1582         vec![C("A"), C("BCX"), C("CD")]
1583     );
1584     test_unamb!(
1585         unambiguous14,
1586         vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1587         vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1588     );
1589     test_unamb!(
1590         unambiguous15,
1591         vec![M("IMG_"), M("MG_"), M("CIMG")],
1592         vec![C("C"), C("I"), C("MG_")]
1593     );
1594
1595     // ************************************************************************
1596     // Tests for suffix trimming.
1597     // ************************************************************************
1598     macro_rules! test_trim {
1599         ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1600             #[test]
1601             fn $name() {
1602                 let given: Vec<Literal> = $given
1603                     .into_iter()
1604                     .map(|ul| {
1605                         let cut = ul.is_cut();
1606                         Literal { v: ul.v.into_bytes(), cut: cut }
1607                     })
1608                     .collect();
1609                 let lits = create_lits(given);
1610                 let got = lits.trim_suffix($trim).unwrap();
1611                 assert_eq!($expected, escape_lits(got.literals()));
1612             }
1613         };
1614     }
1615
1616     test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1617     test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1618     test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1619     test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1620
1621     // ************************************************************************
1622     // Tests for longest common prefix.
1623     // ************************************************************************
1624
1625     macro_rules! test_lcp {
1626         ($name:ident, $given:expr, $expected:expr) => {
1627             #[test]
1628             fn $name() {
1629                 let given: Vec<Literal> = $given
1630                     .into_iter()
1631                     .map(|s: &str| Literal {
1632                         v: s.to_owned().into_bytes(),
1633                         cut: false,
1634                     })
1635                     .collect();
1636                 let lits = create_lits(given);
1637                 let got = lits.longest_common_prefix();
1638                 assert_eq!($expected, escape_bytes(got));
1639             }
1640         };
1641     }
1642
1643     test_lcp!(lcp1, vec!["a"], "a");
1644     test_lcp!(lcp2, vec![], "");
1645     test_lcp!(lcp3, vec!["a", "b"], "");
1646     test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1647     test_lcp!(lcp5, vec!["ab", "a"], "a");
1648     test_lcp!(lcp6, vec!["a", "ab"], "a");
1649     test_lcp!(lcp7, vec!["ab", "b"], "");
1650     test_lcp!(lcp8, vec!["b", "ab"], "");
1651     test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1652     test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1653     test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1654     test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1655
1656     // ************************************************************************
1657     // Tests for longest common suffix.
1658     // ************************************************************************
1659
1660     macro_rules! test_lcs {
1661         ($name:ident, $given:expr, $expected:expr) => {
1662             #[test]
1663             fn $name() {
1664                 let given: Vec<Literal> = $given
1665                     .into_iter()
1666                     .map(|s: &str| Literal {
1667                         v: s.to_owned().into_bytes(),
1668                         cut: false,
1669                     })
1670                     .collect();
1671                 let lits = create_lits(given);
1672                 let got = lits.longest_common_suffix();
1673                 assert_eq!($expected, escape_bytes(got));
1674             }
1675         };
1676     }
1677
1678     test_lcs!(lcs1, vec!["a"], "a");
1679     test_lcs!(lcs2, vec![], "");
1680     test_lcs!(lcs3, vec!["a", "b"], "");
1681     test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1682     test_lcs!(lcs5, vec!["ab", "a"], "");
1683     test_lcs!(lcs6, vec!["a", "ab"], "");
1684     test_lcs!(lcs7, vec!["ab", "b"], "b");
1685     test_lcs!(lcs8, vec!["b", "ab"], "b");
1686     test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1687     test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1688     test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1689     test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1690 }