]> git.proxmox.com Git - rustc.git/blame - src/vendor/regex-syntax/src/literals.rs
New upstream version 1.17.0+dfsg1
[rustc.git] / src / vendor / regex-syntax / src / literals.rs
CommitLineData
8bb4bdeb
XL
1// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use std::cmp;
12use std::fmt;
13use std::iter;
14use std::mem;
15use std::ops;
16
17use {Expr, CharClass, ClassRange, ByteClass, ByteRange, Repeater};
18
19/// A set of literal byte strings extracted from a regular expression.
20///
21/// Every member of the set is a `Lit`, which is represented by a `Vec<u8>`.
22/// (Notably, it may contain invalid UTF-8.) Every member is said to be either
23/// *complete* or *cut*. A complete literal means that it extends until the
24/// beginning (or end) of the regular expression. In some circumstances, this
25/// can be used to indicate a match in the regular expression.
26///
27/// Note that a key aspect of literal extraction is knowing when to stop. It is
28/// not feasible to blindly extract all literals from a regular expression,
29/// even if there are finitely many. For example, the regular expression
30/// `[0-9]{10}` has `10^10` distinct literals. For this reason, literal
31/// extraction is bounded to some low number by default using heuristics, but
32/// the limits can be tweaked.
33#[derive(Clone, Eq, PartialEq)]
34pub struct Literals {
35 lits: Vec<Lit>,
36 limit_size: usize,
37 limit_class: usize,
38}
39
40/// A single member of a set of literals extracted from a regular expression.
41///
42/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
43/// and `Vec` operations are available.
44#[derive(Clone, Eq, Ord)]
45pub struct Lit {
46 v: Vec<u8>,
47 cut: bool,
48}
49
50impl Literals {
51 /// Returns a new empty set of literals using default limits.
52 pub fn empty() -> Literals {
53 Literals {
54 lits: vec![],
55 limit_size: 250,
56 limit_class: 10,
57 }
58 }
59
60 /// Get the approximate size limit (in bytes) of this set.
61 pub fn limit_size(&self) -> usize {
62 self.limit_size
63 }
64
65 /// Set the approximate size limit (in bytes) of this set.
66 ///
67 /// If extracting a literal would put the set over this limit, then
68 /// extraction stops.
69 ///
70 /// The new limits will only apply to additions to this set. Existing
71 /// members remain unchanged, even if the set exceeds the new limit.
72 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
73 self.limit_size = size;
74 self
75 }
76
77 /// Get the character class size limit for this set.
78 pub fn limit_class(&self) -> usize {
79 self.limit_class
80 }
81
82 /// Limits the size of character(or byte) classes considered.
83 ///
84 /// A value of `0` prevents all character classes from being considered.
85 ///
86 /// This limit also applies to case insensitive literals, since each
87 /// character in the case insensitive literal is converted to a class, and
88 /// then case folded.
89 ///
90 /// The new limits will only apply to additions to this set. Existing
91 /// members remain unchanged, even if the set exceeds the new limit.
92 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
93 self.limit_class = size;
94 self
95 }
96
97 /// Returns the set of literals as a slice. Its order is unspecified.
98 pub fn literals(&self) -> &[Lit] {
99 &self.lits
100 }
101
102 /// Returns the length of the smallest literal.
103 ///
104 /// Returns None is there are no literals in the set.
105 pub fn min_len(&self) -> Option<usize> {
106 let mut min = None;
107 for lit in &self.lits {
108 match min {
109 None => min = Some(lit.len()),
110 Some(m) if lit.len() < m => min = Some(lit.len()),
111 _ => {}
112 }
113 }
114 min
115 }
116
117 /// Returns true if all members in this set are complete.
118 pub fn all_complete(&self) -> bool {
119 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
120 }
121
122 /// Returns true if any member in this set is complete.
123 pub fn any_complete(&self) -> bool {
124 self.lits.iter().any(|lit| !lit.is_cut())
125 }
126
127 /// Returns true if this set contains an empty literal.
128 pub fn contains_empty(&self) -> bool {
129 self.lits.iter().any(|lit| lit.is_empty())
130 }
131
132 /// Returns true if this set is empty or if all of its members is empty.
133 pub fn is_empty(&self) -> bool {
134 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
135 }
136
137 /// Returns a new empty set of literals using this set's limits.
138 pub fn to_empty(&self) -> Literals {
139 let mut lits = Literals::empty();
140 lits.set_limit_size(self.limit_size)
141 .set_limit_class(self.limit_class);
142 lits
143 }
144
145 /// Returns the longest common prefix of all members in this set.
146 pub fn longest_common_prefix(&self) -> &[u8] {
147 if self.is_empty() {
148 return &[];
149 }
150 let lit0 = &*self.lits[0];
151 let mut len = lit0.len();
152 for lit in &self.lits[1..] {
153 len = cmp::min(
154 len,
155 lit.iter()
156 .zip(lit0)
157 .take_while(|&(a, b)| a == b)
158 .count());
159 }
160 &self.lits[0][..len]
161 }
162
163 /// Returns the longest common suffix of all members in this set.
164 pub fn longest_common_suffix(&self) -> &[u8] {
165 if self.is_empty() {
166 return &[];
167 }
168 let lit0 = &*self.lits[0];
169 let mut len = lit0.len();
170 for lit in &self.lits[1..] {
171 len = cmp::min(
172 len,
173 lit.iter()
174 .rev()
175 .zip(lit0.iter().rev())
176 .take_while(|&(a, b)| a == b)
177 .count());
178 }
179 &self.lits[0][self.lits[0].len() - len..]
180 }
181
182 /// Returns a new set of literals with the given number of bytes trimmed
183 /// from the suffix of each literal.
184 ///
185 /// If any literal would be cut out completely by trimming, then None is
186 /// returned.
187 ///
188 /// Any duplicates that are created as a result of this transformation are
189 /// removed.
190 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
191 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
192 return None;
193 }
194 let mut new = self.to_empty();
195 for mut lit in self.lits.iter().cloned() {
196 let new_len = lit.len() - num_bytes;
197 lit.truncate(new_len);
198 lit.cut();
199 new.lits.push(lit);
200 }
201 new.lits.sort();
202 new.lits.dedup();
203 Some(new)
204 }
205
206 /// Returns a new set of prefixes of this set of literals that are
207 /// guaranteed to be unambiguous.
208 ///
209 /// Any substring match with a member of the set is returned is guaranteed
210 /// to never overlap with a substring match of another member of the set
211 /// at the same starting position.
212 ///
213 /// Given any two members of the returned set, neither is a substring of
214 /// the other.
215 pub fn unambiguous_prefixes(&self) -> Literals {
216 if self.lits.is_empty() {
217 return self.to_empty();
218 }
219 let mut old: Vec<Lit> = self.lits.iter().cloned().collect();
220 let mut new = self.to_empty();
221 'OUTER:
222 while let Some(mut candidate) = old.pop() {
223 if candidate.is_empty() {
224 continue;
225 }
226 if new.lits.is_empty() {
227 new.lits.push(candidate);
228 continue;
229 }
230 for lit2 in &mut new.lits {
231 if lit2.is_empty() {
232 continue;
233 }
234 if &candidate == lit2 {
235 // If the literal is already in the set, then we can
236 // just drop it. But make sure that cut literals are
237 // infectious!
238 candidate.cut = candidate.cut || lit2.cut;
239 lit2.cut = candidate.cut;
240 continue 'OUTER;
241 }
242 if candidate.len() < lit2.len() {
243 if let Some(i) = position(&candidate, &lit2) {
244 candidate.cut();
245 let mut lit3 = lit2.clone();
246 lit3.truncate(i);
247 lit3.cut();
248 old.push(lit3);
249 lit2.clear();
250 }
251 } else {
252 if let Some(i) = position(&lit2, &candidate) {
253 lit2.cut();
254 let mut new_candidate = candidate.clone();
255 new_candidate.truncate(i);
256 new_candidate.cut();
257 old.push(new_candidate);
258 candidate.clear();
259 }
260 }
261 // Oops, the candidate is already represented in the set.
262 if candidate.is_empty() {
263 continue 'OUTER;
264 }
265 }
266 new.lits.push(candidate);
267 }
268 new.lits.retain(|lit| !lit.is_empty());
269 new.lits.sort();
270 new.lits.dedup();
271 new
272 }
273
274 /// Returns a new set of suffixes of this set of literals that are
275 /// guaranteed to be unambiguous.
276 ///
277 /// Any substring match with a member of the set is returned is guaranteed
278 /// to never overlap with a substring match of another member of the set
279 /// at the same ending position.
280 ///
281 /// Given any two members of the returned set, neither is a substring of
282 /// the other.
283 pub fn unambiguous_suffixes(&self) -> Literals {
284 // This is a touch wasteful...
285 let mut lits = self.clone();
286 lits.reverse();
287 let mut unamb = lits.unambiguous_prefixes();
288 unamb.reverse();
289 unamb
290 }
291
292 /// Unions the prefixes from the given expression to this set.
293 ///
294 /// If prefixes could not be added (for example, this set would exceed its
295 /// size limits or the set of prefixes from `expr` includes the empty
296 /// string), then false is returned.
297 ///
298 /// Note that prefix literals extracted from `expr` are said to be complete
299 /// if and only if the literal extends from the beginning of `expr` to the
300 /// end of `expr`.
301 pub fn union_prefixes(&mut self, expr: &Expr) -> bool {
302 let mut lits = self.to_empty();
303 prefixes(expr, &mut lits);
304 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
305 }
306
307 /// Unions the suffixes from the given expression to this set.
308 ///
309 /// If suffixes could not be added (for example, this set would exceed its
310 /// size limits or the set of suffixes from `expr` includes the empty
311 /// string), then false is returned.
312 ///
313 /// Note that prefix literals extracted from `expr` are said to be complete
314 /// if and only if the literal extends from the end of `expr` to the
315 /// beginning of `expr`.
316 pub fn union_suffixes(&mut self, expr: &Expr) -> bool {
317 let mut lits = self.to_empty();
318 suffixes(expr, &mut lits);
319 lits.reverse();
320 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
321 }
322
323 /// Unions this set with another set.
324 ///
325 /// If the union would cause the set to exceed its limits, then the union
326 /// is skipped and it returns false. Otherwise, if the union succeeds, it
327 /// returns true.
328 pub fn union(&mut self, lits: Literals) -> bool {
329 if self.num_bytes() + lits.num_bytes() > self.limit_size {
330 return false;
331 }
332 if lits.is_empty() {
333 self.lits.push(Lit::empty());
334 } else {
335 self.lits.extend(lits.lits);
336 }
337 true
338 }
339
340 /// Extends this set with another set.
341 ///
342 /// The set of literals is extended via a cross product.
343 ///
344 /// If a cross product would cause this set to exceed its limits, then the
345 /// cross product is skipped and it returns false. Otherwise, if the cross
346 /// product succeeds, it returns true.
347 pub fn cross_product(&mut self, lits: &Literals) -> bool {
348 if lits.is_empty() {
349 return true;
350 }
351 // Check that we make sure we stay in our limits.
352 let mut size_after;
353 if self.is_empty() || !self.any_complete() {
354 size_after = self.num_bytes();
355 for lits_lit in lits.literals() {
356 size_after += lits_lit.len();
357 }
358 } else {
359 size_after = self.lits.iter().fold(0, |accum, lit| {
360 accum + if lit.is_cut() { lit.len() } else { 0 }
361 });
362 for lits_lit in lits.literals() {
363 for self_lit in self.literals() {
364 if !self_lit.is_cut() {
365 size_after += self_lit.len() + lits_lit.len();
366 }
367 }
368 }
369 }
370 if size_after > self.limit_size {
371 return false;
372 }
373
374 let mut base = self.remove_complete();
375 if base.is_empty() {
376 base = vec![Lit::empty()];
377 }
378 for lits_lit in lits.literals() {
379 for mut self_lit in base.clone() {
380 self_lit.extend(&**lits_lit);
381 self_lit.cut = lits_lit.cut;
382 self.lits.push(self_lit);
383 }
384 }
385 true
386 }
387
388 /// Extends each literal in this set with the bytes given.
389 ///
390 /// If the set is empty, then the given literal is added to the set.
391 ///
392 /// If adding any number of bytes to all members of this set causes a limit
393 /// to be exceeded, then no bytes are added and false is returned. If a
394 /// prefix of `bytes` can be fit into this set, then it is used and all
395 /// resulting literals are cut.
396 pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
397 // N.B. This could be implemented by simply calling cross_product with
398 // a literal set containing just `bytes`, but we can be smarter about
399 // taking shorter prefixes of `bytes` if they'll fit.
400 if bytes.is_empty() {
401 return true;
402 }
403 if self.lits.is_empty() {
404 let i = cmp::min(self.limit_size, bytes.len());
405 self.lits.push(Lit::new(bytes[..i].to_owned()));
406 self.lits[0].cut = i < bytes.len();
407 return !self.lits[0].is_cut();
408 }
409 let size = self.num_bytes();
410 if size + self.lits.len() >= self.limit_size {
411 return false;
412 }
413 let mut i = 1;
414 while size + (i * self.lits.len()) <= self.limit_size
415 && i < bytes.len() {
416 i += 1;
417 }
418 for lit in &mut self.lits {
419 if !lit.is_cut() {
420 lit.extend(&bytes[..i]);
421 if i < bytes.len() {
422 lit.cut();
423 }
424 }
425 }
426 true
427 }
428
429 /// Adds the given literal to this set.
430 ///
431 /// Returns false if adding this literal would cause the class to be too
432 /// big.
433 pub fn add(&mut self, lit: Lit) -> bool {
434 if self.num_bytes() + lit.len() > self.limit_size {
435 return false;
436 }
437 self.lits.push(lit);
438 true
439 }
440
441 /// Extends each literal in this set with the character class given.
442 ///
443 /// Returns false if the character class was too big to add.
444 pub fn add_char_class(&mut self, cls: &CharClass) -> bool {
445 self._add_char_class(cls, false)
446 }
447
448 /// Extends each literal in this set with the character class given,
449 /// writing the bytes of each character in reverse.
450 ///
451 /// Returns false if the character class was too big to add.
452 fn add_char_class_reverse(&mut self, cls: &CharClass) -> bool {
453 self._add_char_class(cls, true)
454 }
455
456 fn _add_char_class(&mut self, cls: &CharClass, reverse: bool) -> bool {
457 use std::char;
458
459 if self.class_exceeds_limits(cls.num_chars()) {
460 return false;
461 }
462 let mut base = self.remove_complete();
463 if base.is_empty() {
464 base = vec![Lit::empty()];
465 }
466 for r in cls {
467 let (s, e) = (r.start as u32, r.end as u32 + 1);
468 for c in (s..e).filter_map(char::from_u32) {
469 for mut lit in base.clone() {
470 let mut bytes = c.to_string().into_bytes();
471 if reverse {
472 bytes.reverse();
473 }
474 lit.extend(&bytes);
475 self.lits.push(lit);
476 }
477 }
478 }
479 true
480 }
481
482 /// Extends each literal in this set with the byte class given.
483 ///
484 /// Returns false if the byte class was too big to add.
485 pub fn add_byte_class(&mut self, cls: &ByteClass) -> bool {
486 if self.class_exceeds_limits(cls.num_bytes()) {
487 return false;
488 }
489 let mut base = self.remove_complete();
490 if base.is_empty() {
491 base = vec![Lit::empty()];
492 }
493 for r in cls {
494 let (s, e) = (r.start as u32, r.end as u32 + 1);
495 for b in (s..e).map(|b| b as u8) {
496 for mut lit in base.clone() {
497 lit.push(b);
498 self.lits.push(lit);
499 }
500 }
501 }
502 true
503 }
504
505 /// Cuts every member of this set. When a member is cut, it can never
506 /// be extended.
507 pub fn cut(&mut self) {
508 for lit in &mut self.lits {
509 lit.cut();
510 }
511 }
512
513 /// Reverses all members in place.
514 pub fn reverse(&mut self) {
515 for lit in &mut self.lits {
516 lit.reverse();
517 }
518 }
519
520 /// Clears this set of all members.
521 pub fn clear(&mut self) {
522 self.lits.clear();
523 }
524
525 /// Pops all complete literals out of this set.
526 fn remove_complete(&mut self) -> Vec<Lit> {
527 let mut base = vec![];
528 for lit in mem::replace(&mut self.lits, vec![]) {
529 if lit.is_cut() {
530 self.lits.push(lit);
531 } else {
532 base.push(lit);
533 }
534 }
535 base
536 }
537
538 /// Returns the total number of bytes in this set.
539 fn num_bytes(&self) -> usize {
540 self.lits.iter().fold(0, |accum, lit| accum + lit.len())
541 }
542
543 /// Returns true if a character class with the given size would cause this
544 /// set to exceed its limits.
545 ///
546 /// The size given should correspond to the number of items in the class.
547 fn class_exceeds_limits(&self, size: usize) -> bool {
548 if size > self.limit_class {
549 return true;
550 }
551 // This is an approximation since codepoints in a char class can encode
552 // to 1-4 bytes.
553 let new_byte_count =
554 if self.lits.is_empty() {
555 size
556 } else {
557 self.lits
558 .iter()
559 .fold(0, |accum, lit| {
560 accum + if lit.is_cut() {
561 // If the literal is cut, then we'll never add
562 // anything to it, so don't count it.
563 0
564 } else {
565 (lit.len() + 1) * size
566 }
567 })
568 };
569 new_byte_count > self.limit_size
570 }
571}
572
573fn prefixes(expr: &Expr, lits: &mut Literals) {
574 use Expr::*;
575 match *expr {
576 Literal { ref chars, casei: false } => {
577 let s: String = chars.iter().cloned().collect();
578 lits.cross_add(s.as_bytes());
579 }
580 Literal { ref chars, casei: true } => {
581 for &c in chars {
582 let cls = CharClass::new(vec![
583 ClassRange { start: c, end: c },
584 ]).case_fold();
585 if !lits.add_char_class(&cls) {
586 lits.cut();
587 return;
588 }
589 }
590 }
591 LiteralBytes { ref bytes, casei: false } => {
592 lits.cross_add(bytes);
593 }
594 LiteralBytes { ref bytes, casei: true } => {
595 for &b in bytes {
596 let cls = ByteClass::new(vec![
597 ByteRange { start: b, end: b },
598 ]).case_fold();
599 if !lits.add_byte_class(&cls) {
600 lits.cut();
601 return;
602 }
603 }
604 }
605 Class(ref cls) => {
606 if !lits.add_char_class(cls) {
607 lits.cut();
608 }
609 }
610 ClassBytes(ref cls) => {
611 if !lits.add_byte_class(cls) {
612 lits.cut();
613 }
614 }
615 Group { ref e, .. } => {
616 prefixes(&**e, lits);
617 }
618 Repeat { ref e, r: Repeater::ZeroOrOne, .. } => {
619 repeat_zero_or_one_literals(&**e, lits, prefixes);
620 }
621 Repeat { ref e, r: Repeater::ZeroOrMore, .. } => {
622 repeat_zero_or_more_literals(&**e, lits, prefixes);
623 }
624 Repeat { ref e, r: Repeater::OneOrMore, .. } => {
625 repeat_one_or_more_literals(&**e, lits, prefixes);
626 }
627 Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
628 repeat_range_literals(&**e, min, max, greedy, lits, prefixes);
629 }
630 Concat(ref es) if es.is_empty() => {}
631 Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
632 Concat(ref es) => {
633 for e in es {
634 if let StartText = *e {
635 if !lits.is_empty() {
636 lits.cut();
637 break;
638 }
639 lits.add(Lit::empty());
640 continue;
641 }
642 let mut lits2 = lits.to_empty();
643 prefixes(e, &mut lits2);
644 if !lits.cross_product(&lits2) || !lits2.any_complete() {
645 // If this expression couldn't yield any literal that
646 // could be extended, then we need to quit. Since we're
647 // short-circuiting, we also need to freeze every member.
648 lits.cut();
649 break;
650 }
651 }
652 }
653 Alternate(ref es) => {
654 alternate_literals(es, lits, prefixes);
655 }
656 _ => lits.cut(),
657 }
658}
659
660fn suffixes(expr: &Expr, lits: &mut Literals) {
661 use Expr::*;
662 match *expr {
663 Literal { ref chars, casei: false } => {
664 let s: String = chars.iter().cloned().collect();
665 let mut bytes = s.into_bytes();
666 bytes.reverse();
667 lits.cross_add(&bytes);
668 }
669 Literal { ref chars, casei: true } => {
670 for &c in chars.iter().rev() {
671 let cls = CharClass::new(vec![
672 ClassRange { start: c, end: c },
673 ]).case_fold();
674 if !lits.add_char_class_reverse(&cls) {
675 lits.cut();
676 return;
677 }
678 }
679 }
680 LiteralBytes { ref bytes, casei: false } => {
681 let b: Vec<u8> = bytes.iter().rev().cloned().collect();
682 lits.cross_add(&b);
683 }
684 LiteralBytes { ref bytes, casei: true } => {
685 for &b in bytes.iter().rev() {
686 let cls = ByteClass::new(vec![
687 ByteRange { start: b, end: b },
688 ]).case_fold();
689 if !lits.add_byte_class(&cls) {
690 lits.cut();
691 return;
692 }
693 }
694 }
695 Class(ref cls) => {
696 if !lits.add_char_class_reverse(cls) {
697 lits.cut();
698 }
699 }
700 ClassBytes(ref cls) => {
701 if !lits.add_byte_class(cls) {
702 lits.cut();
703 }
704 }
705 Group { ref e, .. } => {
706 suffixes(&**e, lits);
707 }
708 Repeat { ref e, r: Repeater::ZeroOrOne, .. } => {
709 repeat_zero_or_one_literals(&**e, lits, suffixes);
710 }
711 Repeat { ref e, r: Repeater::ZeroOrMore, .. } => {
712 repeat_zero_or_more_literals(&**e, lits, suffixes);
713 }
714 Repeat { ref e, r: Repeater::OneOrMore, .. } => {
715 repeat_one_or_more_literals(&**e, lits, suffixes);
716 }
717 Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
718 repeat_range_literals(&**e, min, max, greedy, lits, suffixes);
719 }
720 Concat(ref es) if es.is_empty() => {}
721 Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
722 Concat(ref es) => {
723 for e in es.iter().rev() {
724 if let EndText = *e {
725 if !lits.is_empty() {
726 lits.cut();
727 break;
728 }
729 lits.add(Lit::empty());
730 continue;
731 }
732 let mut lits2 = lits.to_empty();
733 suffixes(e, &mut lits2);
734 if !lits.cross_product(&lits2) || !lits2.any_complete() {
735 // If this expression couldn't yield any literal that
736 // could be extended, then we need to quit. Since we're
737 // short-circuiting, we also need to freeze every member.
738 lits.cut();
739 break;
740 }
741 }
742 }
743 Alternate(ref es) => {
744 alternate_literals(es, lits, suffixes);
745 }
746 _ => lits.cut(),
747 }
748}
749
750fn repeat_zero_or_one_literals<F: FnMut(&Expr, &mut Literals)>(
751 e: &Expr,
752 lits: &mut Literals,
753 mut f: F,
754) {
755 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756 lits3.set_limit_size(lits.limit_size() / 2);
757 f(e, &mut lits3);
758
759 if lits3.is_empty() || !lits2.cross_product(&lits3) {
760 lits.cut();
761 return;
762 }
763 lits2.add(Lit::empty());
764 if !lits.union(lits2) {
765 lits.cut();
766 }
767}
768
769fn repeat_zero_or_more_literals<F: FnMut(&Expr, &mut Literals)>(
770 e: &Expr,
771 lits: &mut Literals,
772 mut f: F,
773) {
774 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
775 lits3.set_limit_size(lits.limit_size() / 2);
776 f(e, &mut lits3);
777
778 if lits3.is_empty() || !lits2.cross_product(&lits3) {
779 lits.cut();
780 return;
781 }
782 lits2.cut();
783 lits2.add(Lit::empty());
784 if !lits.union(lits2) {
785 lits.cut();
786 }
787}
788
789fn repeat_one_or_more_literals<F: FnMut(&Expr, &mut Literals)>(
790 e: &Expr,
791 lits: &mut Literals,
792 mut f: F,
793) {
794 f(e, lits);
795 lits.cut();
796}
797
798fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
799 e: &Expr,
800 min: u32,
801 max: Option<u32>,
802 greedy: bool,
803 lits: &mut Literals,
804 mut f: F,
805) {
806 use Expr::*;
807
808 if min == 0 {
809 // This is a bit conservative. If `max` is set, then we could
810 // treat this as a finite set of alternations. For now, we
811 // just treat it as `e*`.
812 f(&Repeat {
813 e: Box::new(e.clone()),
814 r: Repeater::ZeroOrMore,
815 greedy: greedy,
816 }, lits);
817 } else {
818 if min > 0 {
819 let n = cmp::min(lits.limit_size, min as usize);
820 let es = iter::repeat(e.clone()).take(n).collect();
821 f(&Concat(es), lits);
822 if n < min as usize {
823 lits.cut();
824 }
825 }
826 if max.map_or(true, |max| min < max) {
827 lits.cut();
828 }
829 }
830}
831
832fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
833 es: &[Expr],
834 lits: &mut Literals,
835 mut f: F,
836) {
837 let mut lits2 = lits.to_empty();
838 for e in es {
839 let mut lits3 = lits.to_empty();
840 lits3.set_limit_size(lits.limit_size() / 5);
841 f(e, &mut lits3);
842 if lits3.is_empty() || !lits2.union(lits3) {
843 // If we couldn't find suffixes for *any* of the
844 // alternates, then the entire alternation has to be thrown
845 // away and any existing members must be frozen. Similarly,
846 // if the union couldn't complete, stop and freeze.
847 lits.cut();
848 return;
849 }
850 }
851 if !lits.cross_product(&lits2) {
852 lits.cut();
853 }
854}
855
856impl fmt::Debug for Literals {
857 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
858 f.debug_struct("Literals")
859 .field("lits", &self.lits)
860 .field("limit_size", &self.limit_size)
861 .field("limit_class", &self.limit_class)
862 .finish()
863 }
864}
865
866impl Lit {
867 /// Returns a new complete literal with the bytes given.
868 pub fn new(bytes: Vec<u8>) -> Lit {
869 Lit { v: bytes, cut: false }
870 }
871
872 /// Returns a new complete empty literal.
873 pub fn empty() -> Lit {
874 Lit { v: vec![], cut: false }
875 }
876
877 /// Returns true if this literal was "cut."
878 pub fn is_cut(&self) -> bool {
879 self.cut
880 }
881
882 /// Cuts this literal.
883 pub fn cut(&mut self) {
884 self.cut = true;
885 }
886}
887
888impl PartialEq for Lit {
889 fn eq(&self, other: &Lit) -> bool {
890 self.v == other.v
891 }
892}
893
894impl PartialOrd for Lit {
895 fn partial_cmp(&self, other: &Lit) -> Option<cmp::Ordering> {
896 self.v.partial_cmp(&other.v)
897 }
898}
899
900impl fmt::Debug for Lit {
901 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
902 if self.is_cut() {
903 write!(f, "Cut({})", escape_unicode(&self.v))
904 } else {
905 write!(f, "Complete({})", escape_unicode(&self.v))
906 }
907 }
908}
909
910impl AsRef<[u8]> for Lit {
911 fn as_ref(&self) -> &[u8] { &self.v }
912}
913
914impl ops::Deref for Lit {
915 type Target = Vec<u8>;
916 fn deref(&self) -> &Vec<u8> { &self.v }
917}
918
919impl ops::DerefMut for Lit {
920 fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
921}
922
923fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
924 let mut i = 0;
925 while haystack.len() >= needle.len() {
926 if needle == &haystack[..needle.len()] {
927 return Some(i);
928 }
929 i += 1;
930 haystack = &haystack[1..];
931 }
932 None
933}
934
935fn escape_unicode(bytes: &[u8]) -> String {
936 let show = match ::std::str::from_utf8(bytes) {
937 Ok(v) => v.to_string(),
938 Err(_) => escape_bytes(bytes),
939 };
940 let mut space_escaped = String::new();
941 for c in show.chars() {
942 if c.is_whitespace() {
943 let escaped = if c as u32 <= 0x7F {
944 escape_byte(c as u8)
945 } else {
946 if c as u32 <= 0xFFFF {
947 format!(r"\u{{{:04x}}}", c as u32)
948 } else {
949 format!(r"\U{{{:08x}}}", c as u32)
950 }
951 };
952 space_escaped.push_str(&escaped);
953 } else {
954 space_escaped.push(c);
955 }
956 }
957 space_escaped
958}
959
960fn escape_bytes(bytes: &[u8]) -> String {
961 let mut s = String::new();
962 for &b in bytes {
963 s.push_str(&escape_byte(b));
964 }
965 s
966}
967
968fn escape_byte(byte: u8) -> String {
969 use std::ascii::escape_default;
970
971 let escaped: Vec<u8> = escape_default(byte).collect();
972 String::from_utf8_lossy(&escaped).into_owned()
973}
974
975#[cfg(test)]
976mod tests {
977 use std::fmt;
978
979 use {Expr, ExprBuilder};
980 use super::{Literals, Lit, escape_bytes};
981
982 // To make test failures easier to read.
983 #[derive(Debug, Eq, PartialEq)]
984 struct Bytes(Vec<ULit>);
985 #[derive(Debug, Eq, PartialEq)]
986 struct Unicode(Vec<ULit>);
987
988 fn escape_lits(blits: &[Lit]) -> Vec<ULit> {
989 let mut ulits = vec![];
990 for blit in blits {
991 ulits.push(ULit { v: escape_bytes(&blit), cut: blit.is_cut() });
992 }
993 ulits
994 }
995
996 fn create_lits<I: IntoIterator<Item=Lit>>(it: I) -> Literals {
997 Literals {
998 lits: it.into_iter().collect(),
999 limit_size: 0,
1000 limit_class: 0,
1001 }
1002 }
1003
1004 // Needs to be pub for 1.3?
1005 #[derive(Clone, Eq, PartialEq)]
1006 pub struct ULit {
1007 v: String,
1008 cut: bool,
1009 }
1010
1011 impl ULit {
1012 fn is_cut(&self) -> bool { self.cut }
1013 }
1014
1015 impl fmt::Debug for ULit {
1016 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1017 if self.is_cut() {
1018 write!(f, "Cut({})", self.v)
1019 } else {
1020 write!(f, "Complete({})", self.v)
1021 }
1022 }
1023 }
1024
1025 impl PartialEq<Lit> for ULit {
1026 fn eq(&self, other: &Lit) -> bool {
1027 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028 }
1029 }
1030
1031 impl PartialEq<ULit> for Lit {
1032 fn eq(&self, other: &ULit) -> bool {
1033 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034 }
1035 }
1036
1037 #[allow(non_snake_case)]
1038 fn C(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: true } }
1039 #[allow(non_snake_case)]
1040 fn M(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: false } }
1041
1042 fn prefixes(lits: &mut Literals, expr: &Expr) {
1043 lits.union_prefixes(expr);
1044 }
1045
1046 fn suffixes(lits: &mut Literals, expr: &Expr) {
1047 lits.union_suffixes(expr);
1048 }
1049
1050 macro_rules! assert_lit_eq {
1051 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1052 let expected: Vec<ULit> = vec![$($expected_lit),*];
1053 let lits = $got_lits;
1054 assert_eq!(
1055 $which(expected.clone()),
1056 $which(escape_lits(lits.literals())));
1057 assert_eq!(
1058 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1059 lits.all_complete());
1060 assert_eq!(
1061 expected.iter().any(|l| !l.is_cut()),
1062 lits.any_complete());
1063 }};
1064 }
1065
1066 macro_rules! test_lit {
1067 ($name:ident, $which:ident, $re:expr) => {
1068 test_lit!($name, $which, $re,);
1069 };
1070 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1071 #[test]
1072 fn $name() {
1073 let expr = Expr::parse($re).unwrap();
1074 let lits = expr.$which();
1075 assert_lit_eq!(Unicode, lits, $($lit),*);
1076
1077 let expr = ExprBuilder::new().allow_bytes(true).unicode(false)
1078 .parse($re).unwrap();
1079 let lits = expr.$which();
1080 assert_lit_eq!(Bytes, lits, $($lit),*);
1081 }
1082 };
1083 }
1084
1085 // ************************************************************************
1086 // Tests for prefix literal extraction.
1087 // ************************************************************************
1088
1089 // Elementary tests.
1090 test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1091 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1092 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1093 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1094 test_lit!(pfx_class1, prefixes, "[1-4]",
1095 M("1"), M("2"), M("3"), M("4"));
1096 test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
1097 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1098 test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
1099 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1100 M("\\xe2\\x98\\x83"));
1101 test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
1102 M("A"), M("a"));
1103 test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
1104 M("ABC"), M("aBC"), M("AbC"), M("abC"),
1105 M("ABc"), M("aBc"), M("Abc"), M("abc"));
1106 test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1107 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1108 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1109 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1110 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1111 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1112 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1113 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1114 test_lit!(pfx_rep_range1, prefixes, "a{0}");
1115 test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1116 test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1117 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1118 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1119 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1120 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1121
1122 // Test regexes with concatenations.
1123 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1124 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1125 test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
1126 M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1127 M("Az"), M("Bz"), M("az"), M("bz"));
1128 test_lit!(pfx_cat4, prefixes, "[ab][yz]",
1129 M("ay"), M("by"), M("az"), M("bz"));
1130 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1131 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1132 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1133 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1134 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1135 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1136 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1137 test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1138 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1139 test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1140 test_lit!(pfx_cat15, prefixes, "$a");
1141 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1142 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1143 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1144 test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1145
1146 // Test regexes with alternations.
1147 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1148 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1149 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1150 test_lit!(pfx_alt4, prefixes, "a|b*");
1151 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1152 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1153 test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
1154 C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1155 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1156
1157 // Test regexes with empty assertions.
1158 test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1159 test_lit!(pfx_empty2, prefixes, "^abc", M("abc"));
1160 test_lit!(pfx_empty3, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1161
1162 // Make sure some curious regexes have no prefixes.
1163 test_lit!(pfx_nothing1, prefixes, ".");
1164 test_lit!(pfx_nothing2, prefixes, "(?s).");
1165 test_lit!(pfx_nothing3, prefixes, "^");
1166 test_lit!(pfx_nothing4, prefixes, "$");
1167 test_lit!(pfx_nothing6, prefixes, "(?m)$");
1168 test_lit!(pfx_nothing7, prefixes, r"\b");
1169 test_lit!(pfx_nothing8, prefixes, r"\B");
1170
1171 // Test a few regexes that defeat any prefix literal detection.
1172 test_lit!(pfx_defeated1, prefixes, ".a");
1173 test_lit!(pfx_defeated2, prefixes, "(?s).a");
1174 test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1175 test_lit!(pfx_defeated4, prefixes, "a|.");
1176 test_lit!(pfx_defeated5, prefixes, ".|a");
1177 test_lit!(pfx_defeated6, prefixes, "a|^");
1178 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1179 test_lit!(pfx_defeated8, prefixes, "$a");
1180 test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1181 test_lit!(pfx_defeated10, prefixes, r"\ba");
1182 test_lit!(pfx_defeated11, prefixes, r"\Ba");
1183 test_lit!(pfx_defeated12, prefixes, "^*a");
1184 test_lit!(pfx_defeated13, prefixes, "^+a");
1185
1186 test_lit!(
1187 pfx_crazy1,
1188 prefixes,
1189 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1190 C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1191
1192 // ************************************************************************
1193 // Tests for quiting prefix literal search.
1194 // ************************************************************************
1195
1196 macro_rules! test_exhausted {
1197 ($name:ident, $which:ident, $re:expr) => {
1198 test_exhausted!($name, $which, $re,);
1199 };
1200 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1201 #[test]
1202 fn $name() {
1203 let expr = Expr::parse($re).unwrap();
1204 let mut lits = Literals::empty();
1205 lits.set_limit_size(20).set_limit_class(10);
1206 $which(&mut lits, &expr);
1207 assert_lit_eq!(Unicode, lits, $($lit),*);
1208
1209 let expr = ExprBuilder::new().allow_bytes(true).unicode(false)
1210 .parse($re).unwrap();
1211 let mut lits = Literals::empty();
1212 lits.set_limit_size(20).set_limit_class(10);
1213 $which(&mut lits, &expr);
1214 assert_lit_eq!(Bytes, lits, $($lit),*);
1215 }
1216 };
1217 }
1218
1219 // These test use a much lower limit than the default so that we can
1220 // write test cases of reasonable size.
1221 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1222 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1223 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1224 test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
1225 C("FO"), C("fO"), C("Fo"), C("fo"));
1226 test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
1227 C("abababababababababab"));
1228 test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
1229 C("ababababab"), M("cd"));
1230 test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
1231 C("zababababab"), M("zcd"));
1232 test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
1233 C("aaaaaaaaaaaaaaaaaaaa"));
1234
1235 // ************************************************************************
1236 // Tests for suffix literal extraction.
1237 // ************************************************************************
1238
1239 // Elementary tests.
1240 test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1241 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1242 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1243 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1244 test_lit!(sfx_class1, suffixes, "[1-4]",
1245 M("1"), M("2"), M("3"), M("4"));
1246 test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
1247 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1248 test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
1249 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1250 M("\\xe2\\x98\\x83"));
1251 test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
1252 M("A"), M("a"));
1253 test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
1254 M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1255 M("aBC"), M("aBc"), M("abC"), M("abc"));
1256 test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1257 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1258 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1259 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1260 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1261 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1262 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1263 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1264 test_lit!(sfx_rep_range1, suffixes, "a{0}");
1265 test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1266 test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1267 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1268 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1269 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1270 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1271
1272 // Test regexes with concatenations.
1273 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1274 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1275 test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
1276 M("AZ"), M("Az"), M("BZ"), M("Bz"),
1277 M("aZ"), M("az"), M("bZ"), M("bz"));
1278 test_lit!(sfx_cat4, suffixes, "[ab][yz]",
1279 M("ay"), M("az"), M("by"), M("bz"));
1280 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1281 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1282 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1283 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1284 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1285 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1286 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1287 test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1288 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1289 test_lit!(sfx_cat14, suffixes, "a^");
1290 test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1291 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1292 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1293 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1294 test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1295
1296 // Test regexes with alternations.
1297 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1298 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1299 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1300 test_lit!(sfx_alt4, suffixes, "a|b*");
1301 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1302 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1303 test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
1304 C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1305 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1306
1307 // Test regexes with empty assertions.
1308 test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1309
1310 // Make sure some curious regexes have no suffixes.
1311 test_lit!(sfx_nothing1, suffixes, ".");
1312 test_lit!(sfx_nothing2, suffixes, "(?s).");
1313 test_lit!(sfx_nothing3, suffixes, "^");
1314 test_lit!(sfx_nothing4, suffixes, "$");
1315 test_lit!(sfx_nothing6, suffixes, "(?m)$");
1316 test_lit!(sfx_nothing7, suffixes, r"\b");
1317 test_lit!(sfx_nothing8, suffixes, r"\B");
1318
1319 // Test a few regexes that defeat any suffix literal detection.
1320 test_lit!(sfx_defeated1, suffixes, "a.");
1321 test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1322 test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1323 test_lit!(sfx_defeated4, suffixes, "a|.");
1324 test_lit!(sfx_defeated5, suffixes, ".|a");
1325 test_lit!(sfx_defeated6, suffixes, "a|^");
1326 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1327 test_lit!(sfx_defeated8, suffixes, "a^");
1328 test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1329 test_lit!(sfx_defeated10, suffixes, r"a\b");
1330 test_lit!(sfx_defeated11, suffixes, r"a\B");
1331 test_lit!(sfx_defeated12, suffixes, "a^*");
1332 test_lit!(sfx_defeated13, suffixes, "a^+");
1333
1334 // These test use a much lower limit than the default so that we can
1335 // write test cases of reasonable size.
1336 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1337 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1338 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1339 test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
1340 C("AR"), C("Ar"), C("aR"), C("ar"));
1341 test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
1342 C("abababababababababab"));
1343 test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
1344 C("ababababab"), M("cd"));
1345 test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
1346 C("abababababz"), M("cdz"));
1347 test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
1348 C("aaaaaaaaaaaaaaaaaaaa"));
1349
1350 // ************************************************************************
1351 // Tests for generating unambiguous literal sets.
1352 // ************************************************************************
1353
1354 macro_rules! test_unamb {
1355 ($name:ident, $given:expr, $expected:expr) => {
1356 #[test]
1357 fn $name() {
1358 let given: Vec<Lit> =
1359 $given
1360 .into_iter()
1361 .map(|ul| {
1362 let cut = ul.is_cut();
1363 Lit { v: ul.v.into_bytes(), cut: cut }
1364 })
1365 .collect();
1366 let lits = create_lits(given);
1367 let got = lits.unambiguous_prefixes();
1368 assert_eq!($expected, escape_lits(got.literals()));
1369 }
1370 };
1371 }
1372
1373 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1374 test_unamb!(unambiguous2,
1375 vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
1376 test_unamb!(unambiguous3,
1377 vec![M("Sherlock"), M("Watson")],
1378 vec![M("Sherlock"), M("Watson")]);
1379 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1380 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1381 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1382 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1383 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1384 test_unamb!(unambiguous9,
1385 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1386 vec![C("a"), C("b"), C("c")]);
1387 test_unamb!(unambiguous10,
1388 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1389 vec![C("Mo"), C("Mu")]);
1390 test_unamb!(unambiguous11,
1391 vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
1392 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1393 test_unamb!(unambiguous13,
1394 vec![M("ABCX"), M("CDAX"), M("BCX")],
1395 vec![C("A"), C("BCX"), C("CD")]);
1396 test_unamb!(unambiguous14,
1397 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1398 vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
1399 test_unamb!(unambiguous15,
1400 vec![M("IMG_"), M("MG_"), M("CIMG")],
1401 vec![C("C"), C("I"), C("MG_")]);
1402
1403
1404 // ************************************************************************
1405 // Tests for suffix trimming.
1406 // ************************************************************************
1407 macro_rules! test_trim {
1408 ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1409 #[test]
1410 fn $name() {
1411 let given: Vec<Lit> =
1412 $given
1413 .into_iter()
1414 .map(|ul| {
1415 let cut = ul.is_cut();
1416 Lit { v: ul.v.into_bytes(), cut: cut }
1417 })
1418 .collect();
1419 let lits = create_lits(given);
1420 let got = lits.trim_suffix($trim).unwrap();
1421 assert_eq!($expected, escape_lits(got.literals()));
1422 }
1423 }
1424 }
1425
1426 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1427 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1428 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1429 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1430
1431 // ************************************************************************
1432 // Tests for longest common prefix.
1433 // ************************************************************************
1434
1435 macro_rules! test_lcp {
1436 ($name:ident, $given:expr, $expected:expr) => {
1437 #[test]
1438 fn $name() {
1439 let given: Vec<Lit> =
1440 $given
1441 .into_iter()
1442 .map(|s: &str| Lit {
1443 v: s.to_owned().into_bytes(),
1444 cut: false,
1445 })
1446 .collect();
1447 let lits = create_lits(given);
1448 let got = lits.longest_common_prefix();
1449 assert_eq!($expected, escape_bytes(got));
1450 }
1451 };
1452 }
1453
1454 test_lcp!(lcp1, vec!["a"], "a");
1455 test_lcp!(lcp2, vec![], "");
1456 test_lcp!(lcp3, vec!["a", "b"], "");
1457 test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1458 test_lcp!(lcp5, vec!["ab", "a"], "a");
1459 test_lcp!(lcp6, vec!["a", "ab"], "a");
1460 test_lcp!(lcp7, vec!["ab", "b"], "");
1461 test_lcp!(lcp8, vec!["b", "ab"], "");
1462 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1463 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1464 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1465 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1466
1467 // ************************************************************************
1468 // Tests for longest common suffix.
1469 // ************************************************************************
1470
1471 macro_rules! test_lcs {
1472 ($name:ident, $given:expr, $expected:expr) => {
1473 #[test]
1474 fn $name() {
1475 let given: Vec<Lit> =
1476 $given
1477 .into_iter()
1478 .map(|s: &str| Lit {
1479 v: s.to_owned().into_bytes(),
1480 cut: false,
1481 })
1482 .collect();
1483 let lits = create_lits(given);
1484 let got = lits.longest_common_suffix();
1485 assert_eq!($expected, escape_bytes(got));
1486 }
1487 };
1488 }
1489
1490 test_lcs!(lcs1, vec!["a"], "a");
1491 test_lcs!(lcs2, vec![], "");
1492 test_lcs!(lcs3, vec!["a", "b"], "");
1493 test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1494 test_lcs!(lcs5, vec!["ab", "a"], "");
1495 test_lcs!(lcs6, vec!["a", "ab"], "");
1496 test_lcs!(lcs7, vec!["ab", "b"], "b");
1497 test_lcs!(lcs8, vec!["b", "ab"], "b");
1498 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1499 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1500 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1501 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1502}