]> git.proxmox.com Git - rustc.git/blob - vendor/regex-syntax-0.5.6/src/hir/literal/mod.rs
New upstream version 1.32.0~beta.2+dfsg1
[rustc.git] / vendor / regex-syntax-0.5.6 / src / hir / literal / mod.rs
1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
13 */
14
15 use std::cmp;
16 use std::fmt;
17 use std::iter;
18 use std::mem;
19 use std::ops;
20
21 use hir::{self, Hir, HirKind};
22 use unicode;
23
24 /// A set of literal byte strings extracted from a regular expression.
25 ///
26 /// Every member of the set is a `Literal`, which is represented by a
27 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
28 /// said to be either *complete* or *cut*. A complete literal means that
29 /// it extends until the beginning (or end) of the regular expression. In
30 /// some circumstances, this can be used to indicate a match in the regular
31 /// expression.
32 ///
33 /// A key aspect of literal extraction is knowing when to stop. It is not
34 /// feasible to blindly extract all literals from a regular expression, even if
35 /// there are finitely many. For example, the regular expression `[0-9]{10}`
36 /// has `10^10` distinct literals. For this reason, literal extraction is
37 /// bounded to some low number by default using heuristics, but the limits can
38 /// be tweaked.
39 ///
40 /// **WARNING**: Literal extraction uses stack space proportional to the size
41 /// of the `Hir` expression. At some point, this drawback will be eliminated.
42 /// To protect yourself, set a reasonable
43 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
44 /// This is done for you by default.
45 #[derive(Clone, Eq, PartialEq)]
46 pub struct Literals {
47 lits: Vec<Literal>,
48 limit_size: usize,
49 limit_class: usize,
50 }
51
52 /// A single member of a set of literals extracted from a regular expression.
53 ///
54 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
55 /// and `Vec` operations are available.
56 #[derive(Clone, Eq, Ord)]
57 pub struct Literal {
58 v: Vec<u8>,
59 cut: bool,
60 }
61
62 impl Literals {
63 /// Returns a new empty set of literals using default limits.
64 pub fn empty() -> Literals {
65 Literals {
66 lits: vec![],
67 limit_size: 250,
68 limit_class: 10,
69 }
70 }
71
72 /// Returns a set of literal prefixes extracted from the given `Hir`.
73 pub fn prefixes(expr: &Hir) -> Literals {
74 let mut lits = Literals::empty();
75 lits.union_prefixes(expr);
76 lits
77 }
78
79 /// Returns a set of literal suffixes extracted from the given `Hir`.
80 pub fn suffixes(expr: &Hir) -> Literals {
81 let mut lits = Literals::empty();
82 lits.union_suffixes(expr);
83 lits
84 }
85
86 /// Get the approximate size limit (in bytes) of this set.
87 pub fn limit_size(&self) -> usize {
88 self.limit_size
89 }
90
91 /// Set the approximate size limit (in bytes) of this set.
92 ///
93 /// If extracting a literal would put the set over this limit, then
94 /// extraction stops.
95 ///
96 /// The new limits will only apply to additions to this set. Existing
97 /// members remain unchanged, even if the set exceeds the new limit.
98 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
99 self.limit_size = size;
100 self
101 }
102
103 /// Get the character class size limit for this set.
104 pub fn limit_class(&self) -> usize {
105 self.limit_class
106 }
107
108 /// Limits the size of character(or byte) classes considered.
109 ///
110 /// A value of `0` prevents all character classes from being considered.
111 ///
112 /// This limit also applies to case insensitive literals, since each
113 /// character in the case insensitive literal is converted to a class, and
114 /// then case folded.
115 ///
116 /// The new limits will only apply to additions to this set. Existing
117 /// members remain unchanged, even if the set exceeds the new limit.
118 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
119 self.limit_class = size;
120 self
121 }
122
123 /// Returns the set of literals as a slice. Its order is unspecified.
124 pub fn literals(&self) -> &[Literal] {
125 &self.lits
126 }
127
128 /// Returns the length of the smallest literal.
129 ///
130 /// Returns None is there are no literals in the set.
131 pub fn min_len(&self) -> Option<usize> {
132 let mut min = None;
133 for lit in &self.lits {
134 match min {
135 None => min = Some(lit.len()),
136 Some(m) if lit.len() < m => min = Some(lit.len()),
137 _ => {}
138 }
139 }
140 min
141 }
142
143 /// Returns true if all members in this set are complete.
144 pub fn all_complete(&self) -> bool {
145 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
146 }
147
148 /// Returns true if any member in this set is complete.
149 pub fn any_complete(&self) -> bool {
150 self.lits.iter().any(|lit| !lit.is_cut())
151 }
152
153 /// Returns true if this set contains an empty literal.
154 pub fn contains_empty(&self) -> bool {
155 self.lits.iter().any(|lit| lit.is_empty())
156 }
157
158 /// Returns true if this set is empty or if all of its members is empty.
159 pub fn is_empty(&self) -> bool {
160 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
161 }
162
163 /// Returns a new empty set of literals using this set's limits.
164 pub fn to_empty(&self) -> Literals {
165 let mut lits = Literals::empty();
166 lits.set_limit_size(self.limit_size)
167 .set_limit_class(self.limit_class);
168 lits
169 }
170
171 /// Returns the longest common prefix of all members in this set.
172 pub fn longest_common_prefix(&self) -> &[u8] {
173 if self.is_empty() {
174 return &[];
175 }
176 let lit0 = &*self.lits[0];
177 let mut len = lit0.len();
178 for lit in &self.lits[1..] {
179 len = cmp::min(
180 len,
181 lit.iter()
182 .zip(lit0)
183 .take_while(|&(a, b)| a == b)
184 .count());
185 }
186 &self.lits[0][..len]
187 }
188
189 /// Returns the longest common suffix of all members in this set.
190 pub fn longest_common_suffix(&self) -> &[u8] {
191 if self.is_empty() {
192 return &[];
193 }
194 let lit0 = &*self.lits[0];
195 let mut len = lit0.len();
196 for lit in &self.lits[1..] {
197 len = cmp::min(
198 len,
199 lit.iter()
200 .rev()
201 .zip(lit0.iter().rev())
202 .take_while(|&(a, b)| a == b)
203 .count());
204 }
205 &self.lits[0][self.lits[0].len() - len..]
206 }
207
208 /// Returns a new set of literals with the given number of bytes trimmed
209 /// from the suffix of each literal.
210 ///
211 /// If any literal would be cut out completely by trimming, then None is
212 /// returned.
213 ///
214 /// Any duplicates that are created as a result of this transformation are
215 /// removed.
216 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
217 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
218 return None;
219 }
220 let mut new = self.to_empty();
221 for mut lit in self.lits.iter().cloned() {
222 let new_len = lit.len() - num_bytes;
223 lit.truncate(new_len);
224 lit.cut();
225 new.lits.push(lit);
226 }
227 new.lits.sort();
228 new.lits.dedup();
229 Some(new)
230 }
231
232 /// Returns a new set of prefixes of this set of literals that are
233 /// guaranteed to be unambiguous.
234 ///
235 /// Any substring match with a member of the set is returned is guaranteed
236 /// to never overlap with a substring match of another member of the set
237 /// at the same starting position.
238 ///
239 /// Given any two members of the returned set, neither is a substring of
240 /// the other.
241 pub fn unambiguous_prefixes(&self) -> Literals {
242 if self.lits.is_empty() {
243 return self.to_empty();
244 }
245 let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
246 let mut new = self.to_empty();
247 'OUTER:
248 while let Some(mut candidate) = old.pop() {
249 if candidate.is_empty() {
250 continue;
251 }
252 if new.lits.is_empty() {
253 new.lits.push(candidate);
254 continue;
255 }
256 for lit2 in &mut new.lits {
257 if lit2.is_empty() {
258 continue;
259 }
260 if &candidate == lit2 {
261 // If the literal is already in the set, then we can
262 // just drop it. But make sure that cut literals are
263 // infectious!
264 candidate.cut = candidate.cut || lit2.cut;
265 lit2.cut = candidate.cut;
266 continue 'OUTER;
267 }
268 if candidate.len() < lit2.len() {
269 if let Some(i) = position(&candidate, &lit2) {
270 candidate.cut();
271 let mut lit3 = lit2.clone();
272 lit3.truncate(i);
273 lit3.cut();
274 old.push(lit3);
275 lit2.clear();
276 }
277 } else {
278 if let Some(i) = position(&lit2, &candidate) {
279 lit2.cut();
280 let mut new_candidate = candidate.clone();
281 new_candidate.truncate(i);
282 new_candidate.cut();
283 old.push(new_candidate);
284 candidate.clear();
285 }
286 }
287 // Oops, the candidate is already represented in the set.
288 if candidate.is_empty() {
289 continue 'OUTER;
290 }
291 }
292 new.lits.push(candidate);
293 }
294 new.lits.retain(|lit| !lit.is_empty());
295 new.lits.sort();
296 new.lits.dedup();
297 new
298 }
299
300 /// Returns a new set of suffixes of this set of literals that are
301 /// guaranteed to be unambiguous.
302 ///
303 /// Any substring match with a member of the set is returned is guaranteed
304 /// to never overlap with a substring match of another member of the set
305 /// at the same ending position.
306 ///
307 /// Given any two members of the returned set, neither is a substring of
308 /// the other.
309 pub fn unambiguous_suffixes(&self) -> Literals {
310 // This is a touch wasteful...
311 let mut lits = self.clone();
312 lits.reverse();
313 let mut unamb = lits.unambiguous_prefixes();
314 unamb.reverse();
315 unamb
316 }
317
318 /// Unions the prefixes from the given expression to this set.
319 ///
320 /// If prefixes could not be added (for example, this set would exceed its
321 /// size limits or the set of prefixes from `expr` includes the empty
322 /// string), then false is returned.
323 ///
324 /// Note that prefix literals extracted from `expr` are said to be complete
325 /// if and only if the literal extends from the beginning of `expr` to the
326 /// end of `expr`.
327 pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
328 let mut lits = self.to_empty();
329 prefixes(expr, &mut lits);
330 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
331 }
332
333 /// Unions the suffixes from the given expression to this set.
334 ///
335 /// If suffixes could not be added (for example, this set would exceed its
336 /// size limits or the set of suffixes from `expr` includes the empty
337 /// string), then false is returned.
338 ///
339 /// Note that prefix literals extracted from `expr` are said to be complete
340 /// if and only if the literal extends from the end of `expr` to the
341 /// beginning of `expr`.
342 pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
343 let mut lits = self.to_empty();
344 suffixes(expr, &mut lits);
345 lits.reverse();
346 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
347 }
348
349 /// Unions this set with another set.
350 ///
351 /// If the union would cause the set to exceed its limits, then the union
352 /// is skipped and it returns false. Otherwise, if the union succeeds, it
353 /// returns true.
354 pub fn union(&mut self, lits: Literals) -> bool {
355 if self.num_bytes() + lits.num_bytes() > self.limit_size {
356 return false;
357 }
358 if lits.is_empty() {
359 self.lits.push(Literal::empty());
360 } else {
361 self.lits.extend(lits.lits);
362 }
363 true
364 }
365
366 /// Extends this set with another set.
367 ///
368 /// The set of literals is extended via a cross product.
369 ///
370 /// If a cross product would cause this set to exceed its limits, then the
371 /// cross product is skipped and it returns false. Otherwise, if the cross
372 /// product succeeds, it returns true.
373 pub fn cross_product(&mut self, lits: &Literals) -> bool {
374 if lits.is_empty() {
375 return true;
376 }
377 // Check that we make sure we stay in our limits.
378 let mut size_after;
379 if self.is_empty() || !self.any_complete() {
380 size_after = self.num_bytes();
381 for lits_lit in lits.literals() {
382 size_after += lits_lit.len();
383 }
384 } else {
385 size_after = self.lits.iter().fold(0, |accum, lit| {
386 accum + if lit.is_cut() { lit.len() } else { 0 }
387 });
388 for lits_lit in lits.literals() {
389 for self_lit in self.literals() {
390 if !self_lit.is_cut() {
391 size_after += self_lit.len() + lits_lit.len();
392 }
393 }
394 }
395 }
396 if size_after > self.limit_size {
397 return false;
398 }
399
400 let mut base = self.remove_complete();
401 if base.is_empty() {
402 base = vec![Literal::empty()];
403 }
404 for lits_lit in lits.literals() {
405 for mut self_lit in base.clone() {
406 self_lit.extend(&**lits_lit);
407 self_lit.cut = lits_lit.cut;
408 self.lits.push(self_lit);
409 }
410 }
411 true
412 }
413
414 /// Extends each literal in this set with the bytes given.
415 ///
416 /// If the set is empty, then the given literal is added to the set.
417 ///
418 /// If adding any number of bytes to all members of this set causes a limit
419 /// to be exceeded, then no bytes are added and false is returned. If a
420 /// prefix of `bytes` can be fit into this set, then it is used and all
421 /// resulting literals are cut.
422 pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
423 // N.B. This could be implemented by simply calling cross_product with
424 // a literal set containing just `bytes`, but we can be smarter about
425 // taking shorter prefixes of `bytes` if they'll fit.
426 if bytes.is_empty() {
427 return true;
428 }
429 if self.lits.is_empty() {
430 let i = cmp::min(self.limit_size, bytes.len());
431 self.lits.push(Literal::new(bytes[..i].to_owned()));
432 self.lits[0].cut = i < bytes.len();
433 return !self.lits[0].is_cut();
434 }
435 let size = self.num_bytes();
436 if size + self.lits.len() >= self.limit_size {
437 return false;
438 }
439 let mut i = 1;
440 while size + (i * self.lits.len()) <= self.limit_size
441 && i < bytes.len() {
442 i += 1;
443 }
444 for lit in &mut self.lits {
445 if !lit.is_cut() {
446 lit.extend(&bytes[..i]);
447 if i < bytes.len() {
448 lit.cut();
449 }
450 }
451 }
452 true
453 }
454
455 /// Adds the given literal to this set.
456 ///
457 /// Returns false if adding this literal would cause the class to be too
458 /// big.
459 pub fn add(&mut self, lit: Literal) -> bool {
460 if self.num_bytes() + lit.len() > self.limit_size {
461 return false;
462 }
463 self.lits.push(lit);
464 true
465 }
466
467 /// Extends each literal in this set with the character class given.
468 ///
469 /// Returns false if the character class was too big to add.
470 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
471 self._add_char_class(cls, false)
472 }
473
474 /// Extends each literal in this set with the character class given,
475 /// writing the bytes of each character in reverse.
476 ///
477 /// Returns false if the character class was too big to add.
478 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
479 self._add_char_class(cls, true)
480 }
481
482 fn _add_char_class(
483 &mut self,
484 cls: &hir::ClassUnicode,
485 reverse: bool,
486 ) -> bool {
487 use std::char;
488
489 if self.class_exceeds_limits(cls_char_count(cls)) {
490 return false;
491 }
492 let mut base = self.remove_complete();
493 if base.is_empty() {
494 base = vec![Literal::empty()];
495 }
496 for r in cls.iter() {
497 let (s, e) = (r.start as u32, r.end as u32 + 1);
498 for c in (s..e).filter_map(char::from_u32) {
499 for mut lit in base.clone() {
500 let mut bytes = c.to_string().into_bytes();
501 if reverse {
502 bytes.reverse();
503 }
504 lit.extend(&bytes);
505 self.lits.push(lit);
506 }
507 }
508 }
509 true
510 }
511
512 /// Extends each literal in this set with the byte class given.
513 ///
514 /// Returns false if the byte class was too big to add.
515 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
516 if self.class_exceeds_limits(cls_byte_count(cls)) {
517 return false;
518 }
519 let mut base = self.remove_complete();
520 if base.is_empty() {
521 base = vec![Literal::empty()];
522 }
523 for r in cls.iter() {
524 let (s, e) = (r.start as u32, r.end as u32 + 1);
525 for b in (s..e).map(|b| b as u8) {
526 for mut lit in base.clone() {
527 lit.push(b);
528 self.lits.push(lit);
529 }
530 }
531 }
532 true
533 }
534
535 /// Cuts every member of this set. When a member is cut, it can never
536 /// be extended.
537 pub fn cut(&mut self) {
538 for lit in &mut self.lits {
539 lit.cut();
540 }
541 }
542
543 /// Reverses all members in place.
544 pub fn reverse(&mut self) {
545 for lit in &mut self.lits {
546 lit.reverse();
547 }
548 }
549
550 /// Clears this set of all members.
551 pub fn clear(&mut self) {
552 self.lits.clear();
553 }
554
555 /// Pops all complete literals out of this set.
556 fn remove_complete(&mut self) -> Vec<Literal> {
557 let mut base = vec![];
558 for lit in mem::replace(&mut self.lits, vec![]) {
559 if lit.is_cut() {
560 self.lits.push(lit);
561 } else {
562 base.push(lit);
563 }
564 }
565 base
566 }
567
568 /// Returns the total number of bytes in this set.
569 fn num_bytes(&self) -> usize {
570 self.lits.iter().fold(0, |accum, lit| accum + lit.len())
571 }
572
573 /// Returns true if a character class with the given size would cause this
574 /// set to exceed its limits.
575 ///
576 /// The size given should correspond to the number of items in the class.
577 fn class_exceeds_limits(&self, size: usize) -> bool {
578 if size > self.limit_class {
579 return true;
580 }
581 // This is an approximation since codepoints in a char class can encode
582 // to 1-4 bytes.
583 let new_byte_count =
584 if self.lits.is_empty() {
585 size
586 } else {
587 self.lits
588 .iter()
589 .fold(0, |accum, lit| {
590 accum + if lit.is_cut() {
591 // If the literal is cut, then we'll never add
592 // anything to it, so don't count it.
593 0
594 } else {
595 (lit.len() + 1) * size
596 }
597 })
598 };
599 new_byte_count > self.limit_size
600 }
601 }
602
603 fn prefixes(expr: &Hir, lits: &mut Literals) {
604 match *expr.kind() {
605 HirKind::Literal(hir::Literal::Unicode(c)) => {
606 let mut buf = [0u8; 4];
607 let i = unicode::encode_utf8(c, &mut buf).unwrap();
608 lits.cross_add(&buf[..i]);
609 }
610 HirKind::Literal(hir::Literal::Byte(b)) => {
611 lits.cross_add(&[b]);
612 }
613 HirKind::Class(hir::Class::Unicode(ref cls)) => {
614 if !lits.add_char_class(cls) {
615 lits.cut();
616 }
617 }
618 HirKind::Class(hir::Class::Bytes(ref cls)) => {
619 if !lits.add_byte_class(cls) {
620 lits.cut();
621 }
622 }
623 HirKind::Group(hir::Group { ref hir, .. }) => {
624 prefixes(&**hir, lits);
625 }
626 HirKind::Repetition(ref x) => {
627 match x.kind {
628 hir::RepetitionKind::ZeroOrOne => {
629 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
630 }
631 hir::RepetitionKind::ZeroOrMore => {
632 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
633 }
634 hir::RepetitionKind::OneOrMore => {
635 repeat_one_or_more_literals(&x.hir, lits, prefixes);
636 }
637 hir::RepetitionKind::Range(ref rng) => {
638 let (min, max) = match *rng {
639 hir::RepetitionRange::Exactly(m) => {
640 (m, Some(m))
641 }
642 hir::RepetitionRange::AtLeast(m) => {
643 (m, None)
644 }
645 hir::RepetitionRange::Bounded(m, n) => {
646 (m, Some(n))
647 }
648 };
649 repeat_range_literals(
650 &x.hir, min, max, x.greedy, lits, prefixes)
651 }
652 }
653 }
654 HirKind::Concat(ref es) if es.is_empty() => {}
655 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
656 HirKind::Concat(ref es) => {
657 for e in es {
658 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
659 if !lits.is_empty() {
660 lits.cut();
661 break;
662 }
663 lits.add(Literal::empty());
664 continue;
665 }
666 let mut lits2 = lits.to_empty();
667 prefixes(e, &mut lits2);
668 if !lits.cross_product(&lits2) || !lits2.any_complete() {
669 // If this expression couldn't yield any literal that
670 // could be extended, then we need to quit. Since we're
671 // short-circuiting, we also need to freeze every member.
672 lits.cut();
673 break;
674 }
675 }
676 }
677 HirKind::Alternation(ref es) => {
678 alternate_literals(es, lits, prefixes);
679 }
680 _ => lits.cut(),
681 }
682 }
683
684 fn suffixes(expr: &Hir, lits: &mut Literals) {
685 match *expr.kind() {
686 HirKind::Literal(hir::Literal::Unicode(c)) => {
687 let mut buf = [0u8; 4];
688 let i = unicode::encode_utf8(c, &mut buf).unwrap();
689 let mut buf = &mut buf[..i];
690 buf.reverse();
691 lits.cross_add(buf);
692 }
693 HirKind::Literal(hir::Literal::Byte(b)) => {
694 lits.cross_add(&[b]);
695 }
696 HirKind::Class(hir::Class::Unicode(ref cls)) => {
697 if !lits.add_char_class_reverse(cls) {
698 lits.cut();
699 }
700 }
701 HirKind::Class(hir::Class::Bytes(ref cls)) => {
702 if !lits.add_byte_class(cls) {
703 lits.cut();
704 }
705 }
706 HirKind::Group(hir::Group { ref hir, .. }) => {
707 suffixes(&**hir, lits);
708 }
709 HirKind::Repetition(ref x) => {
710 match x.kind {
711 hir::RepetitionKind::ZeroOrOne => {
712 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
713 }
714 hir::RepetitionKind::ZeroOrMore => {
715 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
716 }
717 hir::RepetitionKind::OneOrMore => {
718 repeat_one_or_more_literals(&x.hir, lits, suffixes);
719 }
720 hir::RepetitionKind::Range(ref rng) => {
721 let (min, max) = match *rng {
722 hir::RepetitionRange::Exactly(m) => {
723 (m, Some(m))
724 }
725 hir::RepetitionRange::AtLeast(m) => {
726 (m, None)
727 }
728 hir::RepetitionRange::Bounded(m, n) => {
729 (m, Some(n))
730 }
731 };
732 repeat_range_literals(
733 &x.hir, min, max, x.greedy, lits, suffixes)
734 }
735 }
736 }
737 HirKind::Concat(ref es) if es.is_empty() => {}
738 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
739 HirKind::Concat(ref es) => {
740 for e in es.iter().rev() {
741 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
742 if !lits.is_empty() {
743 lits.cut();
744 break;
745 }
746 lits.add(Literal::empty());
747 continue;
748 }
749 let mut lits2 = lits.to_empty();
750 suffixes(e, &mut lits2);
751 if !lits.cross_product(&lits2) || !lits2.any_complete() {
752 // If this expression couldn't yield any literal that
753 // could be extended, then we need to quit. Since we're
754 // short-circuiting, we also need to freeze every member.
755 lits.cut();
756 break;
757 }
758 }
759 }
760 HirKind::Alternation(ref es) => {
761 alternate_literals(es, lits, suffixes);
762 }
763 _ => lits.cut(),
764 }
765 }
766
767 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
768 e: &Hir,
769 lits: &mut Literals,
770 mut f: F,
771 ) {
772 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
773 lits3.set_limit_size(lits.limit_size() / 2);
774 f(e, &mut lits3);
775
776 if lits3.is_empty() || !lits2.cross_product(&lits3) {
777 lits.cut();
778 return;
779 }
780 lits2.add(Literal::empty());
781 if !lits.union(lits2) {
782 lits.cut();
783 }
784 }
785
786 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
787 e: &Hir,
788 lits: &mut Literals,
789 mut f: F,
790 ) {
791 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
792 lits3.set_limit_size(lits.limit_size() / 2);
793 f(e, &mut lits3);
794
795 if lits3.is_empty() || !lits2.cross_product(&lits3) {
796 lits.cut();
797 return;
798 }
799 lits2.cut();
800 lits2.add(Literal::empty());
801 if !lits.union(lits2) {
802 lits.cut();
803 }
804 }
805
806 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
807 e: &Hir,
808 lits: &mut Literals,
809 mut f: F,
810 ) {
811 f(e, lits);
812 lits.cut();
813 }
814
815 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
816 e: &Hir,
817 min: u32,
818 max: Option<u32>,
819 greedy: bool,
820 lits: &mut Literals,
821 mut f: F,
822 ) {
823 if min == 0 {
824 // This is a bit conservative. If `max` is set, then we could
825 // treat this as a finite set of alternations. For now, we
826 // just treat it as `e*`.
827 f(&Hir::repetition(hir::Repetition {
828 kind: hir::RepetitionKind::ZeroOrMore,
829 greedy: greedy,
830 hir: Box::new(e.clone()),
831 }), lits);
832 } else {
833 if min > 0 {
834 let n = cmp::min(lits.limit_size, min as usize);
835 let es = iter::repeat(e.clone()).take(n).collect();
836 f(&Hir::concat(es), lits);
837 if n < min as usize || lits.contains_empty() {
838 lits.cut();
839 }
840 }
841 if max.map_or(true, |max| min < max) {
842 lits.cut();
843 }
844 }
845 }
846
847 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
848 es: &[Hir],
849 lits: &mut Literals,
850 mut f: F,
851 ) {
852 let mut lits2 = lits.to_empty();
853 for e in es {
854 let mut lits3 = lits.to_empty();
855 lits3.set_limit_size(lits.limit_size() / 5);
856 f(e, &mut lits3);
857 if lits3.is_empty() || !lits2.union(lits3) {
858 // If we couldn't find suffixes for *any* of the
859 // alternates, then the entire alternation has to be thrown
860 // away and any existing members must be frozen. Similarly,
861 // if the union couldn't complete, stop and freeze.
862 lits.cut();
863 return;
864 }
865 }
866 if !lits.cross_product(&lits2) {
867 lits.cut();
868 }
869 }
870
871 impl fmt::Debug for Literals {
872 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
873 f.debug_struct("Literals")
874 .field("lits", &self.lits)
875 .field("limit_size", &self.limit_size)
876 .field("limit_class", &self.limit_class)
877 .finish()
878 }
879 }
880
881 impl Literal {
882 /// Returns a new complete literal with the bytes given.
883 pub fn new(bytes: Vec<u8>) -> Literal {
884 Literal { v: bytes, cut: false }
885 }
886
887 /// Returns a new complete empty literal.
888 pub fn empty() -> Literal {
889 Literal { v: vec![], cut: false }
890 }
891
892 /// Returns true if this literal was "cut."
893 pub fn is_cut(&self) -> bool {
894 self.cut
895 }
896
897 /// Cuts this literal.
898 pub fn cut(&mut self) {
899 self.cut = true;
900 }
901 }
902
903 impl PartialEq for Literal {
904 fn eq(&self, other: &Literal) -> bool {
905 self.v == other.v
906 }
907 }
908
909 impl PartialOrd for Literal {
910 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
911 self.v.partial_cmp(&other.v)
912 }
913 }
914
915 impl fmt::Debug for Literal {
916 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
917 if self.is_cut() {
918 write!(f, "Cut({})", escape_unicode(&self.v))
919 } else {
920 write!(f, "Complete({})", escape_unicode(&self.v))
921 }
922 }
923 }
924
925 impl AsRef<[u8]> for Literal {
926 fn as_ref(&self) -> &[u8] { &self.v }
927 }
928
929 impl ops::Deref for Literal {
930 type Target = Vec<u8>;
931 fn deref(&self) -> &Vec<u8> { &self.v }
932 }
933
934 impl ops::DerefMut for Literal {
935 fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
936 }
937
938 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
939 let mut i = 0;
940 while haystack.len() >= needle.len() {
941 if needle == &haystack[..needle.len()] {
942 return Some(i);
943 }
944 i += 1;
945 haystack = &haystack[1..];
946 }
947 None
948 }
949
950 fn escape_unicode(bytes: &[u8]) -> String {
951 let show = match ::std::str::from_utf8(bytes) {
952 Ok(v) => v.to_string(),
953 Err(_) => escape_bytes(bytes),
954 };
955 let mut space_escaped = String::new();
956 for c in show.chars() {
957 if c.is_whitespace() {
958 let escaped = if c as u32 <= 0x7F {
959 escape_byte(c as u8)
960 } else {
961 if c as u32 <= 0xFFFF {
962 format!(r"\u{{{:04x}}}", c as u32)
963 } else {
964 format!(r"\U{{{:08x}}}", c as u32)
965 }
966 };
967 space_escaped.push_str(&escaped);
968 } else {
969 space_escaped.push(c);
970 }
971 }
972 space_escaped
973 }
974
975 fn escape_bytes(bytes: &[u8]) -> String {
976 let mut s = String::new();
977 for &b in bytes {
978 s.push_str(&escape_byte(b));
979 }
980 s
981 }
982
983 fn escape_byte(byte: u8) -> String {
984 use std::ascii::escape_default;
985
986 let escaped: Vec<u8> = escape_default(byte).collect();
987 String::from_utf8_lossy(&escaped).into_owned()
988 }
989
990 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
991 cls.iter()
992 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
993 .sum::<u32>() as usize
994 }
995
996 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
997 cls.iter()
998 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
999 .sum::<u32>() as usize
1000 }
1001
1002 #[cfg(test)]
1003 mod tests {
1004 use std::fmt;
1005
1006 use ParserBuilder;
1007 use hir::Hir;
1008 use super::{Literals, Literal, escape_bytes};
1009
1010 // To make test failures easier to read.
1011 #[derive(Debug, Eq, PartialEq)]
1012 struct Bytes(Vec<ULiteral>);
1013 #[derive(Debug, Eq, PartialEq)]
1014 struct Unicode(Vec<ULiteral>);
1015
1016 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
1017 let mut ulits = vec![];
1018 for blit in blits {
1019 ulits.push(ULiteral {
1020 v: escape_bytes(&blit),
1021 cut: blit.is_cut(),
1022 });
1023 }
1024 ulits
1025 }
1026
1027 fn create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals {
1028 Literals {
1029 lits: it.into_iter().collect(),
1030 limit_size: 0,
1031 limit_class: 0,
1032 }
1033 }
1034
1035 // Needs to be pub for 1.3?
1036 #[derive(Clone, Eq, PartialEq)]
1037 pub struct ULiteral {
1038 v: String,
1039 cut: bool,
1040 }
1041
1042 impl ULiteral {
1043 fn is_cut(&self) -> bool { self.cut }
1044 }
1045
1046 impl fmt::Debug for ULiteral {
1047 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1048 if self.is_cut() {
1049 write!(f, "Cut({})", self.v)
1050 } else {
1051 write!(f, "Complete({})", self.v)
1052 }
1053 }
1054 }
1055
1056 impl PartialEq<Literal> for ULiteral {
1057 fn eq(&self, other: &Literal) -> bool {
1058 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1059 }
1060 }
1061
1062 impl PartialEq<ULiteral> for Literal {
1063 fn eq(&self, other: &ULiteral) -> bool {
1064 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1065 }
1066 }
1067
1068 #[allow(non_snake_case)]
1069 fn C(s: &'static str) -> ULiteral {
1070 ULiteral { v: s.to_owned(), cut: true }
1071 }
1072 #[allow(non_snake_case)]
1073 fn M(s: &'static str) -> ULiteral {
1074 ULiteral { v: s.to_owned(), cut: false }
1075 }
1076
1077 fn prefixes(lits: &mut Literals, expr: &Hir) {
1078 lits.union_prefixes(expr);
1079 }
1080
1081 fn suffixes(lits: &mut Literals, expr: &Hir) {
1082 lits.union_suffixes(expr);
1083 }
1084
1085 macro_rules! assert_lit_eq {
1086 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1087 let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1088 let lits = $got_lits;
1089 assert_eq!(
1090 $which(expected.clone()),
1091 $which(escape_lits(lits.literals())));
1092 assert_eq!(
1093 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1094 lits.all_complete());
1095 assert_eq!(
1096 expected.iter().any(|l| !l.is_cut()),
1097 lits.any_complete());
1098 }};
1099 }
1100
1101 macro_rules! test_lit {
1102 ($name:ident, $which:ident, $re:expr) => {
1103 test_lit!($name, $which, $re,);
1104 };
1105 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1106 #[test]
1107 fn $name() {
1108 let expr = ParserBuilder::new()
1109 .build()
1110 .parse($re)
1111 .unwrap();
1112 let lits = Literals::$which(&expr);
1113 assert_lit_eq!(Unicode, lits, $($lit),*);
1114
1115 let expr = ParserBuilder::new()
1116 .allow_invalid_utf8(true)
1117 .unicode(false)
1118 .build()
1119 .parse($re)
1120 .unwrap();
1121 let lits = Literals::$which(&expr);
1122 assert_lit_eq!(Bytes, lits, $($lit),*);
1123 }
1124 };
1125 }
1126
1127 // ************************************************************************
1128 // Tests for prefix literal extraction.
1129 // ************************************************************************
1130
1131 // Elementary tests.
1132 test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1133 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1134 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1135 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1136 test_lit!(pfx_class1, prefixes, "[1-4]",
1137 M("1"), M("2"), M("3"), M("4"));
1138 test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
1139 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1140 test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
1141 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1142 M("\\xe2\\x98\\x83"));
1143 test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
1144 M("A"), M("a"));
1145 test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
1146 M("ABC"), M("aBC"), M("AbC"), M("abC"),
1147 M("ABc"), M("aBc"), M("Abc"), M("abc"));
1148 test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1149 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1150 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1151 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1152 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1153 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1154 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1155 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1156 test_lit!(pfx_rep_range1, prefixes, "a{0}");
1157 test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1158 test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1159 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1160 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1161 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1162 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1163
1164 // Test regexes with concatenations.
1165 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1166 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1167 test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
1168 M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1169 M("Az"), M("Bz"), M("az"), M("bz"));
1170 test_lit!(pfx_cat4, prefixes, "[ab][yz]",
1171 M("ay"), M("by"), M("az"), M("bz"));
1172 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1173 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1174 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1175 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1176 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1177 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1178 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1179 test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1180 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1181 test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1182 test_lit!(pfx_cat15, prefixes, "$a");
1183 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1184 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1185 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1186 test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1187
1188 // Test regexes with alternations.
1189 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1190 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1191 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1192 test_lit!(pfx_alt4, prefixes, "a|b*");
1193 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1194 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1195 test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
1196 C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1197 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1198
1199 // Test regexes with empty assertions.
1200 test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1201 test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1202 test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1203 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1204
1205 // Make sure some curious regexes have no prefixes.
1206 test_lit!(pfx_nothing1, prefixes, ".");
1207 test_lit!(pfx_nothing2, prefixes, "(?s).");
1208 test_lit!(pfx_nothing3, prefixes, "^");
1209 test_lit!(pfx_nothing4, prefixes, "$");
1210 test_lit!(pfx_nothing6, prefixes, "(?m)$");
1211 test_lit!(pfx_nothing7, prefixes, r"\b");
1212 test_lit!(pfx_nothing8, prefixes, r"\B");
1213
1214 // Test a few regexes that defeat any prefix literal detection.
1215 test_lit!(pfx_defeated1, prefixes, ".a");
1216 test_lit!(pfx_defeated2, prefixes, "(?s).a");
1217 test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1218 test_lit!(pfx_defeated4, prefixes, "a|.");
1219 test_lit!(pfx_defeated5, prefixes, ".|a");
1220 test_lit!(pfx_defeated6, prefixes, "a|^");
1221 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1222 test_lit!(pfx_defeated8, prefixes, "$a");
1223 test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1224 test_lit!(pfx_defeated10, prefixes, r"\ba");
1225 test_lit!(pfx_defeated11, prefixes, r"\Ba");
1226 test_lit!(pfx_defeated12, prefixes, "^*a");
1227 test_lit!(pfx_defeated13, prefixes, "^+a");
1228
1229 test_lit!(
1230 pfx_crazy1,
1231 prefixes,
1232 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1233 C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1234
1235 // ************************************************************************
1236 // Tests for quiting prefix literal search.
1237 // ************************************************************************
1238
1239 macro_rules! test_exhausted {
1240 ($name:ident, $which:ident, $re:expr) => {
1241 test_exhausted!($name, $which, $re,);
1242 };
1243 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1244 #[test]
1245 fn $name() {
1246 let expr = ParserBuilder::new()
1247 .build()
1248 .parse($re)
1249 .unwrap();
1250 let mut lits = Literals::empty();
1251 lits.set_limit_size(20).set_limit_class(10);
1252 $which(&mut lits, &expr);
1253 assert_lit_eq!(Unicode, lits, $($lit),*);
1254
1255 let expr = ParserBuilder::new()
1256 .allow_invalid_utf8(true)
1257 .unicode(false)
1258 .build()
1259 .parse($re)
1260 .unwrap();
1261 let mut lits = Literals::empty();
1262 lits.set_limit_size(20).set_limit_class(10);
1263 $which(&mut lits, &expr);
1264 assert_lit_eq!(Bytes, lits, $($lit),*);
1265 }
1266 };
1267 }
1268
1269 // These test use a much lower limit than the default so that we can
1270 // write test cases of reasonable size.
1271 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1272 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1273 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1274 test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
1275 C("FO"), C("fO"), C("Fo"), C("fo"));
1276 test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
1277 C("abababababababababab"));
1278 test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
1279 C("ababababab"), M("cd"));
1280 test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
1281 C("zababababab"), M("zcd"));
1282 test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
1283 C("aaaaaaaaaaaaaaaaaaaa"));
1284
1285 // ************************************************************************
1286 // Tests for suffix literal extraction.
1287 // ************************************************************************
1288
1289 // Elementary tests.
1290 test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1291 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1292 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1293 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1294 test_lit!(sfx_class1, suffixes, "[1-4]",
1295 M("1"), M("2"), M("3"), M("4"));
1296 test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
1297 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1298 test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
1299 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1300 M("\\xe2\\x98\\x83"));
1301 test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
1302 M("A"), M("a"));
1303 test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
1304 M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1305 M("aBC"), M("aBc"), M("abC"), M("abc"));
1306 test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1307 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1308 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1309 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1310 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1311 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1312 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1313 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1314 test_lit!(sfx_rep_range1, suffixes, "a{0}");
1315 test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1316 test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1317 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1318 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1319 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1320 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1321
1322 // Test regexes with concatenations.
1323 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1324 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1325 test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
1326 M("AZ"), M("Az"), M("BZ"), M("Bz"),
1327 M("aZ"), M("az"), M("bZ"), M("bz"));
1328 test_lit!(sfx_cat4, suffixes, "[ab][yz]",
1329 M("ay"), M("az"), M("by"), M("bz"));
1330 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1331 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1332 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1333 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1334 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1335 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1336 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1337 test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1338 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1339 test_lit!(sfx_cat14, suffixes, "a^");
1340 test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1341 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1342 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1343 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1344 test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1345
1346 // Test regexes with alternations.
1347 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1348 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1349 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1350 test_lit!(sfx_alt4, suffixes, "a|b*");
1351 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1352 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1353 test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
1354 C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1355 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1356
1357 // Test regexes with empty assertions.
1358 test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1359 test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1360
1361 // Make sure some curious regexes have no suffixes.
1362 test_lit!(sfx_nothing1, suffixes, ".");
1363 test_lit!(sfx_nothing2, suffixes, "(?s).");
1364 test_lit!(sfx_nothing3, suffixes, "^");
1365 test_lit!(sfx_nothing4, suffixes, "$");
1366 test_lit!(sfx_nothing6, suffixes, "(?m)$");
1367 test_lit!(sfx_nothing7, suffixes, r"\b");
1368 test_lit!(sfx_nothing8, suffixes, r"\B");
1369
1370 // Test a few regexes that defeat any suffix literal detection.
1371 test_lit!(sfx_defeated1, suffixes, "a.");
1372 test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1373 test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1374 test_lit!(sfx_defeated4, suffixes, "a|.");
1375 test_lit!(sfx_defeated5, suffixes, ".|a");
1376 test_lit!(sfx_defeated6, suffixes, "a|^");
1377 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1378 test_lit!(sfx_defeated8, suffixes, "a^");
1379 test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1380 test_lit!(sfx_defeated10, suffixes, r"a\b");
1381 test_lit!(sfx_defeated11, suffixes, r"a\B");
1382 test_lit!(sfx_defeated12, suffixes, "a^*");
1383 test_lit!(sfx_defeated13, suffixes, "a^+");
1384
1385 // These test use a much lower limit than the default so that we can
1386 // write test cases of reasonable size.
1387 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1388 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1389 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1390 test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
1391 C("AR"), C("Ar"), C("aR"), C("ar"));
1392 test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
1393 C("abababababababababab"));
1394 test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
1395 C("ababababab"), M("cd"));
1396 test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
1397 C("abababababz"), M("cdz"));
1398 test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
1399 C("aaaaaaaaaaaaaaaaaaaa"));
1400
1401 // ************************************************************************
1402 // Tests for generating unambiguous literal sets.
1403 // ************************************************************************
1404
1405 macro_rules! test_unamb {
1406 ($name:ident, $given:expr, $expected:expr) => {
1407 #[test]
1408 fn $name() {
1409 let given: Vec<Literal> =
1410 $given
1411 .into_iter()
1412 .map(|ul| {
1413 let cut = ul.is_cut();
1414 Literal { v: ul.v.into_bytes(), cut: cut }
1415 })
1416 .collect();
1417 let lits = create_lits(given);
1418 let got = lits.unambiguous_prefixes();
1419 assert_eq!($expected, escape_lits(got.literals()));
1420 }
1421 };
1422 }
1423
1424 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1425 test_unamb!(unambiguous2,
1426 vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
1427 test_unamb!(unambiguous3,
1428 vec![M("Sherlock"), M("Watson")],
1429 vec![M("Sherlock"), M("Watson")]);
1430 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1431 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1432 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1433 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1434 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1435 test_unamb!(unambiguous9,
1436 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1437 vec![C("a"), C("b"), C("c")]);
1438 test_unamb!(unambiguous10,
1439 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1440 vec![C("Mo"), C("Mu")]);
1441 test_unamb!(unambiguous11,
1442 vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
1443 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1444 test_unamb!(unambiguous13,
1445 vec![M("ABCX"), M("CDAX"), M("BCX")],
1446 vec![C("A"), C("BCX"), C("CD")]);
1447 test_unamb!(unambiguous14,
1448 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1449 vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
1450 test_unamb!(unambiguous15,
1451 vec![M("IMG_"), M("MG_"), M("CIMG")],
1452 vec![C("C"), C("I"), C("MG_")]);
1453
1454
1455 // ************************************************************************
1456 // Tests for suffix trimming.
1457 // ************************************************************************
1458 macro_rules! test_trim {
1459 ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1460 #[test]
1461 fn $name() {
1462 let given: Vec<Literal> =
1463 $given
1464 .into_iter()
1465 .map(|ul| {
1466 let cut = ul.is_cut();
1467 Literal { v: ul.v.into_bytes(), cut: cut }
1468 })
1469 .collect();
1470 let lits = create_lits(given);
1471 let got = lits.trim_suffix($trim).unwrap();
1472 assert_eq!($expected, escape_lits(got.literals()));
1473 }
1474 }
1475 }
1476
1477 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1478 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1479 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1480 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1481
1482 // ************************************************************************
1483 // Tests for longest common prefix.
1484 // ************************************************************************
1485
1486 macro_rules! test_lcp {
1487 ($name:ident, $given:expr, $expected:expr) => {
1488 #[test]
1489 fn $name() {
1490 let given: Vec<Literal> =
1491 $given
1492 .into_iter()
1493 .map(|s: &str| Literal {
1494 v: s.to_owned().into_bytes(),
1495 cut: false,
1496 })
1497 .collect();
1498 let lits = create_lits(given);
1499 let got = lits.longest_common_prefix();
1500 assert_eq!($expected, escape_bytes(got));
1501 }
1502 };
1503 }
1504
1505 test_lcp!(lcp1, vec!["a"], "a");
1506 test_lcp!(lcp2, vec![], "");
1507 test_lcp!(lcp3, vec!["a", "b"], "");
1508 test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1509 test_lcp!(lcp5, vec!["ab", "a"], "a");
1510 test_lcp!(lcp6, vec!["a", "ab"], "a");
1511 test_lcp!(lcp7, vec!["ab", "b"], "");
1512 test_lcp!(lcp8, vec!["b", "ab"], "");
1513 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1514 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1515 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1516 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1517
1518 // ************************************************************************
1519 // Tests for longest common suffix.
1520 // ************************************************************************
1521
1522 macro_rules! test_lcs {
1523 ($name:ident, $given:expr, $expected:expr) => {
1524 #[test]
1525 fn $name() {
1526 let given: Vec<Literal> =
1527 $given
1528 .into_iter()
1529 .map(|s: &str| Literal {
1530 v: s.to_owned().into_bytes(),
1531 cut: false,
1532 })
1533 .collect();
1534 let lits = create_lits(given);
1535 let got = lits.longest_common_suffix();
1536 assert_eq!($expected, escape_bytes(got));
1537 }
1538 };
1539 }
1540
1541 test_lcs!(lcs1, vec!["a"], "a");
1542 test_lcs!(lcs2, vec![], "");
1543 test_lcs!(lcs3, vec!["a", "b"], "");
1544 test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1545 test_lcs!(lcs5, vec!["ab", "a"], "");
1546 test_lcs!(lcs6, vec!["a", "ab"], "");
1547 test_lcs!(lcs7, vec!["ab", "b"], "b");
1548 test_lcs!(lcs8, vec!["b", "ab"], "b");
1549 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1550 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1551 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1552 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1553 }