1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
12 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
21 use hir
::{self, Hir, HirKind}
;
24 /// A set of literal byte strings extracted from a regular expression.
26 /// Every member of the set is a `Literal`, which is represented by a
27 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
28 /// said to be either *complete* or *cut*. A complete literal means that
29 /// it extends until the beginning (or end) of the regular expression. In
30 /// some circumstances, this can be used to indicate a match in the regular
33 /// A key aspect of literal extraction is knowing when to stop. It is not
34 /// feasible to blindly extract all literals from a regular expression, even if
35 /// there are finitely many. For example, the regular expression `[0-9]{10}`
36 /// has `10^10` distinct literals. For this reason, literal extraction is
37 /// bounded to some low number by default using heuristics, but the limits can
40 /// **WARNING**: Literal extraction uses stack space proportional to the size
41 /// of the `Hir` expression. At some point, this drawback will be eliminated.
42 /// To protect yourself, set a reasonable
43 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
44 /// This is done for you by default.
45 #[derive(Clone, Eq, PartialEq)]
52 /// A single member of a set of literals extracted from a regular expression.
54 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
55 /// and `Vec` operations are available.
56 #[derive(Clone, Eq, Ord)]
63 /// Returns a new empty set of literals using default limits.
64 pub fn empty() -> Literals
{
72 /// Returns a set of literal prefixes extracted from the given `Hir`.
73 pub fn prefixes(expr
: &Hir
) -> Literals
{
74 let mut lits
= Literals
::empty();
75 lits
.union_prefixes(expr
);
79 /// Returns a set of literal suffixes extracted from the given `Hir`.
80 pub fn suffixes(expr
: &Hir
) -> Literals
{
81 let mut lits
= Literals
::empty();
82 lits
.union_suffixes(expr
);
86 /// Get the approximate size limit (in bytes) of this set.
87 pub fn limit_size(&self) -> usize {
91 /// Set the approximate size limit (in bytes) of this set.
93 /// If extracting a literal would put the set over this limit, then
96 /// The new limits will only apply to additions to this set. Existing
97 /// members remain unchanged, even if the set exceeds the new limit.
98 pub fn set_limit_size(&mut self, size
: usize) -> &mut Literals
{
99 self.limit_size
= size
;
103 /// Get the character class size limit for this set.
104 pub fn limit_class(&self) -> usize {
108 /// Limits the size of character(or byte) classes considered.
110 /// A value of `0` prevents all character classes from being considered.
112 /// This limit also applies to case insensitive literals, since each
113 /// character in the case insensitive literal is converted to a class, and
114 /// then case folded.
116 /// The new limits will only apply to additions to this set. Existing
117 /// members remain unchanged, even if the set exceeds the new limit.
118 pub fn set_limit_class(&mut self, size
: usize) -> &mut Literals
{
119 self.limit_class
= size
;
123 /// Returns the set of literals as a slice. Its order is unspecified.
124 pub fn literals(&self) -> &[Literal
] {
128 /// Returns the length of the smallest literal.
130 /// Returns None is there are no literals in the set.
131 pub fn min_len(&self) -> Option
<usize> {
133 for lit
in &self.lits
{
135 None
=> min
= Some(lit
.len()),
136 Some(m
) if lit
.len() < m
=> min
= Some(lit
.len()),
143 /// Returns true if all members in this set are complete.
144 pub fn all_complete(&self) -> bool
{
145 !self.lits
.is_empty() && self.lits
.iter().all(|l
| !l
.is_cut())
148 /// Returns true if any member in this set is complete.
149 pub fn any_complete(&self) -> bool
{
150 self.lits
.iter().any(|lit
| !lit
.is_cut())
153 /// Returns true if this set contains an empty literal.
154 pub fn contains_empty(&self) -> bool
{
155 self.lits
.iter().any(|lit
| lit
.is_empty())
158 /// Returns true if this set is empty or if all of its members is empty.
159 pub fn is_empty(&self) -> bool
{
160 self.lits
.is_empty() || self.lits
.iter().all(|lit
| lit
.is_empty())
163 /// Returns a new empty set of literals using this set's limits.
164 pub fn to_empty(&self) -> Literals
{
165 let mut lits
= Literals
::empty();
166 lits
.set_limit_size(self.limit_size
)
167 .set_limit_class(self.limit_class
);
171 /// Returns the longest common prefix of all members in this set.
172 pub fn longest_common_prefix(&self) -> &[u8] {
176 let lit0
= &*self.lits
[0];
177 let mut len
= lit0
.len();
178 for lit
in &self.lits
[1..] {
183 .take_while(|&(a
, b
)| a
== b
)
189 /// Returns the longest common suffix of all members in this set.
190 pub fn longest_common_suffix(&self) -> &[u8] {
194 let lit0
= &*self.lits
[0];
195 let mut len
= lit0
.len();
196 for lit
in &self.lits
[1..] {
201 .zip(lit0
.iter().rev())
202 .take_while(|&(a
, b
)| a
== b
)
205 &self.lits
[0][self.lits
[0].len() - len
..]
208 /// Returns a new set of literals with the given number of bytes trimmed
209 /// from the suffix of each literal.
211 /// If any literal would be cut out completely by trimming, then None is
214 /// Any duplicates that are created as a result of this transformation are
216 pub fn trim_suffix(&self, num_bytes
: usize) -> Option
<Literals
> {
217 if self.min_len().map(|len
| len
<= num_bytes
).unwrap_or(true) {
220 let mut new
= self.to_empty();
221 for mut lit
in self.lits
.iter().cloned() {
222 let new_len
= lit
.len() - num_bytes
;
223 lit
.truncate(new_len
);
232 /// Returns a new set of prefixes of this set of literals that are
233 /// guaranteed to be unambiguous.
235 /// Any substring match with a member of the set is returned is guaranteed
236 /// to never overlap with a substring match of another member of the set
237 /// at the same starting position.
239 /// Given any two members of the returned set, neither is a substring of
241 pub fn unambiguous_prefixes(&self) -> Literals
{
242 if self.lits
.is_empty() {
243 return self.to_empty();
245 let mut old
: Vec
<Literal
> = self.lits
.iter().cloned().collect();
246 let mut new
= self.to_empty();
248 while let Some(mut candidate
) = old
.pop() {
249 if candidate
.is_empty() {
252 if new
.lits
.is_empty() {
253 new
.lits
.push(candidate
);
256 for lit2
in &mut new
.lits
{
260 if &candidate
== lit2
{
261 // If the literal is already in the set, then we can
262 // just drop it. But make sure that cut literals are
264 candidate
.cut
= candidate
.cut
|| lit2
.cut
;
265 lit2
.cut
= candidate
.cut
;
268 if candidate
.len() < lit2
.len() {
269 if let Some(i
) = position(&candidate
, &lit2
) {
271 let mut lit3
= lit2
.clone();
278 if let Some(i
) = position(&lit2
, &candidate
) {
280 let mut new_candidate
= candidate
.clone();
281 new_candidate
.truncate(i
);
283 old
.push(new_candidate
);
287 // Oops, the candidate is already represented in the set.
288 if candidate
.is_empty() {
292 new
.lits
.push(candidate
);
294 new
.lits
.retain(|lit
| !lit
.is_empty());
300 /// Returns a new set of suffixes of this set of literals that are
301 /// guaranteed to be unambiguous.
303 /// Any substring match with a member of the set is returned is guaranteed
304 /// to never overlap with a substring match of another member of the set
305 /// at the same ending position.
307 /// Given any two members of the returned set, neither is a substring of
309 pub fn unambiguous_suffixes(&self) -> Literals
{
310 // This is a touch wasteful...
311 let mut lits
= self.clone();
313 let mut unamb
= lits
.unambiguous_prefixes();
318 /// Unions the prefixes from the given expression to this set.
320 /// If prefixes could not be added (for example, this set would exceed its
321 /// size limits or the set of prefixes from `expr` includes the empty
322 /// string), then false is returned.
324 /// Note that prefix literals extracted from `expr` are said to be complete
325 /// if and only if the literal extends from the beginning of `expr` to the
327 pub fn union_prefixes(&mut self, expr
: &Hir
) -> bool
{
328 let mut lits
= self.to_empty();
329 prefixes(expr
, &mut lits
);
330 !lits
.is_empty() && !lits
.contains_empty() && self.union(lits
)
333 /// Unions the suffixes from the given expression to this set.
335 /// If suffixes could not be added (for example, this set would exceed its
336 /// size limits or the set of suffixes from `expr` includes the empty
337 /// string), then false is returned.
339 /// Note that prefix literals extracted from `expr` are said to be complete
340 /// if and only if the literal extends from the end of `expr` to the
341 /// beginning of `expr`.
342 pub fn union_suffixes(&mut self, expr
: &Hir
) -> bool
{
343 let mut lits
= self.to_empty();
344 suffixes(expr
, &mut lits
);
346 !lits
.is_empty() && !lits
.contains_empty() && self.union(lits
)
349 /// Unions this set with another set.
351 /// If the union would cause the set to exceed its limits, then the union
352 /// is skipped and it returns false. Otherwise, if the union succeeds, it
354 pub fn union(&mut self, lits
: Literals
) -> bool
{
355 if self.num_bytes() + lits
.num_bytes() > self.limit_size
{
359 self.lits
.push(Literal
::empty());
361 self.lits
.extend(lits
.lits
);
366 /// Extends this set with another set.
368 /// The set of literals is extended via a cross product.
370 /// If a cross product would cause this set to exceed its limits, then the
371 /// cross product is skipped and it returns false. Otherwise, if the cross
372 /// product succeeds, it returns true.
373 pub fn cross_product(&mut self, lits
: &Literals
) -> bool
{
377 // Check that we make sure we stay in our limits.
379 if self.is_empty() || !self.any_complete() {
380 size_after
= self.num_bytes();
381 for lits_lit
in lits
.literals() {
382 size_after
+= lits_lit
.len();
385 size_after
= self.lits
.iter().fold(0, |accum
, lit
| {
386 accum
+ if lit
.is_cut() { lit.len() }
else { 0 }
388 for lits_lit
in lits
.literals() {
389 for self_lit
in self.literals() {
390 if !self_lit
.is_cut() {
391 size_after
+= self_lit
.len() + lits_lit
.len();
396 if size_after
> self.limit_size
{
400 let mut base
= self.remove_complete();
402 base
= vec
![Literal
::empty()];
404 for lits_lit
in lits
.literals() {
405 for mut self_lit
in base
.clone() {
406 self_lit
.extend(&**lits_lit
);
407 self_lit
.cut
= lits_lit
.cut
;
408 self.lits
.push(self_lit
);
414 /// Extends each literal in this set with the bytes given.
416 /// If the set is empty, then the given literal is added to the set.
418 /// If adding any number of bytes to all members of this set causes a limit
419 /// to be exceeded, then no bytes are added and false is returned. If a
420 /// prefix of `bytes` can be fit into this set, then it is used and all
421 /// resulting literals are cut.
422 pub fn cross_add(&mut self, bytes
: &[u8]) -> bool
{
423 // N.B. This could be implemented by simply calling cross_product with
424 // a literal set containing just `bytes`, but we can be smarter about
425 // taking shorter prefixes of `bytes` if they'll fit.
426 if bytes
.is_empty() {
429 if self.lits
.is_empty() {
430 let i
= cmp
::min(self.limit_size
, bytes
.len());
431 self.lits
.push(Literal
::new(bytes
[..i
].to_owned()));
432 self.lits
[0].cut
= i
< bytes
.len();
433 return !self.lits
[0].is_cut();
435 let size
= self.num_bytes();
436 if size
+ self.lits
.len() >= self.limit_size
{
440 while size
+ (i
* self.lits
.len()) <= self.limit_size
444 for lit
in &mut self.lits
{
446 lit
.extend(&bytes
[..i
]);
455 /// Adds the given literal to this set.
457 /// Returns false if adding this literal would cause the class to be too
459 pub fn add(&mut self, lit
: Literal
) -> bool
{
460 if self.num_bytes() + lit
.len() > self.limit_size
{
467 /// Extends each literal in this set with the character class given.
469 /// Returns false if the character class was too big to add.
470 pub fn add_char_class(&mut self, cls
: &hir
::ClassUnicode
) -> bool
{
471 self._add_char_class(cls
, false)
474 /// Extends each literal in this set with the character class given,
475 /// writing the bytes of each character in reverse.
477 /// Returns false if the character class was too big to add.
478 fn add_char_class_reverse(&mut self, cls
: &hir
::ClassUnicode
) -> bool
{
479 self._add_char_class(cls
, true)
484 cls
: &hir
::ClassUnicode
,
489 if self.class_exceeds_limits(cls_char_count(cls
)) {
492 let mut base
= self.remove_complete();
494 base
= vec
![Literal
::empty()];
496 for r
in cls
.iter() {
497 let (s
, e
) = (r
.start
as u32, r
.end
as u32 + 1);
498 for c
in (s
..e
).filter_map(char::from_u32
) {
499 for mut lit
in base
.clone() {
500 let mut bytes
= c
.to_string().into_bytes();
512 /// Extends each literal in this set with the byte class given.
514 /// Returns false if the byte class was too big to add.
515 pub fn add_byte_class(&mut self, cls
: &hir
::ClassBytes
) -> bool
{
516 if self.class_exceeds_limits(cls_byte_count(cls
)) {
519 let mut base
= self.remove_complete();
521 base
= vec
![Literal
::empty()];
523 for r
in cls
.iter() {
524 let (s
, e
) = (r
.start
as u32, r
.end
as u32 + 1);
525 for b
in (s
..e
).map(|b
| b
as u8) {
526 for mut lit
in base
.clone() {
535 /// Cuts every member of this set. When a member is cut, it can never
537 pub fn cut(&mut self) {
538 for lit
in &mut self.lits
{
543 /// Reverses all members in place.
544 pub fn reverse(&mut self) {
545 for lit
in &mut self.lits
{
550 /// Clears this set of all members.
551 pub fn clear(&mut self) {
555 /// Pops all complete literals out of this set.
556 fn remove_complete(&mut self) -> Vec
<Literal
> {
557 let mut base
= vec
![];
558 for lit
in mem
::replace(&mut self.lits
, vec
![]) {
568 /// Returns the total number of bytes in this set.
569 fn num_bytes(&self) -> usize {
570 self.lits
.iter().fold(0, |accum
, lit
| accum
+ lit
.len())
573 /// Returns true if a character class with the given size would cause this
574 /// set to exceed its limits.
576 /// The size given should correspond to the number of items in the class.
577 fn class_exceeds_limits(&self, size
: usize) -> bool
{
578 if size
> self.limit_class
{
581 // This is an approximation since codepoints in a char class can encode
584 if self.lits
.is_empty() {
589 .fold(0, |accum
, lit
| {
590 accum
+ if lit
.is_cut() {
591 // If the literal is cut, then we'll never add
592 // anything to it, so don't count it.
595 (lit
.len() + 1) * size
599 new_byte_count
> self.limit_size
603 fn prefixes(expr
: &Hir
, lits
: &mut Literals
) {
605 HirKind
::Literal(hir
::Literal
::Unicode(c
)) => {
606 let mut buf
= [0u8; 4];
607 let i
= unicode
::encode_utf8(c
, &mut buf
).unwrap();
608 lits
.cross_add(&buf
[..i
]);
610 HirKind
::Literal(hir
::Literal
::Byte(b
)) => {
611 lits
.cross_add(&[b
]);
613 HirKind
::Class(hir
::Class
::Unicode(ref cls
)) => {
614 if !lits
.add_char_class(cls
) {
618 HirKind
::Class(hir
::Class
::Bytes(ref cls
)) => {
619 if !lits
.add_byte_class(cls
) {
623 HirKind
::Group(hir
::Group { ref hir, .. }
) => {
624 prefixes(&**hir
, lits
);
626 HirKind
::Repetition(ref x
) => {
628 hir
::RepetitionKind
::ZeroOrOne
=> {
629 repeat_zero_or_one_literals(&x
.hir
, lits
, prefixes
);
631 hir
::RepetitionKind
::ZeroOrMore
=> {
632 repeat_zero_or_more_literals(&x
.hir
, lits
, prefixes
);
634 hir
::RepetitionKind
::OneOrMore
=> {
635 repeat_one_or_more_literals(&x
.hir
, lits
, prefixes
);
637 hir
::RepetitionKind
::Range(ref rng
) => {
638 let (min
, max
) = match *rng
{
639 hir
::RepetitionRange
::Exactly(m
) => {
642 hir
::RepetitionRange
::AtLeast(m
) => {
645 hir
::RepetitionRange
::Bounded(m
, n
) => {
649 repeat_range_literals(
650 &x
.hir
, min
, max
, x
.greedy
, lits
, prefixes
)
654 HirKind
::Concat(ref es
) if es
.is_empty() => {}
655 HirKind
::Concat(ref es
) if es
.len() == 1 => prefixes(&es
[0], lits
),
656 HirKind
::Concat(ref es
) => {
658 if let HirKind
::Anchor(hir
::Anchor
::StartText
) = *e
.kind() {
659 if !lits
.is_empty() {
663 lits
.add(Literal
::empty());
666 let mut lits2
= lits
.to_empty();
667 prefixes(e
, &mut lits2
);
668 if !lits
.cross_product(&lits2
) || !lits2
.any_complete() {
669 // If this expression couldn't yield any literal that
670 // could be extended, then we need to quit. Since we're
671 // short-circuiting, we also need to freeze every member.
677 HirKind
::Alternation(ref es
) => {
678 alternate_literals(es
, lits
, prefixes
);
684 fn suffixes(expr
: &Hir
, lits
: &mut Literals
) {
686 HirKind
::Literal(hir
::Literal
::Unicode(c
)) => {
687 let mut buf
= [0u8; 4];
688 let i
= unicode
::encode_utf8(c
, &mut buf
).unwrap();
689 let mut buf
= &mut buf
[..i
];
693 HirKind
::Literal(hir
::Literal
::Byte(b
)) => {
694 lits
.cross_add(&[b
]);
696 HirKind
::Class(hir
::Class
::Unicode(ref cls
)) => {
697 if !lits
.add_char_class_reverse(cls
) {
701 HirKind
::Class(hir
::Class
::Bytes(ref cls
)) => {
702 if !lits
.add_byte_class(cls
) {
706 HirKind
::Group(hir
::Group { ref hir, .. }
) => {
707 suffixes(&**hir
, lits
);
709 HirKind
::Repetition(ref x
) => {
711 hir
::RepetitionKind
::ZeroOrOne
=> {
712 repeat_zero_or_one_literals(&x
.hir
, lits
, suffixes
);
714 hir
::RepetitionKind
::ZeroOrMore
=> {
715 repeat_zero_or_more_literals(&x
.hir
, lits
, suffixes
);
717 hir
::RepetitionKind
::OneOrMore
=> {
718 repeat_one_or_more_literals(&x
.hir
, lits
, suffixes
);
720 hir
::RepetitionKind
::Range(ref rng
) => {
721 let (min
, max
) = match *rng
{
722 hir
::RepetitionRange
::Exactly(m
) => {
725 hir
::RepetitionRange
::AtLeast(m
) => {
728 hir
::RepetitionRange
::Bounded(m
, n
) => {
732 repeat_range_literals(
733 &x
.hir
, min
, max
, x
.greedy
, lits
, suffixes
)
737 HirKind
::Concat(ref es
) if es
.is_empty() => {}
738 HirKind
::Concat(ref es
) if es
.len() == 1 => suffixes(&es
[0], lits
),
739 HirKind
::Concat(ref es
) => {
740 for e
in es
.iter().rev() {
741 if let HirKind
::Anchor(hir
::Anchor
::EndText
) = *e
.kind() {
742 if !lits
.is_empty() {
746 lits
.add(Literal
::empty());
749 let mut lits2
= lits
.to_empty();
750 suffixes(e
, &mut lits2
);
751 if !lits
.cross_product(&lits2
) || !lits2
.any_complete() {
752 // If this expression couldn't yield any literal that
753 // could be extended, then we need to quit. Since we're
754 // short-circuiting, we also need to freeze every member.
760 HirKind
::Alternation(ref es
) => {
761 alternate_literals(es
, lits
, suffixes
);
767 fn repeat_zero_or_one_literals
<F
: FnMut(&Hir
, &mut Literals
)>(
772 let (mut lits2
, mut lits3
) = (lits
.clone(), lits
.to_empty());
773 lits3
.set_limit_size(lits
.limit_size() / 2);
776 if lits3
.is_empty() || !lits2
.cross_product(&lits3
) {
780 lits2
.add(Literal
::empty());
781 if !lits
.union(lits2
) {
786 fn repeat_zero_or_more_literals
<F
: FnMut(&Hir
, &mut Literals
)>(
791 let (mut lits2
, mut lits3
) = (lits
.clone(), lits
.to_empty());
792 lits3
.set_limit_size(lits
.limit_size() / 2);
795 if lits3
.is_empty() || !lits2
.cross_product(&lits3
) {
800 lits2
.add(Literal
::empty());
801 if !lits
.union(lits2
) {
806 fn repeat_one_or_more_literals
<F
: FnMut(&Hir
, &mut Literals
)>(
815 fn repeat_range_literals
<F
: FnMut(&Hir
, &mut Literals
)>(
824 // This is a bit conservative. If `max` is set, then we could
825 // treat this as a finite set of alternations. For now, we
826 // just treat it as `e*`.
827 f(&Hir
::repetition(hir
::Repetition
{
828 kind
: hir
::RepetitionKind
::ZeroOrMore
,
830 hir
: Box
::new(e
.clone()),
834 let n
= cmp
::min(lits
.limit_size
, min
as usize);
835 let es
= iter
::repeat(e
.clone()).take(n
).collect();
836 f(&Hir
::concat(es
), lits
);
837 if n
< min
as usize || lits
.contains_empty() {
841 if max
.map_or(true, |max
| min
< max
) {
847 fn alternate_literals
<F
: FnMut(&Hir
, &mut Literals
)>(
852 let mut lits2
= lits
.to_empty();
854 let mut lits3
= lits
.to_empty();
855 lits3
.set_limit_size(lits
.limit_size() / 5);
857 if lits3
.is_empty() || !lits2
.union(lits3
) {
858 // If we couldn't find suffixes for *any* of the
859 // alternates, then the entire alternation has to be thrown
860 // away and any existing members must be frozen. Similarly,
861 // if the union couldn't complete, stop and freeze.
866 if !lits
.cross_product(&lits2
) {
871 impl fmt
::Debug
for Literals
{
872 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
873 f
.debug_struct("Literals")
874 .field("lits", &self.lits
)
875 .field("limit_size", &self.limit_size
)
876 .field("limit_class", &self.limit_class
)
882 /// Returns a new complete literal with the bytes given.
883 pub fn new(bytes
: Vec
<u8>) -> Literal
{
884 Literal { v: bytes, cut: false }
887 /// Returns a new complete empty literal.
888 pub fn empty() -> Literal
{
889 Literal { v: vec![], cut: false }
892 /// Returns true if this literal was "cut."
893 pub fn is_cut(&self) -> bool
{
897 /// Cuts this literal.
898 pub fn cut(&mut self) {
903 impl PartialEq
for Literal
{
904 fn eq(&self, other
: &Literal
) -> bool
{
909 impl PartialOrd
for Literal
{
910 fn partial_cmp(&self, other
: &Literal
) -> Option
<cmp
::Ordering
> {
911 self.v
.partial_cmp(&other
.v
)
915 impl fmt
::Debug
for Literal
{
916 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
918 write
!(f
, "Cut({})", escape_unicode(&self.v
))
920 write
!(f
, "Complete({})", escape_unicode(&self.v
))
925 impl AsRef
<[u8]> for Literal
{
926 fn as_ref(&self) -> &[u8] { &self.v }
929 impl ops
::Deref
for Literal
{
930 type Target
= Vec
<u8>;
931 fn deref(&self) -> &Vec
<u8> { &self.v }
934 impl ops
::DerefMut
for Literal
{
935 fn deref_mut(&mut self) -> &mut Vec
<u8> { &mut self.v }
938 fn position(needle
: &[u8], mut haystack
: &[u8]) -> Option
<usize> {
940 while haystack
.len() >= needle
.len() {
941 if needle
== &haystack
[..needle
.len()] {
945 haystack
= &haystack
[1..];
950 fn escape_unicode(bytes
: &[u8]) -> String
{
951 let show
= match ::std
::str::from_utf8(bytes
) {
952 Ok(v
) => v
.to_string(),
953 Err(_
) => escape_bytes(bytes
),
955 let mut space_escaped
= String
::new();
956 for c
in show
.chars() {
957 if c
.is_whitespace() {
958 let escaped
= if c
as u32 <= 0x7F {
961 if c
as u32 <= 0xFFFF {
962 format
!(r
"\u{{{:04x}}}", c
as u32)
964 format
!(r
"\U{{{:08x}}}", c
as u32)
967 space_escaped
.push_str(&escaped
);
969 space_escaped
.push(c
);
975 fn escape_bytes(bytes
: &[u8]) -> String
{
976 let mut s
= String
::new();
978 s
.push_str(&escape_byte(b
));
983 fn escape_byte(byte
: u8) -> String
{
984 use std
::ascii
::escape_default
;
986 let escaped
: Vec
<u8> = escape_default(byte
).collect();
987 String
::from_utf8_lossy(&escaped
).into_owned()
990 fn cls_char_count(cls
: &hir
::ClassUnicode
) -> usize {
992 .map(|&r
| 1 + (r
.end
as u32) - (r
.start
as u32))
993 .sum
::<u32>() as usize
996 fn cls_byte_count(cls
: &hir
::ClassBytes
) -> usize {
998 .map(|&r
| 1 + (r
.end
as u32) - (r
.start
as u32))
999 .sum
::<u32>() as usize
1008 use super::{Literals, Literal, escape_bytes}
;
1010 // To make test failures easier to read.
1011 #[derive(Debug, Eq, PartialEq)]
1012 struct Bytes(Vec
<ULiteral
>);
1013 #[derive(Debug, Eq, PartialEq)]
1014 struct Unicode(Vec
<ULiteral
>);
1016 fn escape_lits(blits
: &[Literal
]) -> Vec
<ULiteral
> {
1017 let mut ulits
= vec
![];
1019 ulits
.push(ULiteral
{
1020 v
: escape_bytes(&blit
),
1027 fn create_lits
<I
: IntoIterator
<Item
=Literal
>>(it
: I
) -> Literals
{
1029 lits
: it
.into_iter().collect(),
1035 // Needs to be pub for 1.3?
1036 #[derive(Clone, Eq, PartialEq)]
1037 pub struct ULiteral
{
1043 fn is_cut(&self) -> bool { self.cut }
1046 impl fmt
::Debug
for ULiteral
{
1047 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1049 write
!(f
, "Cut({})", self.v
)
1051 write
!(f
, "Complete({})", self.v
)
1056 impl PartialEq
<Literal
> for ULiteral
{
1057 fn eq(&self, other
: &Literal
) -> bool
{
1058 self.v
.as_bytes() == &*other
.v
&& self.is_cut() == other
.is_cut()
1062 impl PartialEq
<ULiteral
> for Literal
{
1063 fn eq(&self, other
: &ULiteral
) -> bool
{
1064 &*self.v
== other
.v
.as_bytes() && self.is_cut() == other
.is_cut()
1068 #[allow(non_snake_case)]
1069 fn C(s
: &'
static str) -> ULiteral
{
1070 ULiteral { v: s.to_owned(), cut: true }
1072 #[allow(non_snake_case)]
1073 fn M(s
: &'
static str) -> ULiteral
{
1074 ULiteral { v: s.to_owned(), cut: false }
1077 fn prefixes(lits
: &mut Literals
, expr
: &Hir
) {
1078 lits
.union_prefixes(expr
);
1081 fn suffixes(lits
: &mut Literals
, expr
: &Hir
) {
1082 lits
.union_suffixes(expr
);
1085 macro_rules
! assert_lit_eq
{
1086 ($which
:ident
, $got_lits
:expr
, $
($expected_lit
:expr
),*) => {{
1087 let expected
: Vec
<ULiteral
> = vec
![$
($expected_lit
),*];
1088 let lits
= $got_lits
;
1090 $
which(expected
.clone()),
1091 $
which(escape_lits(lits
.literals())));
1093 !expected
.is_empty() && expected
.iter().all(|l
| !l
.is_cut()),
1094 lits
.all_complete());
1096 expected
.iter().any(|l
| !l
.is_cut()),
1097 lits
.any_complete());
1101 macro_rules
! test_lit
{
1102 ($name
:ident
, $which
:ident
, $re
:expr
) => {
1103 test_lit
!($name
, $which
, $re
,);
1105 ($name
:ident
, $which
:ident
, $re
:expr
, $
($lit
:expr
),*) => {
1108 let expr
= ParserBuilder
::new()
1112 let lits
= Literals
::$
which(&expr
);
1113 assert_lit_eq
!(Unicode
, lits
, $
($lit
),*);
1115 let expr
= ParserBuilder
::new()
1116 .allow_invalid_utf8(true)
1121 let lits
= Literals
::$
which(&expr
);
1122 assert_lit_eq
!(Bytes
, lits
, $
($lit
),*);
1127 // ************************************************************************
1128 // Tests for prefix literal extraction.
1129 // ************************************************************************
1131 // Elementary tests.
1132 test_lit
!(pfx_one_lit1
, prefixes
, "a", M("a"));
1133 test_lit
!(pfx_one_lit2
, prefixes
, "abc", M("abc"));
1134 test_lit
!(pfx_one_lit3
, prefixes
, "(?u)☃", M("\\xe2\\x98\\x83"));
1135 test_lit
!(pfx_one_lit4
, prefixes
, "(?ui)☃", M("\\xe2\\x98\\x83"));
1136 test_lit
!(pfx_class1
, prefixes
, "[1-4]",
1137 M("1"), M("2"), M("3"), M("4"));
1138 test_lit
!(pfx_class2
, prefixes
, "(?u)[â˜ƒâ… ]",
1139 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1140 test_lit
!(pfx_class3
, prefixes
, "(?ui)[â˜ƒâ… ]",
1141 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1142 M("\\xe2\\x98\\x83"));
1143 test_lit
!(pfx_one_lit_casei1
, prefixes
, "(?i)a",
1145 test_lit
!(pfx_one_lit_casei2
, prefixes
, "(?i)abc",
1146 M("ABC"), M("aBC"), M("AbC"), M("abC"),
1147 M("ABc"), M("aBc"), M("Abc"), M("abc"));
1148 test_lit
!(pfx_group1
, prefixes
, "(a)", M("a"));
1149 test_lit
!(pfx_rep_zero_or_one1
, prefixes
, "a?");
1150 test_lit
!(pfx_rep_zero_or_one2
, prefixes
, "(?:abc)?");
1151 test_lit
!(pfx_rep_zero_or_more1
, prefixes
, "a*");
1152 test_lit
!(pfx_rep_zero_or_more2
, prefixes
, "(?:abc)*");
1153 test_lit
!(pfx_rep_one_or_more1
, prefixes
, "a+", C("a"));
1154 test_lit
!(pfx_rep_one_or_more2
, prefixes
, "(?:abc)+", C("abc"));
1155 test_lit
!(pfx_rep_nested_one_or_more
, prefixes
, "(?:a+)+", C("a"));
1156 test_lit
!(pfx_rep_range1
, prefixes
, "a{0}");
1157 test_lit
!(pfx_rep_range2
, prefixes
, "a{0,}");
1158 test_lit
!(pfx_rep_range3
, prefixes
, "a{0,1}");
1159 test_lit
!(pfx_rep_range4
, prefixes
, "a{1}", M("a"));
1160 test_lit
!(pfx_rep_range5
, prefixes
, "a{2}", M("aa"));
1161 test_lit
!(pfx_rep_range6
, prefixes
, "a{1,2}", C("a"));
1162 test_lit
!(pfx_rep_range7
, prefixes
, "a{2,3}", C("aa"));
1164 // Test regexes with concatenations.
1165 test_lit
!(pfx_cat1
, prefixes
, "(?:a)(?:b)", M("ab"));
1166 test_lit
!(pfx_cat2
, prefixes
, "[ab]z", M("az"), M("bz"));
1167 test_lit
!(pfx_cat3
, prefixes
, "(?i)[ab]z",
1168 M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1169 M("Az"), M("Bz"), M("az"), M("bz"));
1170 test_lit
!(pfx_cat4
, prefixes
, "[ab][yz]",
1171 M("ay"), M("by"), M("az"), M("bz"));
1172 test_lit
!(pfx_cat5
, prefixes
, "a*b", C("a"), M("b"));
1173 test_lit
!(pfx_cat6
, prefixes
, "a*b*c", C("a"), C("b"), M("c"));
1174 test_lit
!(pfx_cat7
, prefixes
, "a*b*c+", C("a"), C("b"), C("c"));
1175 test_lit
!(pfx_cat8
, prefixes
, "a*b+c", C("a"), C("b"));
1176 test_lit
!(pfx_cat9
, prefixes
, "a*b+c*", C("a"), C("b"));
1177 test_lit
!(pfx_cat10
, prefixes
, "ab*", C("ab"), M("a"));
1178 test_lit
!(pfx_cat11
, prefixes
, "ab*c", C("ab"), M("ac"));
1179 test_lit
!(pfx_cat12
, prefixes
, "ab+", C("ab"));
1180 test_lit
!(pfx_cat13
, prefixes
, "ab+c", C("ab"));
1181 test_lit
!(pfx_cat14
, prefixes
, "a^", C("a"));
1182 test_lit
!(pfx_cat15
, prefixes
, "$a");
1183 test_lit
!(pfx_cat16
, prefixes
, r
"ab*c", C("ab"), M("ac"));
1184 test_lit
!(pfx_cat17
, prefixes
, r
"ab+c", C("ab"));
1185 test_lit
!(pfx_cat18
, prefixes
, r
"z*azb", C("z"), M("azb"));
1186 test_lit
!(pfx_cat19
, prefixes
, "a.z", C("a"));
1188 // Test regexes with alternations.
1189 test_lit
!(pfx_alt1
, prefixes
, "a|b", M("a"), M("b"));
1190 test_lit
!(pfx_alt2
, prefixes
, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1191 test_lit
!(pfx_alt3
, prefixes
, "y(?:a|b)z", M("yaz"), M("ybz"));
1192 test_lit
!(pfx_alt4
, prefixes
, "a|b*");
1193 test_lit
!(pfx_alt5
, prefixes
, "a|b+", M("a"), C("b"));
1194 test_lit
!(pfx_alt6
, prefixes
, "a|(?:b|c*)");
1195 test_lit
!(pfx_alt7
, prefixes
, "(a|b)*c|(a|ab)*c",
1196 C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1197 test_lit
!(pfx_alt8
, prefixes
, "a*b|c", C("a"), M("b"), M("c"));
1199 // Test regexes with empty assertions.
1200 test_lit
!(pfx_empty1
, prefixes
, "^a", M("a"));
1201 test_lit
!(pfx_empty2
, prefixes
, "a${2}", C("a"));
1202 test_lit
!(pfx_empty3
, prefixes
, "^abc", M("abc"));
1203 test_lit
!(pfx_empty4
, prefixes
, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1205 // Make sure some curious regexes have no prefixes.
1206 test_lit
!(pfx_nothing1
, prefixes
, ".");
1207 test_lit
!(pfx_nothing2
, prefixes
, "(?s).");
1208 test_lit
!(pfx_nothing3
, prefixes
, "^");
1209 test_lit
!(pfx_nothing4
, prefixes
, "$");
1210 test_lit
!(pfx_nothing6
, prefixes
, "(?m)$");
1211 test_lit
!(pfx_nothing7
, prefixes
, r
"\b");
1212 test_lit
!(pfx_nothing8
, prefixes
, r
"\B");
1214 // Test a few regexes that defeat any prefix literal detection.
1215 test_lit
!(pfx_defeated1
, prefixes
, ".a");
1216 test_lit
!(pfx_defeated2
, prefixes
, "(?s).a");
1217 test_lit
!(pfx_defeated3
, prefixes
, "a*b*c*");
1218 test_lit
!(pfx_defeated4
, prefixes
, "a|.");
1219 test_lit
!(pfx_defeated5
, prefixes
, ".|a");
1220 test_lit
!(pfx_defeated6
, prefixes
, "a|^");
1221 test_lit
!(pfx_defeated7
, prefixes
, ".(?:a(?:b)(?:c))");
1222 test_lit
!(pfx_defeated8
, prefixes
, "$a");
1223 test_lit
!(pfx_defeated9
, prefixes
, "(?m)$a");
1224 test_lit
!(pfx_defeated10
, prefixes
, r
"\ba");
1225 test_lit
!(pfx_defeated11
, prefixes
, r
"\Ba");
1226 test_lit
!(pfx_defeated12
, prefixes
, "^*a");
1227 test_lit
!(pfx_defeated13
, prefixes
, "^+a");
1232 r
"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1233 C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1235 // ************************************************************************
1236 // Tests for quiting prefix literal search.
1237 // ************************************************************************
1239 macro_rules
! test_exhausted
{
1240 ($name
:ident
, $which
:ident
, $re
:expr
) => {
1241 test_exhausted
!($name
, $which
, $re
,);
1243 ($name
:ident
, $which
:ident
, $re
:expr
, $
($lit
:expr
),*) => {
1246 let expr
= ParserBuilder
::new()
1250 let mut lits
= Literals
::empty();
1251 lits
.set_limit_size(20).set_limit_class(10);
1252 $
which(&mut lits
, &expr
);
1253 assert_lit_eq
!(Unicode
, lits
, $
($lit
),*);
1255 let expr
= ParserBuilder
::new()
1256 .allow_invalid_utf8(true)
1261 let mut lits
= Literals
::empty();
1262 lits
.set_limit_size(20).set_limit_class(10);
1263 $
which(&mut lits
, &expr
);
1264 assert_lit_eq
!(Bytes
, lits
, $
($lit
),*);
1269 // These test use a much lower limit than the default so that we can
1270 // write test cases of reasonable size.
1271 test_exhausted
!(pfx_exhausted1
, prefixes
, "[a-z]");
1272 test_exhausted
!(pfx_exhausted2
, prefixes
, "[a-z]*A");
1273 test_exhausted
!(pfx_exhausted3
, prefixes
, "A[a-z]Z", C("A"));
1274 test_exhausted
!(pfx_exhausted4
, prefixes
, "(?i)foobar",
1275 C("FO"), C("fO"), C("Fo"), C("fo"));
1276 test_exhausted
!(pfx_exhausted5
, prefixes
, "(?:ab){100}",
1277 C("abababababababababab"));
1278 test_exhausted
!(pfx_exhausted6
, prefixes
, "(?:(?:ab){100})*cd",
1279 C("ababababab"), M("cd"));
1280 test_exhausted
!(pfx_exhausted7
, prefixes
, "z(?:(?:ab){100})*cd",
1281 C("zababababab"), M("zcd"));
1282 test_exhausted
!(pfx_exhausted8
, prefixes
, "aaaaaaaaaaaaaaaaaaaaz",
1283 C("aaaaaaaaaaaaaaaaaaaa"));
1285 // ************************************************************************
1286 // Tests for suffix literal extraction.
1287 // ************************************************************************
1289 // Elementary tests.
1290 test_lit
!(sfx_one_lit1
, suffixes
, "a", M("a"));
1291 test_lit
!(sfx_one_lit2
, suffixes
, "abc", M("abc"));
1292 test_lit
!(sfx_one_lit3
, suffixes
, "(?u)☃", M("\\xe2\\x98\\x83"));
1293 test_lit
!(sfx_one_lit4
, suffixes
, "(?ui)☃", M("\\xe2\\x98\\x83"));
1294 test_lit
!(sfx_class1
, suffixes
, "[1-4]",
1295 M("1"), M("2"), M("3"), M("4"));
1296 test_lit
!(sfx_class2
, suffixes
, "(?u)[â˜ƒâ… ]",
1297 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1298 test_lit
!(sfx_class3
, suffixes
, "(?ui)[â˜ƒâ… ]",
1299 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1300 M("\\xe2\\x98\\x83"));
1301 test_lit
!(sfx_one_lit_casei1
, suffixes
, "(?i)a",
1303 test_lit
!(sfx_one_lit_casei2
, suffixes
, "(?i)abc",
1304 M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1305 M("aBC"), M("aBc"), M("abC"), M("abc"));
1306 test_lit
!(sfx_group1
, suffixes
, "(a)", M("a"));
1307 test_lit
!(sfx_rep_zero_or_one1
, suffixes
, "a?");
1308 test_lit
!(sfx_rep_zero_or_one2
, suffixes
, "(?:abc)?");
1309 test_lit
!(sfx_rep_zero_or_more1
, suffixes
, "a*");
1310 test_lit
!(sfx_rep_zero_or_more2
, suffixes
, "(?:abc)*");
1311 test_lit
!(sfx_rep_one_or_more1
, suffixes
, "a+", C("a"));
1312 test_lit
!(sfx_rep_one_or_more2
, suffixes
, "(?:abc)+", C("abc"));
1313 test_lit
!(sfx_rep_nested_one_or_more
, suffixes
, "(?:a+)+", C("a"));
1314 test_lit
!(sfx_rep_range1
, suffixes
, "a{0}");
1315 test_lit
!(sfx_rep_range2
, suffixes
, "a{0,}");
1316 test_lit
!(sfx_rep_range3
, suffixes
, "a{0,1}");
1317 test_lit
!(sfx_rep_range4
, suffixes
, "a{1}", M("a"));
1318 test_lit
!(sfx_rep_range5
, suffixes
, "a{2}", M("aa"));
1319 test_lit
!(sfx_rep_range6
, suffixes
, "a{1,2}", C("a"));
1320 test_lit
!(sfx_rep_range7
, suffixes
, "a{2,3}", C("aa"));
1322 // Test regexes with concatenations.
1323 test_lit
!(sfx_cat1
, suffixes
, "(?:a)(?:b)", M("ab"));
1324 test_lit
!(sfx_cat2
, suffixes
, "[ab]z", M("az"), M("bz"));
1325 test_lit
!(sfx_cat3
, suffixes
, "(?i)[ab]z",
1326 M("AZ"), M("Az"), M("BZ"), M("Bz"),
1327 M("aZ"), M("az"), M("bZ"), M("bz"));
1328 test_lit
!(sfx_cat4
, suffixes
, "[ab][yz]",
1329 M("ay"), M("az"), M("by"), M("bz"));
1330 test_lit
!(sfx_cat5
, suffixes
, "a*b", C("ab"), M("b"));
1331 test_lit
!(sfx_cat6
, suffixes
, "a*b*c", C("bc"), C("ac"), M("c"));
1332 test_lit
!(sfx_cat7
, suffixes
, "a*b*c+", C("c"));
1333 test_lit
!(sfx_cat8
, suffixes
, "a*b+c", C("bc"));
1334 test_lit
!(sfx_cat9
, suffixes
, "a*b+c*", C("c"), C("b"));
1335 test_lit
!(sfx_cat10
, suffixes
, "ab*", C("b"), M("a"));
1336 test_lit
!(sfx_cat11
, suffixes
, "ab*c", C("bc"), M("ac"));
1337 test_lit
!(sfx_cat12
, suffixes
, "ab+", C("b"));
1338 test_lit
!(sfx_cat13
, suffixes
, "ab+c", C("bc"));
1339 test_lit
!(sfx_cat14
, suffixes
, "a^");
1340 test_lit
!(sfx_cat15
, suffixes
, "$a", C("a"));
1341 test_lit
!(sfx_cat16
, suffixes
, r
"ab*c", C("bc"), M("ac"));
1342 test_lit
!(sfx_cat17
, suffixes
, r
"ab+c", C("bc"));
1343 test_lit
!(sfx_cat18
, suffixes
, r
"z*azb", C("zazb"), M("azb"));
1344 test_lit
!(sfx_cat19
, suffixes
, "a.z", C("z"));
1346 // Test regexes with alternations.
1347 test_lit
!(sfx_alt1
, suffixes
, "a|b", M("a"), M("b"));
1348 test_lit
!(sfx_alt2
, suffixes
, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1349 test_lit
!(sfx_alt3
, suffixes
, "y(?:a|b)z", M("yaz"), M("ybz"));
1350 test_lit
!(sfx_alt4
, suffixes
, "a|b*");
1351 test_lit
!(sfx_alt5
, suffixes
, "a|b+", M("a"), C("b"));
1352 test_lit
!(sfx_alt6
, suffixes
, "a|(?:b|c*)");
1353 test_lit
!(sfx_alt7
, suffixes
, "(a|b)*c|(a|ab)*c",
1354 C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1355 test_lit
!(sfx_alt8
, suffixes
, "a*b|c", C("ab"), M("b"), M("c"));
1357 // Test regexes with empty assertions.
1358 test_lit
!(sfx_empty1
, suffixes
, "a$", M("a"));
1359 test_lit
!(sfx_empty2
, suffixes
, "${2}a", C("a"));
1361 // Make sure some curious regexes have no suffixes.
1362 test_lit
!(sfx_nothing1
, suffixes
, ".");
1363 test_lit
!(sfx_nothing2
, suffixes
, "(?s).");
1364 test_lit
!(sfx_nothing3
, suffixes
, "^");
1365 test_lit
!(sfx_nothing4
, suffixes
, "$");
1366 test_lit
!(sfx_nothing6
, suffixes
, "(?m)$");
1367 test_lit
!(sfx_nothing7
, suffixes
, r
"\b");
1368 test_lit
!(sfx_nothing8
, suffixes
, r
"\B");
1370 // Test a few regexes that defeat any suffix literal detection.
1371 test_lit
!(sfx_defeated1
, suffixes
, "a.");
1372 test_lit
!(sfx_defeated2
, suffixes
, "(?s)a.");
1373 test_lit
!(sfx_defeated3
, suffixes
, "a*b*c*");
1374 test_lit
!(sfx_defeated4
, suffixes
, "a|.");
1375 test_lit
!(sfx_defeated5
, suffixes
, ".|a");
1376 test_lit
!(sfx_defeated6
, suffixes
, "a|^");
1377 test_lit
!(sfx_defeated7
, suffixes
, "(?:a(?:b)(?:c)).");
1378 test_lit
!(sfx_defeated8
, suffixes
, "a^");
1379 test_lit
!(sfx_defeated9
, suffixes
, "(?m)a$");
1380 test_lit
!(sfx_defeated10
, suffixes
, r
"a\b");
1381 test_lit
!(sfx_defeated11
, suffixes
, r
"a\B");
1382 test_lit
!(sfx_defeated12
, suffixes
, "a^*");
1383 test_lit
!(sfx_defeated13
, suffixes
, "a^+");
1385 // These test use a much lower limit than the default so that we can
1386 // write test cases of reasonable size.
1387 test_exhausted
!(sfx_exhausted1
, suffixes
, "[a-z]");
1388 test_exhausted
!(sfx_exhausted2
, suffixes
, "A[a-z]*");
1389 test_exhausted
!(sfx_exhausted3
, suffixes
, "A[a-z]Z", C("Z"));
1390 test_exhausted
!(sfx_exhausted4
, suffixes
, "(?i)foobar",
1391 C("AR"), C("Ar"), C("aR"), C("ar"));
1392 test_exhausted
!(sfx_exhausted5
, suffixes
, "(?:ab){100}",
1393 C("abababababababababab"));
1394 test_exhausted
!(sfx_exhausted6
, suffixes
, "cd(?:(?:ab){100})*",
1395 C("ababababab"), M("cd"));
1396 test_exhausted
!(sfx_exhausted7
, suffixes
, "cd(?:(?:ab){100})*z",
1397 C("abababababz"), M("cdz"));
1398 test_exhausted
!(sfx_exhausted8
, suffixes
, "zaaaaaaaaaaaaaaaaaaaa",
1399 C("aaaaaaaaaaaaaaaaaaaa"));
1401 // ************************************************************************
1402 // Tests for generating unambiguous literal sets.
1403 // ************************************************************************
1405 macro_rules
! test_unamb
{
1406 ($name
:ident
, $given
:expr
, $expected
:expr
) => {
1409 let given
: Vec
<Literal
> =
1413 let cut
= ul
.is_cut();
1414 Literal { v: ul.v.into_bytes(), cut: cut }
1417 let lits
= create_lits(given
);
1418 let got
= lits
.unambiguous_prefixes();
1419 assert_eq
!($expected
, escape_lits(got
.literals()));
1424 test_unamb
!(unambiguous1
, vec
![M("z"), M("azb")], vec
![C("a"), C("z")]);
1425 test_unamb
!(unambiguous2
,
1426 vec
![M("zaaaaaa"), M("aa")], vec
![C("aa"), C("z")]);
1427 test_unamb
!(unambiguous3
,
1428 vec
![M("Sherlock"), M("Watson")],
1429 vec
![M("Sherlock"), M("Watson")]);
1430 test_unamb
!(unambiguous4
, vec
![M("abc"), M("bc")], vec
![C("a"), C("bc")]);
1431 test_unamb
!(unambiguous5
, vec
![M("bc"), M("abc")], vec
![C("a"), C("bc")]);
1432 test_unamb
!(unambiguous6
, vec
![M("a"), M("aa")], vec
![C("a")]);
1433 test_unamb
!(unambiguous7
, vec
![M("aa"), M("a")], vec
![C("a")]);
1434 test_unamb
!(unambiguous8
, vec
![M("ab"), M("a")], vec
![C("a")]);
1435 test_unamb
!(unambiguous9
,
1436 vec
![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1437 vec
![C("a"), C("b"), C("c")]);
1438 test_unamb
!(unambiguous10
,
1439 vec
![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1440 vec
![C("Mo"), C("Mu")]);
1441 test_unamb
!(unambiguous11
,
1442 vec
![M("zazb"), M("azb")], vec
![C("a"), C("z")]);
1443 test_unamb
!(unambiguous12
, vec
![M("foo"), C("foo")], vec
![C("foo")]);
1444 test_unamb
!(unambiguous13
,
1445 vec
![M("ABCX"), M("CDAX"), M("BCX")],
1446 vec
![C("A"), C("BCX"), C("CD")]);
1447 test_unamb
!(unambiguous14
,
1448 vec
![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1449 vec
![M("DSX"), C("I"), C("MGX"), C("MV")]);
1450 test_unamb
!(unambiguous15
,
1451 vec
![M("IMG_"), M("MG_"), M("CIMG")],
1452 vec
![C("C"), C("I"), C("MG_")]);
1455 // ************************************************************************
1456 // Tests for suffix trimming.
1457 // ************************************************************************
1458 macro_rules
! test_trim
{
1459 ($name
:ident
, $trim
:expr
, $given
:expr
, $expected
:expr
) => {
1462 let given
: Vec
<Literal
> =
1466 let cut
= ul
.is_cut();
1467 Literal { v: ul.v.into_bytes(), cut: cut }
1470 let lits
= create_lits(given
);
1471 let got
= lits
.trim_suffix($trim
).unwrap();
1472 assert_eq
!($expected
, escape_lits(got
.literals()));
1477 test_trim
!(trim1
, 1, vec
![M("ab"), M("yz")], vec
![C("a"), C("y")]);
1478 test_trim
!(trim2
, 1, vec
![M("abc"), M("abd")], vec
![C("ab")]);
1479 test_trim
!(trim3
, 2, vec
![M("abc"), M("abd")], vec
![C("a")]);
1480 test_trim
!(trim4
, 2, vec
![M("abc"), M("ghij")], vec
![C("a"), C("gh")]);
1482 // ************************************************************************
1483 // Tests for longest common prefix.
1484 // ************************************************************************
1486 macro_rules
! test_lcp
{
1487 ($name
:ident
, $given
:expr
, $expected
:expr
) => {
1490 let given
: Vec
<Literal
> =
1493 .map(|s
: &str| Literal
{
1494 v
: s
.to_owned().into_bytes(),
1498 let lits
= create_lits(given
);
1499 let got
= lits
.longest_common_prefix();
1500 assert_eq
!($expected
, escape_bytes(got
));
1505 test_lcp
!(lcp1
, vec
!["a"], "a");
1506 test_lcp
!(lcp2
, vec
![], "");
1507 test_lcp
!(lcp3
, vec
!["a", "b"], "");
1508 test_lcp
!(lcp4
, vec
!["ab", "ab"], "ab");
1509 test_lcp
!(lcp5
, vec
!["ab", "a"], "a");
1510 test_lcp
!(lcp6
, vec
!["a", "ab"], "a");
1511 test_lcp
!(lcp7
, vec
!["ab", "b"], "");
1512 test_lcp
!(lcp8
, vec
!["b", "ab"], "");
1513 test_lcp
!(lcp9
, vec
!["foobar", "foobaz"], "fooba");
1514 test_lcp
!(lcp10
, vec
!["foobar", "foobaz", "a"], "");
1515 test_lcp
!(lcp11
, vec
!["a", "foobar", "foobaz"], "");
1516 test_lcp
!(lcp12
, vec
!["foo", "flub", "flab", "floo"], "f");
1518 // ************************************************************************
1519 // Tests for longest common suffix.
1520 // ************************************************************************
1522 macro_rules
! test_lcs
{
1523 ($name
:ident
, $given
:expr
, $expected
:expr
) => {
1526 let given
: Vec
<Literal
> =
1529 .map(|s
: &str| Literal
{
1530 v
: s
.to_owned().into_bytes(),
1534 let lits
= create_lits(given
);
1535 let got
= lits
.longest_common_suffix();
1536 assert_eq
!($expected
, escape_bytes(got
));
1541 test_lcs
!(lcs1
, vec
!["a"], "a");
1542 test_lcs
!(lcs2
, vec
![], "");
1543 test_lcs
!(lcs3
, vec
!["a", "b"], "");
1544 test_lcs
!(lcs4
, vec
!["ab", "ab"], "ab");
1545 test_lcs
!(lcs5
, vec
!["ab", "a"], "");
1546 test_lcs
!(lcs6
, vec
!["a", "ab"], "");
1547 test_lcs
!(lcs7
, vec
!["ab", "b"], "b");
1548 test_lcs
!(lcs8
, vec
!["b", "ab"], "b");
1549 test_lcs
!(lcs9
, vec
!["barfoo", "bazfoo"], "foo");
1550 test_lcs
!(lcs10
, vec
!["barfoo", "bazfoo", "a"], "");
1551 test_lcs
!(lcs11
, vec
!["a", "barfoo", "bazfoo"], "");
1552 test_lcs
!(lcs12
, vec
!["flub", "bub", "boob", "dub"], "b");