]> git.proxmox.com Git - rustc.git/blame - vendor/regex-syntax/src/hir/literal/mod.rs
New upstream version 1.63.0+dfsg1
[rustc.git] / vendor / regex-syntax / src / hir / literal / mod.rs
CommitLineData
0531ce1d
XL
1/*!
2Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3*/
4
8bb4bdeb
XL
5use std::cmp;
6use std::fmt;
7use std::iter;
8use std::mem;
9use std::ops;
10
17df50a5 11use crate::hir::{self, Hir, HirKind};
8bb4bdeb
XL
12
13/// A set of literal byte strings extracted from a regular expression.
14///
0531ce1d
XL
15/// Every member of the set is a `Literal`, which is represented by a
16/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17/// said to be either *complete* or *cut*. A complete literal means that
18/// it extends until the beginning (or end) of the regular expression. In
19/// some circumstances, this can be used to indicate a match in the regular
20/// expression.
21///
22/// A key aspect of literal extraction is knowing when to stop. It is not
23/// feasible to blindly extract all literals from a regular expression, even if
24/// there are finitely many. For example, the regular expression `[0-9]{10}`
25/// has `10^10` distinct literals. For this reason, literal extraction is
26/// bounded to some low number by default using heuristics, but the limits can
27/// be tweaked.
8bb4bdeb 28///
0531ce1d
XL
29/// **WARNING**: Literal extraction uses stack space proportional to the size
30/// of the `Hir` expression. At some point, this drawback will be eliminated.
31/// To protect yourself, set a reasonable
32/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33/// This is done for you by default.
8bb4bdeb
XL
34#[derive(Clone, Eq, PartialEq)]
35pub struct Literals {
0531ce1d 36 lits: Vec<Literal>,
8bb4bdeb
XL
37 limit_size: usize,
38 limit_class: usize,
39}
40
41/// A single member of a set of literals extracted from a regular expression.
42///
43/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44/// and `Vec` operations are available.
45#[derive(Clone, Eq, Ord)]
0531ce1d 46pub struct Literal {
8bb4bdeb
XL
47 v: Vec<u8>,
48 cut: bool,
49}
50
51impl Literals {
52 /// Returns a new empty set of literals using default limits.
53 pub fn empty() -> Literals {
f9f354fc 54 Literals { lits: vec![], limit_size: 250, limit_class: 10 }
8bb4bdeb
XL
55 }
56
0531ce1d
XL
57 /// Returns a set of literal prefixes extracted from the given `Hir`.
58 pub fn prefixes(expr: &Hir) -> Literals {
59 let mut lits = Literals::empty();
60 lits.union_prefixes(expr);
61 lits
62 }
63
64 /// Returns a set of literal suffixes extracted from the given `Hir`.
65 pub fn suffixes(expr: &Hir) -> Literals {
66 let mut lits = Literals::empty();
67 lits.union_suffixes(expr);
68 lits
69 }
70
8bb4bdeb
XL
71 /// Get the approximate size limit (in bytes) of this set.
72 pub fn limit_size(&self) -> usize {
73 self.limit_size
74 }
75
76 /// Set the approximate size limit (in bytes) of this set.
77 ///
78 /// If extracting a literal would put the set over this limit, then
79 /// extraction stops.
80 ///
81 /// The new limits will only apply to additions to this set. Existing
82 /// members remain unchanged, even if the set exceeds the new limit.
83 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84 self.limit_size = size;
85 self
86 }
87
88 /// Get the character class size limit for this set.
89 pub fn limit_class(&self) -> usize {
90 self.limit_class
91 }
92
93 /// Limits the size of character(or byte) classes considered.
94 ///
95 /// A value of `0` prevents all character classes from being considered.
96 ///
97 /// This limit also applies to case insensitive literals, since each
98 /// character in the case insensitive literal is converted to a class, and
99 /// then case folded.
100 ///
101 /// The new limits will only apply to additions to this set. Existing
102 /// members remain unchanged, even if the set exceeds the new limit.
103 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104 self.limit_class = size;
105 self
106 }
107
108 /// Returns the set of literals as a slice. Its order is unspecified.
0531ce1d 109 pub fn literals(&self) -> &[Literal] {
8bb4bdeb
XL
110 &self.lits
111 }
112
113 /// Returns the length of the smallest literal.
114 ///
115 /// Returns None is there are no literals in the set.
116 pub fn min_len(&self) -> Option<usize> {
117 let mut min = None;
118 for lit in &self.lits {
119 match min {
120 None => min = Some(lit.len()),
121 Some(m) if lit.len() < m => min = Some(lit.len()),
122 _ => {}
123 }
124 }
125 min
126 }
127
128 /// Returns true if all members in this set are complete.
129 pub fn all_complete(&self) -> bool {
130 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
131 }
132
133 /// Returns true if any member in this set is complete.
134 pub fn any_complete(&self) -> bool {
135 self.lits.iter().any(|lit| !lit.is_cut())
136 }
137
138 /// Returns true if this set contains an empty literal.
139 pub fn contains_empty(&self) -> bool {
140 self.lits.iter().any(|lit| lit.is_empty())
141 }
142
143 /// Returns true if this set is empty or if all of its members is empty.
144 pub fn is_empty(&self) -> bool {
145 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
146 }
147
148 /// Returns a new empty set of literals using this set's limits.
149 pub fn to_empty(&self) -> Literals {
150 let mut lits = Literals::empty();
f9f354fc 151 lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
8bb4bdeb
XL
152 lits
153 }
154
155 /// Returns the longest common prefix of all members in this set.
156 pub fn longest_common_prefix(&self) -> &[u8] {
157 if self.is_empty() {
158 return &[];
159 }
160 let lit0 = &*self.lits[0];
161 let mut len = lit0.len();
162 for lit in &self.lits[1..] {
163 len = cmp::min(
164 len,
f9f354fc
XL
165 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
166 );
8bb4bdeb
XL
167 }
168 &self.lits[0][..len]
169 }
170
171 /// Returns the longest common suffix of all members in this set.
172 pub fn longest_common_suffix(&self) -> &[u8] {
173 if self.is_empty() {
174 return &[];
175 }
176 let lit0 = &*self.lits[0];
177 let mut len = lit0.len();
178 for lit in &self.lits[1..] {
179 len = cmp::min(
180 len,
181 lit.iter()
f9f354fc
XL
182 .rev()
183 .zip(lit0.iter().rev())
184 .take_while(|&(a, b)| a == b)
185 .count(),
186 );
8bb4bdeb
XL
187 }
188 &self.lits[0][self.lits[0].len() - len..]
189 }
190
191 /// Returns a new set of literals with the given number of bytes trimmed
192 /// from the suffix of each literal.
193 ///
194 /// If any literal would be cut out completely by trimming, then None is
195 /// returned.
196 ///
197 /// Any duplicates that are created as a result of this transformation are
198 /// removed.
199 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
201 return None;
202 }
203 let mut new = self.to_empty();
204 for mut lit in self.lits.iter().cloned() {
205 let new_len = lit.len() - num_bytes;
206 lit.truncate(new_len);
207 lit.cut();
208 new.lits.push(lit);
209 }
210 new.lits.sort();
211 new.lits.dedup();
212 Some(new)
213 }
214
215 /// Returns a new set of prefixes of this set of literals that are
216 /// guaranteed to be unambiguous.
217 ///
218 /// Any substring match with a member of the set is returned is guaranteed
219 /// to never overlap with a substring match of another member of the set
220 /// at the same starting position.
221 ///
222 /// Given any two members of the returned set, neither is a substring of
223 /// the other.
224 pub fn unambiguous_prefixes(&self) -> Literals {
225 if self.lits.is_empty() {
226 return self.to_empty();
227 }
0531ce1d 228 let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
8bb4bdeb 229 let mut new = self.to_empty();
f9f354fc 230 'OUTER: while let Some(mut candidate) = old.pop() {
8bb4bdeb
XL
231 if candidate.is_empty() {
232 continue;
233 }
234 if new.lits.is_empty() {
235 new.lits.push(candidate);
236 continue;
237 }
238 for lit2 in &mut new.lits {
239 if lit2.is_empty() {
240 continue;
241 }
242 if &candidate == lit2 {
243 // If the literal is already in the set, then we can
244 // just drop it. But make sure that cut literals are
245 // infectious!
246 candidate.cut = candidate.cut || lit2.cut;
247 lit2.cut = candidate.cut;
248 continue 'OUTER;
249 }
250 if candidate.len() < lit2.len() {
251 if let Some(i) = position(&candidate, &lit2) {
252 candidate.cut();
253 let mut lit3 = lit2.clone();
254 lit3.truncate(i);
255 lit3.cut();
256 old.push(lit3);
257 lit2.clear();
258 }
259 } else {
260 if let Some(i) = position(&lit2, &candidate) {
261 lit2.cut();
262 let mut new_candidate = candidate.clone();
263 new_candidate.truncate(i);
264 new_candidate.cut();
265 old.push(new_candidate);
266 candidate.clear();
267 }
268 }
269 // Oops, the candidate is already represented in the set.
270 if candidate.is_empty() {
271 continue 'OUTER;
272 }
273 }
274 new.lits.push(candidate);
275 }
276 new.lits.retain(|lit| !lit.is_empty());
277 new.lits.sort();
278 new.lits.dedup();
279 new
280 }
281
282 /// Returns a new set of suffixes of this set of literals that are
283 /// guaranteed to be unambiguous.
284 ///
285 /// Any substring match with a member of the set is returned is guaranteed
286 /// to never overlap with a substring match of another member of the set
287 /// at the same ending position.
288 ///
289 /// Given any two members of the returned set, neither is a substring of
290 /// the other.
291 pub fn unambiguous_suffixes(&self) -> Literals {
292 // This is a touch wasteful...
293 let mut lits = self.clone();
294 lits.reverse();
295 let mut unamb = lits.unambiguous_prefixes();
296 unamb.reverse();
297 unamb
298 }
299
300 /// Unions the prefixes from the given expression to this set.
301 ///
302 /// If prefixes could not be added (for example, this set would exceed its
303 /// size limits or the set of prefixes from `expr` includes the empty
304 /// string), then false is returned.
305 ///
306 /// Note that prefix literals extracted from `expr` are said to be complete
307 /// if and only if the literal extends from the beginning of `expr` to the
308 /// end of `expr`.
0531ce1d 309 pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
8bb4bdeb
XL
310 let mut lits = self.to_empty();
311 prefixes(expr, &mut lits);
312 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
313 }
314
315 /// Unions the suffixes from the given expression to this set.
316 ///
317 /// If suffixes could not be added (for example, this set would exceed its
318 /// size limits or the set of suffixes from `expr` includes the empty
319 /// string), then false is returned.
320 ///
321 /// Note that prefix literals extracted from `expr` are said to be complete
322 /// if and only if the literal extends from the end of `expr` to the
323 /// beginning of `expr`.
0531ce1d 324 pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
8bb4bdeb
XL
325 let mut lits = self.to_empty();
326 suffixes(expr, &mut lits);
327 lits.reverse();
328 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
329 }
330
331 /// Unions this set with another set.
332 ///
333 /// If the union would cause the set to exceed its limits, then the union
334 /// is skipped and it returns false. Otherwise, if the union succeeds, it
335 /// returns true.
336 pub fn union(&mut self, lits: Literals) -> bool {
337 if self.num_bytes() + lits.num_bytes() > self.limit_size {
338 return false;
339 }
340 if lits.is_empty() {
0531ce1d 341 self.lits.push(Literal::empty());
8bb4bdeb
XL
342 } else {
343 self.lits.extend(lits.lits);
344 }
345 true
346 }
347
348 /// Extends this set with another set.
349 ///
350 /// The set of literals is extended via a cross product.
351 ///
352 /// If a cross product would cause this set to exceed its limits, then the
353 /// cross product is skipped and it returns false. Otherwise, if the cross
354 /// product succeeds, it returns true.
355 pub fn cross_product(&mut self, lits: &Literals) -> bool {
356 if lits.is_empty() {
357 return true;
358 }
359 // Check that we make sure we stay in our limits.
360 let mut size_after;
361 if self.is_empty() || !self.any_complete() {
362 size_after = self.num_bytes();
363 for lits_lit in lits.literals() {
364 size_after += lits_lit.len();
365 }
366 } else {
367 size_after = self.lits.iter().fold(0, |accum, lit| {
368 accum + if lit.is_cut() { lit.len() } else { 0 }
369 });
370 for lits_lit in lits.literals() {
371 for self_lit in self.literals() {
372 if !self_lit.is_cut() {
373 size_after += self_lit.len() + lits_lit.len();
374 }
375 }
376 }
377 }
378 if size_after > self.limit_size {
379 return false;
380 }
381
382 let mut base = self.remove_complete();
383 if base.is_empty() {
0531ce1d 384 base = vec![Literal::empty()];
8bb4bdeb
XL
385 }
386 for lits_lit in lits.literals() {
387 for mut self_lit in base.clone() {
388 self_lit.extend(&**lits_lit);
389 self_lit.cut = lits_lit.cut;
390 self.lits.push(self_lit);
391 }
392 }
393 true
394 }
395
396 /// Extends each literal in this set with the bytes given.
397 ///
398 /// If the set is empty, then the given literal is added to the set.
399 ///
400 /// If adding any number of bytes to all members of this set causes a limit
401 /// to be exceeded, then no bytes are added and false is returned. If a
402 /// prefix of `bytes` can be fit into this set, then it is used and all
403 /// resulting literals are cut.
404 pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
405 // N.B. This could be implemented by simply calling cross_product with
406 // a literal set containing just `bytes`, but we can be smarter about
407 // taking shorter prefixes of `bytes` if they'll fit.
408 if bytes.is_empty() {
409 return true;
410 }
411 if self.lits.is_empty() {
412 let i = cmp::min(self.limit_size, bytes.len());
0531ce1d 413 self.lits.push(Literal::new(bytes[..i].to_owned()));
8bb4bdeb
XL
414 self.lits[0].cut = i < bytes.len();
415 return !self.lits[0].is_cut();
416 }
417 let size = self.num_bytes();
418 if size + self.lits.len() >= self.limit_size {
419 return false;
420 }
421 let mut i = 1;
422 while size + (i * self.lits.len()) <= self.limit_size
f9f354fc
XL
423 && i < bytes.len()
424 {
8bb4bdeb
XL
425 i += 1;
426 }
427 for lit in &mut self.lits {
428 if !lit.is_cut() {
429 lit.extend(&bytes[..i]);
430 if i < bytes.len() {
431 lit.cut();
432 }
433 }
434 }
435 true
436 }
437
438 /// Adds the given literal to this set.
439 ///
440 /// Returns false if adding this literal would cause the class to be too
441 /// big.
0531ce1d 442 pub fn add(&mut self, lit: Literal) -> bool {
8bb4bdeb
XL
443 if self.num_bytes() + lit.len() > self.limit_size {
444 return false;
445 }
446 self.lits.push(lit);
447 true
448 }
449
450 /// Extends each literal in this set with the character class given.
451 ///
452 /// Returns false if the character class was too big to add.
0531ce1d 453 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
8bb4bdeb
XL
454 self._add_char_class(cls, false)
455 }
456
457 /// Extends each literal in this set with the character class given,
458 /// writing the bytes of each character in reverse.
459 ///
460 /// Returns false if the character class was too big to add.
0531ce1d 461 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
8bb4bdeb
XL
462 self._add_char_class(cls, true)
463 }
464
0531ce1d
XL
465 fn _add_char_class(
466 &mut self,
467 cls: &hir::ClassUnicode,
468 reverse: bool,
469 ) -> bool {
8bb4bdeb
XL
470 use std::char;
471
0531ce1d 472 if self.class_exceeds_limits(cls_char_count(cls)) {
8bb4bdeb
XL
473 return false;
474 }
475 let mut base = self.remove_complete();
476 if base.is_empty() {
0531ce1d 477 base = vec![Literal::empty()];
8bb4bdeb 478 }
0531ce1d 479 for r in cls.iter() {
8bb4bdeb
XL
480 let (s, e) = (r.start as u32, r.end as u32 + 1);
481 for c in (s..e).filter_map(char::from_u32) {
482 for mut lit in base.clone() {
483 let mut bytes = c.to_string().into_bytes();
484 if reverse {
485 bytes.reverse();
486 }
487 lit.extend(&bytes);
488 self.lits.push(lit);
489 }
490 }
491 }
492 true
493 }
494
495 /// Extends each literal in this set with the byte class given.
496 ///
497 /// Returns false if the byte class was too big to add.
0531ce1d
XL
498 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
499 if self.class_exceeds_limits(cls_byte_count(cls)) {
8bb4bdeb
XL
500 return false;
501 }
502 let mut base = self.remove_complete();
503 if base.is_empty() {
0531ce1d 504 base = vec![Literal::empty()];
8bb4bdeb 505 }
0531ce1d 506 for r in cls.iter() {
8bb4bdeb
XL
507 let (s, e) = (r.start as u32, r.end as u32 + 1);
508 for b in (s..e).map(|b| b as u8) {
509 for mut lit in base.clone() {
510 lit.push(b);
511 self.lits.push(lit);
512 }
513 }
514 }
515 true
516 }
517
518 /// Cuts every member of this set. When a member is cut, it can never
519 /// be extended.
520 pub fn cut(&mut self) {
521 for lit in &mut self.lits {
522 lit.cut();
523 }
524 }
525
526 /// Reverses all members in place.
527 pub fn reverse(&mut self) {
528 for lit in &mut self.lits {
529 lit.reverse();
530 }
531 }
532
533 /// Clears this set of all members.
534 pub fn clear(&mut self) {
535 self.lits.clear();
536 }
537
538 /// Pops all complete literals out of this set.
0531ce1d 539 fn remove_complete(&mut self) -> Vec<Literal> {
8bb4bdeb
XL
540 let mut base = vec![];
541 for lit in mem::replace(&mut self.lits, vec![]) {
542 if lit.is_cut() {
543 self.lits.push(lit);
544 } else {
545 base.push(lit);
546 }
547 }
548 base
549 }
550
551 /// Returns the total number of bytes in this set.
552 fn num_bytes(&self) -> usize {
553 self.lits.iter().fold(0, |accum, lit| accum + lit.len())
554 }
555
556 /// Returns true if a character class with the given size would cause this
557 /// set to exceed its limits.
558 ///
559 /// The size given should correspond to the number of items in the class.
560 fn class_exceeds_limits(&self, size: usize) -> bool {
561 if size > self.limit_class {
562 return true;
563 }
564 // This is an approximation since codepoints in a char class can encode
565 // to 1-4 bytes.
f9f354fc
XL
566 let new_byte_count = if self.lits.is_empty() {
567 size
568 } else {
569 self.lits.iter().fold(0, |accum, lit| {
570 accum
571 + if lit.is_cut() {
572 // If the literal is cut, then we'll never add
573 // anything to it, so don't count it.
574 0
575 } else {
576 (lit.len() + 1) * size
577 }
578 })
579 };
8bb4bdeb
XL
580 new_byte_count > self.limit_size
581 }
582}
583
0531ce1d
XL
584fn prefixes(expr: &Hir, lits: &mut Literals) {
585 match *expr.kind() {
586 HirKind::Literal(hir::Literal::Unicode(c)) => {
94b46f34
XL
587 let mut buf = [0; 4];
588 lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
8bb4bdeb 589 }
0531ce1d
XL
590 HirKind::Literal(hir::Literal::Byte(b)) => {
591 lits.cross_add(&[b]);
8bb4bdeb 592 }
0531ce1d 593 HirKind::Class(hir::Class::Unicode(ref cls)) => {
8bb4bdeb
XL
594 if !lits.add_char_class(cls) {
595 lits.cut();
596 }
597 }
0531ce1d 598 HirKind::Class(hir::Class::Bytes(ref cls)) => {
8bb4bdeb
XL
599 if !lits.add_byte_class(cls) {
600 lits.cut();
601 }
602 }
0531ce1d
XL
603 HirKind::Group(hir::Group { ref hir, .. }) => {
604 prefixes(&**hir, lits);
8bb4bdeb 605 }
f9f354fc
XL
606 HirKind::Repetition(ref x) => match x.kind {
607 hir::RepetitionKind::ZeroOrOne => {
608 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
0531ce1d 609 }
f9f354fc
XL
610 hir::RepetitionKind::ZeroOrMore => {
611 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
612 }
613 hir::RepetitionKind::OneOrMore => {
614 repeat_one_or_more_literals(&x.hir, lits, prefixes);
615 }
616 hir::RepetitionKind::Range(ref rng) => {
617 let (min, max) = match *rng {
618 hir::RepetitionRange::Exactly(m) => (m, Some(m)),
619 hir::RepetitionRange::AtLeast(m) => (m, None),
620 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
621 };
622 repeat_range_literals(
623 &x.hir, min, max, x.greedy, lits, prefixes,
624 )
625 }
626 },
0531ce1d
XL
627 HirKind::Concat(ref es) if es.is_empty() => {}
628 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
629 HirKind::Concat(ref es) => {
8bb4bdeb 630 for e in es {
0531ce1d 631 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
8bb4bdeb
XL
632 if !lits.is_empty() {
633 lits.cut();
634 break;
635 }
0531ce1d 636 lits.add(Literal::empty());
8bb4bdeb
XL
637 continue;
638 }
639 let mut lits2 = lits.to_empty();
640 prefixes(e, &mut lits2);
641 if !lits.cross_product(&lits2) || !lits2.any_complete() {
642 // If this expression couldn't yield any literal that
643 // could be extended, then we need to quit. Since we're
644 // short-circuiting, we also need to freeze every member.
645 lits.cut();
646 break;
647 }
648 }
649 }
0531ce1d 650 HirKind::Alternation(ref es) => {
8bb4bdeb
XL
651 alternate_literals(es, lits, prefixes);
652 }
653 _ => lits.cut(),
654 }
655}
656
0531ce1d
XL
657fn suffixes(expr: &Hir, lits: &mut Literals) {
658 match *expr.kind() {
659 HirKind::Literal(hir::Literal::Unicode(c)) => {
660 let mut buf = [0u8; 4];
94b46f34 661 let i = c.encode_utf8(&mut buf).len();
f9f354fc 662 let buf = &mut buf[..i];
0531ce1d
XL
663 buf.reverse();
664 lits.cross_add(buf);
8bb4bdeb 665 }
0531ce1d
XL
666 HirKind::Literal(hir::Literal::Byte(b)) => {
667 lits.cross_add(&[b]);
8bb4bdeb 668 }
0531ce1d 669 HirKind::Class(hir::Class::Unicode(ref cls)) => {
8bb4bdeb
XL
670 if !lits.add_char_class_reverse(cls) {
671 lits.cut();
672 }
673 }
0531ce1d 674 HirKind::Class(hir::Class::Bytes(ref cls)) => {
8bb4bdeb
XL
675 if !lits.add_byte_class(cls) {
676 lits.cut();
677 }
678 }
0531ce1d
XL
679 HirKind::Group(hir::Group { ref hir, .. }) => {
680 suffixes(&**hir, lits);
8bb4bdeb 681 }
f9f354fc
XL
682 HirKind::Repetition(ref x) => match x.kind {
683 hir::RepetitionKind::ZeroOrOne => {
684 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
0531ce1d 685 }
f9f354fc
XL
686 hir::RepetitionKind::ZeroOrMore => {
687 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
688 }
689 hir::RepetitionKind::OneOrMore => {
690 repeat_one_or_more_literals(&x.hir, lits, suffixes);
691 }
692 hir::RepetitionKind::Range(ref rng) => {
693 let (min, max) = match *rng {
694 hir::RepetitionRange::Exactly(m) => (m, Some(m)),
695 hir::RepetitionRange::AtLeast(m) => (m, None),
696 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
697 };
698 repeat_range_literals(
699 &x.hir, min, max, x.greedy, lits, suffixes,
700 )
701 }
702 },
0531ce1d
XL
703 HirKind::Concat(ref es) if es.is_empty() => {}
704 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
705 HirKind::Concat(ref es) => {
8bb4bdeb 706 for e in es.iter().rev() {
0531ce1d 707 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
8bb4bdeb
XL
708 if !lits.is_empty() {
709 lits.cut();
710 break;
711 }
0531ce1d 712 lits.add(Literal::empty());
8bb4bdeb
XL
713 continue;
714 }
715 let mut lits2 = lits.to_empty();
716 suffixes(e, &mut lits2);
717 if !lits.cross_product(&lits2) || !lits2.any_complete() {
718 // If this expression couldn't yield any literal that
719 // could be extended, then we need to quit. Since we're
720 // short-circuiting, we also need to freeze every member.
721 lits.cut();
722 break;
723 }
724 }
725 }
0531ce1d 726 HirKind::Alternation(ref es) => {
8bb4bdeb
XL
727 alternate_literals(es, lits, suffixes);
728 }
729 _ => lits.cut(),
730 }
731}
732
0531ce1d
XL
733fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
734 e: &Hir,
8bb4bdeb
XL
735 lits: &mut Literals,
736 mut f: F,
737) {
923072b8
FG
738 f(
739 &Hir::repetition(hir::Repetition {
740 kind: hir::RepetitionKind::ZeroOrMore,
741 // FIXME: Our literal extraction doesn't care about greediness.
742 // Which is partially why we're treating 'e?' as 'e*'. Namely,
743 // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
744 // [Complete(a), Complete(ab)] because of the non-greediness.
745 greedy: true,
746 hir: Box::new(e.clone()),
747 }),
748 lits,
749 );
8bb4bdeb
XL
750}
751
0531ce1d
XL
752fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
753 e: &Hir,
8bb4bdeb
XL
754 lits: &mut Literals,
755 mut f: F,
756) {
757 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
758 lits3.set_limit_size(lits.limit_size() / 2);
759 f(e, &mut lits3);
760
761 if lits3.is_empty() || !lits2.cross_product(&lits3) {
762 lits.cut();
763 return;
764 }
765 lits2.cut();
0531ce1d 766 lits2.add(Literal::empty());
8bb4bdeb
XL
767 if !lits.union(lits2) {
768 lits.cut();
769 }
770}
771
0531ce1d
XL
772fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
773 e: &Hir,
8bb4bdeb
XL
774 lits: &mut Literals,
775 mut f: F,
776) {
777 f(e, lits);
778 lits.cut();
779}
780
0531ce1d
XL
781fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
782 e: &Hir,
8bb4bdeb
XL
783 min: u32,
784 max: Option<u32>,
785 greedy: bool,
786 lits: &mut Literals,
787 mut f: F,
788) {
8bb4bdeb
XL
789 if min == 0 {
790 // This is a bit conservative. If `max` is set, then we could
791 // treat this as a finite set of alternations. For now, we
792 // just treat it as `e*`.
f9f354fc
XL
793 f(
794 &Hir::repetition(hir::Repetition {
795 kind: hir::RepetitionKind::ZeroOrMore,
796 greedy: greedy,
797 hir: Box::new(e.clone()),
798 }),
799 lits,
800 );
8bb4bdeb
XL
801 } else {
802 if min > 0 {
803 let n = cmp::min(lits.limit_size, min as usize);
804 let es = iter::repeat(e.clone()).take(n).collect();
0531ce1d 805 f(&Hir::concat(es), lits);
041b39d2 806 if n < min as usize || lits.contains_empty() {
8bb4bdeb
XL
807 lits.cut();
808 }
809 }
810 if max.map_or(true, |max| min < max) {
811 lits.cut();
812 }
813 }
814}
815
0531ce1d
XL
816fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
817 es: &[Hir],
8bb4bdeb
XL
818 lits: &mut Literals,
819 mut f: F,
820) {
821 let mut lits2 = lits.to_empty();
822 for e in es {
823 let mut lits3 = lits.to_empty();
824 lits3.set_limit_size(lits.limit_size() / 5);
825 f(e, &mut lits3);
826 if lits3.is_empty() || !lits2.union(lits3) {
827 // If we couldn't find suffixes for *any* of the
828 // alternates, then the entire alternation has to be thrown
829 // away and any existing members must be frozen. Similarly,
830 // if the union couldn't complete, stop and freeze.
831 lits.cut();
832 return;
833 }
834 }
835 if !lits.cross_product(&lits2) {
836 lits.cut();
837 }
838}
839
840impl fmt::Debug for Literals {
17df50a5 841 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8bb4bdeb 842 f.debug_struct("Literals")
f9f354fc
XL
843 .field("lits", &self.lits)
844 .field("limit_size", &self.limit_size)
845 .field("limit_class", &self.limit_class)
846 .finish()
8bb4bdeb
XL
847 }
848}
849
0531ce1d 850impl Literal {
8bb4bdeb 851 /// Returns a new complete literal with the bytes given.
0531ce1d
XL
852 pub fn new(bytes: Vec<u8>) -> Literal {
853 Literal { v: bytes, cut: false }
8bb4bdeb
XL
854 }
855
856 /// Returns a new complete empty literal.
0531ce1d
XL
857 pub fn empty() -> Literal {
858 Literal { v: vec![], cut: false }
8bb4bdeb
XL
859 }
860
861 /// Returns true if this literal was "cut."
862 pub fn is_cut(&self) -> bool {
863 self.cut
864 }
865
866 /// Cuts this literal.
867 pub fn cut(&mut self) {
868 self.cut = true;
869 }
870}
871
0531ce1d
XL
872impl PartialEq for Literal {
873 fn eq(&self, other: &Literal) -> bool {
8bb4bdeb
XL
874 self.v == other.v
875 }
876}
877
0531ce1d
XL
878impl PartialOrd for Literal {
879 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
8bb4bdeb
XL
880 self.v.partial_cmp(&other.v)
881 }
882}
883
0531ce1d 884impl fmt::Debug for Literal {
17df50a5 885 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8bb4bdeb
XL
886 if self.is_cut() {
887 write!(f, "Cut({})", escape_unicode(&self.v))
888 } else {
889 write!(f, "Complete({})", escape_unicode(&self.v))
890 }
891 }
892}
893
0531ce1d 894impl AsRef<[u8]> for Literal {
f9f354fc
XL
895 fn as_ref(&self) -> &[u8] {
896 &self.v
897 }
8bb4bdeb
XL
898}
899
0531ce1d 900impl ops::Deref for Literal {
8bb4bdeb 901 type Target = Vec<u8>;
f9f354fc
XL
902 fn deref(&self) -> &Vec<u8> {
903 &self.v
904 }
8bb4bdeb
XL
905}
906
0531ce1d 907impl ops::DerefMut for Literal {
f9f354fc
XL
908 fn deref_mut(&mut self) -> &mut Vec<u8> {
909 &mut self.v
910 }
8bb4bdeb
XL
911}
912
913fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
914 let mut i = 0;
915 while haystack.len() >= needle.len() {
916 if needle == &haystack[..needle.len()] {
917 return Some(i);
918 }
919 i += 1;
920 haystack = &haystack[1..];
921 }
922 None
923}
924
925fn escape_unicode(bytes: &[u8]) -> String {
926 let show = match ::std::str::from_utf8(bytes) {
927 Ok(v) => v.to_string(),
928 Err(_) => escape_bytes(bytes),
929 };
930 let mut space_escaped = String::new();
931 for c in show.chars() {
932 if c.is_whitespace() {
933 let escaped = if c as u32 <= 0x7F {
934 escape_byte(c as u8)
935 } else {
936 if c as u32 <= 0xFFFF {
937 format!(r"\u{{{:04x}}}", c as u32)
938 } else {
939 format!(r"\U{{{:08x}}}", c as u32)
940 }
941 };
942 space_escaped.push_str(&escaped);
943 } else {
944 space_escaped.push(c);
945 }
946 }
947 space_escaped
948}
949
950fn escape_bytes(bytes: &[u8]) -> String {
951 let mut s = String::new();
952 for &b in bytes {
953 s.push_str(&escape_byte(b));
954 }
955 s
956}
957
958fn escape_byte(byte: u8) -> String {
959 use std::ascii::escape_default;
960
961 let escaped: Vec<u8> = escape_default(byte).collect();
962 String::from_utf8_lossy(&escaped).into_owned()
963}
964
0531ce1d 965fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
f9f354fc
XL
966 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
967 as usize
0531ce1d
XL
968}
969
970fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
f9f354fc
XL
971 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
972 as usize
0531ce1d
XL
973}
974
8bb4bdeb
XL
975#[cfg(test)]
976mod tests {
977 use std::fmt;
978
f9f354fc 979 use super::{escape_bytes, Literal, Literals};
17df50a5
XL
980 use crate::hir::Hir;
981 use crate::ParserBuilder;
8bb4bdeb
XL
982
983 // To make test failures easier to read.
984 #[derive(Debug, Eq, PartialEq)]
0531ce1d 985 struct Bytes(Vec<ULiteral>);
8bb4bdeb 986 #[derive(Debug, Eq, PartialEq)]
0531ce1d 987 struct Unicode(Vec<ULiteral>);
8bb4bdeb 988
0531ce1d 989 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
8bb4bdeb
XL
990 let mut ulits = vec![];
991 for blit in blits {
f9f354fc
XL
992 ulits
993 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
8bb4bdeb
XL
994 }
995 ulits
996 }
997
f9f354fc 998 fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
8bb4bdeb
XL
999 Literals {
1000 lits: it.into_iter().collect(),
1001 limit_size: 0,
1002 limit_class: 0,
1003 }
1004 }
1005
1006 // Needs to be pub for 1.3?
1007 #[derive(Clone, Eq, PartialEq)]
0531ce1d 1008 pub struct ULiteral {
8bb4bdeb
XL
1009 v: String,
1010 cut: bool,
1011 }
1012
0531ce1d 1013 impl ULiteral {
f9f354fc
XL
1014 fn is_cut(&self) -> bool {
1015 self.cut
1016 }
8bb4bdeb
XL
1017 }
1018
0531ce1d 1019 impl fmt::Debug for ULiteral {
17df50a5 1020 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8bb4bdeb
XL
1021 if self.is_cut() {
1022 write!(f, "Cut({})", self.v)
1023 } else {
1024 write!(f, "Complete({})", self.v)
1025 }
1026 }
1027 }
1028
0531ce1d
XL
1029 impl PartialEq<Literal> for ULiteral {
1030 fn eq(&self, other: &Literal) -> bool {
8bb4bdeb
XL
1031 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1032 }
1033 }
1034
0531ce1d
XL
1035 impl PartialEq<ULiteral> for Literal {
1036 fn eq(&self, other: &ULiteral) -> bool {
8bb4bdeb
XL
1037 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1038 }
1039 }
1040
1041 #[allow(non_snake_case)]
0531ce1d
XL
1042 fn C(s: &'static str) -> ULiteral {
1043 ULiteral { v: s.to_owned(), cut: true }
1044 }
8bb4bdeb 1045 #[allow(non_snake_case)]
0531ce1d
XL
1046 fn M(s: &'static str) -> ULiteral {
1047 ULiteral { v: s.to_owned(), cut: false }
1048 }
8bb4bdeb 1049
0531ce1d 1050 fn prefixes(lits: &mut Literals, expr: &Hir) {
8bb4bdeb
XL
1051 lits.union_prefixes(expr);
1052 }
1053
0531ce1d 1054 fn suffixes(lits: &mut Literals, expr: &Hir) {
8bb4bdeb
XL
1055 lits.union_suffixes(expr);
1056 }
1057
1058 macro_rules! assert_lit_eq {
1059 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
0531ce1d 1060 let expected: Vec<ULiteral> = vec![$($expected_lit),*];
8bb4bdeb
XL
1061 let lits = $got_lits;
1062 assert_eq!(
1063 $which(expected.clone()),
1064 $which(escape_lits(lits.literals())));
1065 assert_eq!(
1066 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1067 lits.all_complete());
1068 assert_eq!(
1069 expected.iter().any(|l| !l.is_cut()),
1070 lits.any_complete());
1071 }};
1072 }
1073
1074 macro_rules! test_lit {
1075 ($name:ident, $which:ident, $re:expr) => {
1076 test_lit!($name, $which, $re,);
1077 };
1078 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1079 #[test]
1080 fn $name() {
0531ce1d
XL
1081 let expr = ParserBuilder::new()
1082 .build()
1083 .parse($re)
1084 .unwrap();
1085 let lits = Literals::$which(&expr);
8bb4bdeb
XL
1086 assert_lit_eq!(Unicode, lits, $($lit),*);
1087
0531ce1d
XL
1088 let expr = ParserBuilder::new()
1089 .allow_invalid_utf8(true)
1090 .unicode(false)
1091 .build()
1092 .parse($re)
1093 .unwrap();
1094 let lits = Literals::$which(&expr);
8bb4bdeb
XL
1095 assert_lit_eq!(Bytes, lits, $($lit),*);
1096 }
1097 };
1098 }
1099
1100 // ************************************************************************
1101 // Tests for prefix literal extraction.
1102 // ************************************************************************
1103
1104 // Elementary tests.
1105 test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1106 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1107 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
f9f354fc 1108 #[cfg(feature = "unicode-case")]
8bb4bdeb 1109 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
f9f354fc
XL
1110 test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1111 test_lit!(
1112 pfx_class2,
1113 prefixes,
1114 "(?u)[☃Ⅰ]",
1115 M("\\xe2\\x85\\xa0"),
1116 M("\\xe2\\x98\\x83")
1117 );
1118 #[cfg(feature = "unicode-case")]
1119 test_lit!(
1120 pfx_class3,
1121 prefixes,
1122 "(?ui)[☃Ⅰ]",
1123 M("\\xe2\\x85\\xa0"),
1124 M("\\xe2\\x85\\xb0"),
1125 M("\\xe2\\x98\\x83")
1126 );
1127 test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1128 test_lit!(
1129 pfx_one_lit_casei2,
1130 prefixes,
1131 "(?i-u)abc",
1132 M("ABC"),
1133 M("aBC"),
1134 M("AbC"),
1135 M("abC"),
1136 M("ABc"),
1137 M("aBc"),
1138 M("Abc"),
1139 M("abc")
1140 );
8bb4bdeb
XL
1141 test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1142 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1143 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
923072b8
FG
1144 test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1145 // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1146 // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1147 // a cut literal.
1148 test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
8bb4bdeb
XL
1149 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1150 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1151 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1152 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1153 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1154 test_lit!(pfx_rep_range1, prefixes, "a{0}");
1155 test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1156 test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1157 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1158 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1159 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1160 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1161
1162 // Test regexes with concatenations.
1163 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1164 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
f9f354fc
XL
1165 test_lit!(
1166 pfx_cat3,
1167 prefixes,
1168 "(?i-u)[ab]z",
1169 M("AZ"),
1170 M("BZ"),
1171 M("aZ"),
1172 M("bZ"),
1173 M("Az"),
1174 M("Bz"),
1175 M("az"),
1176 M("bz")
1177 );
1178 test_lit!(
1179 pfx_cat4,
1180 prefixes,
1181 "[ab][yz]",
1182 M("ay"),
1183 M("by"),
1184 M("az"),
1185 M("bz")
1186 );
8bb4bdeb
XL
1187 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1188 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1189 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1190 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1191 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1192 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1193 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1194 test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1195 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1196 test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1197 test_lit!(pfx_cat15, prefixes, "$a");
1198 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1199 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1200 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1201 test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1202
1203 // Test regexes with alternations.
1204 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1205 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1206 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1207 test_lit!(pfx_alt4, prefixes, "a|b*");
1208 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1209 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
f9f354fc
XL
1210 test_lit!(
1211 pfx_alt7,
1212 prefixes,
1213 "(a|b)*c|(a|ab)*c",
1214 C("a"),
1215 C("b"),
1216 M("c"),
1217 C("a"),
1218 C("ab"),
1219 M("c")
1220 );
8bb4bdeb
XL
1221 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1222
1223 // Test regexes with empty assertions.
1224 test_lit!(pfx_empty1, prefixes, "^a", M("a"));
041b39d2
XL
1225 test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1226 test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1227 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
8bb4bdeb
XL
1228
1229 // Make sure some curious regexes have no prefixes.
1230 test_lit!(pfx_nothing1, prefixes, ".");
1231 test_lit!(pfx_nothing2, prefixes, "(?s).");
1232 test_lit!(pfx_nothing3, prefixes, "^");
1233 test_lit!(pfx_nothing4, prefixes, "$");
1234 test_lit!(pfx_nothing6, prefixes, "(?m)$");
1235 test_lit!(pfx_nothing7, prefixes, r"\b");
1236 test_lit!(pfx_nothing8, prefixes, r"\B");
1237
1238 // Test a few regexes that defeat any prefix literal detection.
1239 test_lit!(pfx_defeated1, prefixes, ".a");
1240 test_lit!(pfx_defeated2, prefixes, "(?s).a");
1241 test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1242 test_lit!(pfx_defeated4, prefixes, "a|.");
1243 test_lit!(pfx_defeated5, prefixes, ".|a");
1244 test_lit!(pfx_defeated6, prefixes, "a|^");
1245 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1246 test_lit!(pfx_defeated8, prefixes, "$a");
1247 test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1248 test_lit!(pfx_defeated10, prefixes, r"\ba");
1249 test_lit!(pfx_defeated11, prefixes, r"\Ba");
1250 test_lit!(pfx_defeated12, prefixes, "^*a");
1251 test_lit!(pfx_defeated13, prefixes, "^+a");
1252
1253 test_lit!(
1254 pfx_crazy1,
1255 prefixes,
1256 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
923072b8
FG
1257 C("Mo\\'"),
1258 C("Mu\\'"),
f9f354fc
XL
1259 C("Moam"),
1260 C("Muam")
1261 );
8bb4bdeb
XL
1262
1263 // ************************************************************************
1264 // Tests for quiting prefix literal search.
1265 // ************************************************************************
1266
1267 macro_rules! test_exhausted {
1268 ($name:ident, $which:ident, $re:expr) => {
1269 test_exhausted!($name, $which, $re,);
1270 };
1271 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1272 #[test]
1273 fn $name() {
0531ce1d
XL
1274 let expr = ParserBuilder::new()
1275 .build()
1276 .parse($re)
1277 .unwrap();
8bb4bdeb
XL
1278 let mut lits = Literals::empty();
1279 lits.set_limit_size(20).set_limit_class(10);
1280 $which(&mut lits, &expr);
1281 assert_lit_eq!(Unicode, lits, $($lit),*);
1282
0531ce1d
XL
1283 let expr = ParserBuilder::new()
1284 .allow_invalid_utf8(true)
1285 .unicode(false)
1286 .build()
1287 .parse($re)
1288 .unwrap();
8bb4bdeb
XL
1289 let mut lits = Literals::empty();
1290 lits.set_limit_size(20).set_limit_class(10);
1291 $which(&mut lits, &expr);
1292 assert_lit_eq!(Bytes, lits, $($lit),*);
1293 }
1294 };
1295 }
1296
1297 // These test use a much lower limit than the default so that we can
1298 // write test cases of reasonable size.
1299 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1300 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1301 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
f9f354fc
XL
1302 test_exhausted!(
1303 pfx_exhausted4,
1304 prefixes,
1305 "(?i-u)foobar",
1306 C("FO"),
1307 C("fO"),
1308 C("Fo"),
1309 C("fo")
1310 );
1311 test_exhausted!(
1312 pfx_exhausted5,
1313 prefixes,
1314 "(?:ab){100}",
1315 C("abababababababababab")
1316 );
1317 test_exhausted!(
1318 pfx_exhausted6,
1319 prefixes,
1320 "(?:(?:ab){100})*cd",
1321 C("ababababab"),
1322 M("cd")
1323 );
1324 test_exhausted!(
1325 pfx_exhausted7,
1326 prefixes,
1327 "z(?:(?:ab){100})*cd",
1328 C("zababababab"),
1329 M("zcd")
1330 );
1331 test_exhausted!(
1332 pfx_exhausted8,
1333 prefixes,
1334 "aaaaaaaaaaaaaaaaaaaaz",
1335 C("aaaaaaaaaaaaaaaaaaaa")
1336 );
8bb4bdeb
XL
1337
1338 // ************************************************************************
1339 // Tests for suffix literal extraction.
1340 // ************************************************************************
1341
1342 // Elementary tests.
1343 test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1344 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1345 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
f9f354fc 1346 #[cfg(feature = "unicode-case")]
8bb4bdeb 1347 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
f9f354fc
XL
1348 test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1349 test_lit!(
1350 sfx_class2,
1351 suffixes,
1352 "(?u)[☃Ⅰ]",
1353 M("\\xe2\\x85\\xa0"),
1354 M("\\xe2\\x98\\x83")
1355 );
1356 #[cfg(feature = "unicode-case")]
1357 test_lit!(
1358 sfx_class3,
1359 suffixes,
1360 "(?ui)[☃Ⅰ]",
1361 M("\\xe2\\x85\\xa0"),
1362 M("\\xe2\\x85\\xb0"),
1363 M("\\xe2\\x98\\x83")
1364 );
1365 test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1366 test_lit!(
1367 sfx_one_lit_casei2,
1368 suffixes,
1369 "(?i-u)abc",
1370 M("ABC"),
1371 M("ABc"),
1372 M("AbC"),
1373 M("Abc"),
1374 M("aBC"),
1375 M("aBc"),
1376 M("abC"),
1377 M("abc")
1378 );
8bb4bdeb
XL
1379 test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1380 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1381 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1382 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1383 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1384 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1385 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1386 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1387 test_lit!(sfx_rep_range1, suffixes, "a{0}");
1388 test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1389 test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1390 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1391 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1392 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1393 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1394
1395 // Test regexes with concatenations.
1396 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1397 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
f9f354fc
XL
1398 test_lit!(
1399 sfx_cat3,
1400 suffixes,
1401 "(?i-u)[ab]z",
1402 M("AZ"),
1403 M("Az"),
1404 M("BZ"),
1405 M("Bz"),
1406 M("aZ"),
1407 M("az"),
1408 M("bZ"),
1409 M("bz")
1410 );
1411 test_lit!(
1412 sfx_cat4,
1413 suffixes,
1414 "[ab][yz]",
1415 M("ay"),
1416 M("az"),
1417 M("by"),
1418 M("bz")
1419 );
8bb4bdeb
XL
1420 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1421 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1422 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1423 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1424 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1425 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1426 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1427 test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1428 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1429 test_lit!(sfx_cat14, suffixes, "a^");
1430 test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1431 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1432 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1433 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1434 test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1435
1436 // Test regexes with alternations.
1437 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1438 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1439 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1440 test_lit!(sfx_alt4, suffixes, "a|b*");
1441 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1442 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
f9f354fc
XL
1443 test_lit!(
1444 sfx_alt7,
1445 suffixes,
1446 "(a|b)*c|(a|ab)*c",
1447 C("ac"),
1448 C("bc"),
1449 M("c"),
1450 C("ac"),
1451 C("abc"),
1452 M("c")
1453 );
8bb4bdeb
XL
1454 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1455
1456 // Test regexes with empty assertions.
1457 test_lit!(sfx_empty1, suffixes, "a$", M("a"));
041b39d2 1458 test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
8bb4bdeb
XL
1459
1460 // Make sure some curious regexes have no suffixes.
1461 test_lit!(sfx_nothing1, suffixes, ".");
1462 test_lit!(sfx_nothing2, suffixes, "(?s).");
1463 test_lit!(sfx_nothing3, suffixes, "^");
1464 test_lit!(sfx_nothing4, suffixes, "$");
1465 test_lit!(sfx_nothing6, suffixes, "(?m)$");
1466 test_lit!(sfx_nothing7, suffixes, r"\b");
1467 test_lit!(sfx_nothing8, suffixes, r"\B");
1468
1469 // Test a few regexes that defeat any suffix literal detection.
1470 test_lit!(sfx_defeated1, suffixes, "a.");
1471 test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1472 test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1473 test_lit!(sfx_defeated4, suffixes, "a|.");
1474 test_lit!(sfx_defeated5, suffixes, ".|a");
1475 test_lit!(sfx_defeated6, suffixes, "a|^");
1476 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1477 test_lit!(sfx_defeated8, suffixes, "a^");
1478 test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1479 test_lit!(sfx_defeated10, suffixes, r"a\b");
1480 test_lit!(sfx_defeated11, suffixes, r"a\B");
1481 test_lit!(sfx_defeated12, suffixes, "a^*");
1482 test_lit!(sfx_defeated13, suffixes, "a^+");
1483
1484 // These test use a much lower limit than the default so that we can
1485 // write test cases of reasonable size.
1486 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1487 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1488 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
f9f354fc
XL
1489 test_exhausted!(
1490 sfx_exhausted4,
1491 suffixes,
1492 "(?i-u)foobar",
1493 C("AR"),
1494 C("Ar"),
1495 C("aR"),
1496 C("ar")
1497 );
1498 test_exhausted!(
1499 sfx_exhausted5,
1500 suffixes,
1501 "(?:ab){100}",
1502 C("abababababababababab")
1503 );
1504 test_exhausted!(
1505 sfx_exhausted6,
1506 suffixes,
1507 "cd(?:(?:ab){100})*",
1508 C("ababababab"),
1509 M("cd")
1510 );
1511 test_exhausted!(
1512 sfx_exhausted7,
1513 suffixes,
1514 "cd(?:(?:ab){100})*z",
1515 C("abababababz"),
1516 M("cdz")
1517 );
1518 test_exhausted!(
1519 sfx_exhausted8,
1520 suffixes,
1521 "zaaaaaaaaaaaaaaaaaaaa",
1522 C("aaaaaaaaaaaaaaaaaaaa")
1523 );
8bb4bdeb
XL
1524
1525 // ************************************************************************
1526 // Tests for generating unambiguous literal sets.
1527 // ************************************************************************
1528
1529 macro_rules! test_unamb {
1530 ($name:ident, $given:expr, $expected:expr) => {
1531 #[test]
1532 fn $name() {
f9f354fc 1533 let given: Vec<Literal> = $given
8bb4bdeb
XL
1534 .into_iter()
1535 .map(|ul| {
1536 let cut = ul.is_cut();
0531ce1d 1537 Literal { v: ul.v.into_bytes(), cut: cut }
8bb4bdeb
XL
1538 })
1539 .collect();
1540 let lits = create_lits(given);
1541 let got = lits.unambiguous_prefixes();
1542 assert_eq!($expected, escape_lits(got.literals()));
1543 }
1544 };
1545 }
1546
1547 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
f9f354fc
XL
1548 test_unamb!(
1549 unambiguous2,
1550 vec![M("zaaaaaa"), M("aa")],
1551 vec![C("aa"), C("z")]
1552 );
1553 test_unamb!(
1554 unambiguous3,
1555 vec![M("Sherlock"), M("Watson")],
1556 vec![M("Sherlock"), M("Watson")]
1557 );
8bb4bdeb
XL
1558 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1559 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1560 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1561 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1562 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
f9f354fc
XL
1563 test_unamb!(
1564 unambiguous9,
1565 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1566 vec![C("a"), C("b"), C("c")]
1567 );
1568 test_unamb!(
1569 unambiguous10,
1570 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1571 vec![C("Mo"), C("Mu")]
1572 );
1573 test_unamb!(
1574 unambiguous11,
1575 vec![M("zazb"), M("azb")],
1576 vec![C("a"), C("z")]
1577 );
8bb4bdeb 1578 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
f9f354fc
XL
1579 test_unamb!(
1580 unambiguous13,
1581 vec![M("ABCX"), M("CDAX"), M("BCX")],
1582 vec![C("A"), C("BCX"), C("CD")]
1583 );
1584 test_unamb!(
1585 unambiguous14,
1586 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1587 vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1588 );
1589 test_unamb!(
1590 unambiguous15,
1591 vec![M("IMG_"), M("MG_"), M("CIMG")],
1592 vec![C("C"), C("I"), C("MG_")]
1593 );
8bb4bdeb
XL
1594
1595 // ************************************************************************
1596 // Tests for suffix trimming.
1597 // ************************************************************************
1598 macro_rules! test_trim {
1599 ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1600 #[test]
1601 fn $name() {
f9f354fc 1602 let given: Vec<Literal> = $given
8bb4bdeb
XL
1603 .into_iter()
1604 .map(|ul| {
1605 let cut = ul.is_cut();
0531ce1d 1606 Literal { v: ul.v.into_bytes(), cut: cut }
8bb4bdeb
XL
1607 })
1608 .collect();
1609 let lits = create_lits(given);
1610 let got = lits.trim_suffix($trim).unwrap();
1611 assert_eq!($expected, escape_lits(got.literals()));
1612 }
f9f354fc 1613 };
8bb4bdeb
XL
1614 }
1615
1616 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1617 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1618 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1619 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1620
1621 // ************************************************************************
1622 // Tests for longest common prefix.
1623 // ************************************************************************
1624
1625 macro_rules! test_lcp {
1626 ($name:ident, $given:expr, $expected:expr) => {
1627 #[test]
1628 fn $name() {
f9f354fc 1629 let given: Vec<Literal> = $given
8bb4bdeb 1630 .into_iter()
0531ce1d 1631 .map(|s: &str| Literal {
8bb4bdeb
XL
1632 v: s.to_owned().into_bytes(),
1633 cut: false,
1634 })
1635 .collect();
1636 let lits = create_lits(given);
1637 let got = lits.longest_common_prefix();
1638 assert_eq!($expected, escape_bytes(got));
1639 }
1640 };
1641 }
1642
1643 test_lcp!(lcp1, vec!["a"], "a");
1644 test_lcp!(lcp2, vec![], "");
1645 test_lcp!(lcp3, vec!["a", "b"], "");
1646 test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1647 test_lcp!(lcp5, vec!["ab", "a"], "a");
1648 test_lcp!(lcp6, vec!["a", "ab"], "a");
1649 test_lcp!(lcp7, vec!["ab", "b"], "");
1650 test_lcp!(lcp8, vec!["b", "ab"], "");
1651 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1652 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1653 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1654 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1655
1656 // ************************************************************************
1657 // Tests for longest common suffix.
1658 // ************************************************************************
1659
1660 macro_rules! test_lcs {
1661 ($name:ident, $given:expr, $expected:expr) => {
1662 #[test]
1663 fn $name() {
f9f354fc 1664 let given: Vec<Literal> = $given
8bb4bdeb 1665 .into_iter()
0531ce1d 1666 .map(|s: &str| Literal {
8bb4bdeb
XL
1667 v: s.to_owned().into_bytes(),
1668 cut: false,
1669 })
1670 .collect();
1671 let lits = create_lits(given);
1672 let got = lits.longest_common_suffix();
1673 assert_eq!($expected, escape_bytes(got));
1674 }
1675 };
1676 }
1677
1678 test_lcs!(lcs1, vec!["a"], "a");
1679 test_lcs!(lcs2, vec![], "");
1680 test_lcs!(lcs3, vec!["a", "b"], "");
1681 test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1682 test_lcs!(lcs5, vec!["ab", "a"], "");
1683 test_lcs!(lcs6, vec!["a", "ab"], "");
1684 test_lcs!(lcs7, vec!["ab", "b"], "b");
1685 test_lcs!(lcs8, vec!["b", "ab"], "b");
1686 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1687 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1688 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1689 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1690}