]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | /*! |
2 | Provides routines for extracting literal prefixes and suffixes from an `Hir`. | |
3 | */ | |
4 | ||
8bb4bdeb XL |
5 | use std::cmp; |
6 | use std::fmt; | |
7 | use std::iter; | |
8 | use std::mem; | |
9 | use std::ops; | |
10 | ||
17df50a5 | 11 | use crate::hir::{self, Hir, HirKind}; |
8bb4bdeb XL |
12 | |
13 | /// A set of literal byte strings extracted from a regular expression. | |
14 | /// | |
0531ce1d XL |
15 | /// Every member of the set is a `Literal`, which is represented by a |
16 | /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is | |
17 | /// said to be either *complete* or *cut*. A complete literal means that | |
18 | /// it extends until the beginning (or end) of the regular expression. In | |
19 | /// some circumstances, this can be used to indicate a match in the regular | |
20 | /// expression. | |
21 | /// | |
22 | /// A key aspect of literal extraction is knowing when to stop. It is not | |
23 | /// feasible to blindly extract all literals from a regular expression, even if | |
24 | /// there are finitely many. For example, the regular expression `[0-9]{10}` | |
25 | /// has `10^10` distinct literals. For this reason, literal extraction is | |
26 | /// bounded to some low number by default using heuristics, but the limits can | |
27 | /// be tweaked. | |
8bb4bdeb | 28 | /// |
0531ce1d XL |
29 | /// **WARNING**: Literal extraction uses stack space proportional to the size |
30 | /// of the `Hir` expression. At some point, this drawback will be eliminated. | |
31 | /// To protect yourself, set a reasonable | |
32 | /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). | |
33 | /// This is done for you by default. | |
8bb4bdeb XL |
34 | #[derive(Clone, Eq, PartialEq)] |
35 | pub struct Literals { | |
0531ce1d | 36 | lits: Vec<Literal>, |
8bb4bdeb XL |
37 | limit_size: usize, |
38 | limit_class: usize, | |
39 | } | |
40 | ||
41 | /// A single member of a set of literals extracted from a regular expression. | |
42 | /// | |
43 | /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice | |
44 | /// and `Vec` operations are available. | |
45 | #[derive(Clone, Eq, Ord)] | |
0531ce1d | 46 | pub struct Literal { |
8bb4bdeb XL |
47 | v: Vec<u8>, |
48 | cut: bool, | |
49 | } | |
50 | ||
51 | impl Literals { | |
52 | /// Returns a new empty set of literals using default limits. | |
53 | pub fn empty() -> Literals { | |
f9f354fc | 54 | Literals { lits: vec![], limit_size: 250, limit_class: 10 } |
8bb4bdeb XL |
55 | } |
56 | ||
0531ce1d XL |
57 | /// Returns a set of literal prefixes extracted from the given `Hir`. |
58 | pub fn prefixes(expr: &Hir) -> Literals { | |
59 | let mut lits = Literals::empty(); | |
60 | lits.union_prefixes(expr); | |
61 | lits | |
62 | } | |
63 | ||
64 | /// Returns a set of literal suffixes extracted from the given `Hir`. | |
65 | pub fn suffixes(expr: &Hir) -> Literals { | |
66 | let mut lits = Literals::empty(); | |
67 | lits.union_suffixes(expr); | |
68 | lits | |
69 | } | |
70 | ||
8bb4bdeb XL |
71 | /// Get the approximate size limit (in bytes) of this set. |
72 | pub fn limit_size(&self) -> usize { | |
73 | self.limit_size | |
74 | } | |
75 | ||
76 | /// Set the approximate size limit (in bytes) of this set. | |
77 | /// | |
78 | /// If extracting a literal would put the set over this limit, then | |
79 | /// extraction stops. | |
80 | /// | |
81 | /// The new limits will only apply to additions to this set. Existing | |
82 | /// members remain unchanged, even if the set exceeds the new limit. | |
83 | pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { | |
84 | self.limit_size = size; | |
85 | self | |
86 | } | |
87 | ||
88 | /// Get the character class size limit for this set. | |
89 | pub fn limit_class(&self) -> usize { | |
90 | self.limit_class | |
91 | } | |
92 | ||
93 | /// Limits the size of character(or byte) classes considered. | |
94 | /// | |
95 | /// A value of `0` prevents all character classes from being considered. | |
96 | /// | |
97 | /// This limit also applies to case insensitive literals, since each | |
98 | /// character in the case insensitive literal is converted to a class, and | |
99 | /// then case folded. | |
100 | /// | |
101 | /// The new limits will only apply to additions to this set. Existing | |
102 | /// members remain unchanged, even if the set exceeds the new limit. | |
103 | pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { | |
104 | self.limit_class = size; | |
105 | self | |
106 | } | |
107 | ||
108 | /// Returns the set of literals as a slice. Its order is unspecified. | |
0531ce1d | 109 | pub fn literals(&self) -> &[Literal] { |
8bb4bdeb XL |
110 | &self.lits |
111 | } | |
112 | ||
113 | /// Returns the length of the smallest literal. | |
114 | /// | |
115 | /// Returns None is there are no literals in the set. | |
116 | pub fn min_len(&self) -> Option<usize> { | |
117 | let mut min = None; | |
118 | for lit in &self.lits { | |
119 | match min { | |
120 | None => min = Some(lit.len()), | |
121 | Some(m) if lit.len() < m => min = Some(lit.len()), | |
122 | _ => {} | |
123 | } | |
124 | } | |
125 | min | |
126 | } | |
127 | ||
128 | /// Returns true if all members in this set are complete. | |
129 | pub fn all_complete(&self) -> bool { | |
130 | !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) | |
131 | } | |
132 | ||
133 | /// Returns true if any member in this set is complete. | |
134 | pub fn any_complete(&self) -> bool { | |
135 | self.lits.iter().any(|lit| !lit.is_cut()) | |
136 | } | |
137 | ||
138 | /// Returns true if this set contains an empty literal. | |
139 | pub fn contains_empty(&self) -> bool { | |
140 | self.lits.iter().any(|lit| lit.is_empty()) | |
141 | } | |
142 | ||
143 | /// Returns true if this set is empty or if all of its members is empty. | |
144 | pub fn is_empty(&self) -> bool { | |
145 | self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) | |
146 | } | |
147 | ||
148 | /// Returns a new empty set of literals using this set's limits. | |
149 | pub fn to_empty(&self) -> Literals { | |
150 | let mut lits = Literals::empty(); | |
f9f354fc | 151 | lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); |
8bb4bdeb XL |
152 | lits |
153 | } | |
154 | ||
155 | /// Returns the longest common prefix of all members in this set. | |
156 | pub fn longest_common_prefix(&self) -> &[u8] { | |
157 | if self.is_empty() { | |
158 | return &[]; | |
159 | } | |
160 | let lit0 = &*self.lits[0]; | |
161 | let mut len = lit0.len(); | |
162 | for lit in &self.lits[1..] { | |
163 | len = cmp::min( | |
164 | len, | |
f9f354fc XL |
165 | lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), |
166 | ); | |
8bb4bdeb XL |
167 | } |
168 | &self.lits[0][..len] | |
169 | } | |
170 | ||
171 | /// Returns the longest common suffix of all members in this set. | |
172 | pub fn longest_common_suffix(&self) -> &[u8] { | |
173 | if self.is_empty() { | |
174 | return &[]; | |
175 | } | |
176 | let lit0 = &*self.lits[0]; | |
177 | let mut len = lit0.len(); | |
178 | for lit in &self.lits[1..] { | |
179 | len = cmp::min( | |
180 | len, | |
181 | lit.iter() | |
f9f354fc XL |
182 | .rev() |
183 | .zip(lit0.iter().rev()) | |
184 | .take_while(|&(a, b)| a == b) | |
185 | .count(), | |
186 | ); | |
8bb4bdeb XL |
187 | } |
188 | &self.lits[0][self.lits[0].len() - len..] | |
189 | } | |
190 | ||
191 | /// Returns a new set of literals with the given number of bytes trimmed | |
192 | /// from the suffix of each literal. | |
193 | /// | |
194 | /// If any literal would be cut out completely by trimming, then None is | |
195 | /// returned. | |
196 | /// | |
197 | /// Any duplicates that are created as a result of this transformation are | |
198 | /// removed. | |
199 | pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { | |
200 | if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { | |
201 | return None; | |
202 | } | |
203 | let mut new = self.to_empty(); | |
204 | for mut lit in self.lits.iter().cloned() { | |
205 | let new_len = lit.len() - num_bytes; | |
206 | lit.truncate(new_len); | |
207 | lit.cut(); | |
208 | new.lits.push(lit); | |
209 | } | |
210 | new.lits.sort(); | |
211 | new.lits.dedup(); | |
212 | Some(new) | |
213 | } | |
214 | ||
215 | /// Returns a new set of prefixes of this set of literals that are | |
216 | /// guaranteed to be unambiguous. | |
217 | /// | |
218 | /// Any substring match with a member of the set is returned is guaranteed | |
219 | /// to never overlap with a substring match of another member of the set | |
220 | /// at the same starting position. | |
221 | /// | |
222 | /// Given any two members of the returned set, neither is a substring of | |
223 | /// the other. | |
224 | pub fn unambiguous_prefixes(&self) -> Literals { | |
225 | if self.lits.is_empty() { | |
226 | return self.to_empty(); | |
227 | } | |
0531ce1d | 228 | let mut old: Vec<Literal> = self.lits.iter().cloned().collect(); |
8bb4bdeb | 229 | let mut new = self.to_empty(); |
f9f354fc | 230 | 'OUTER: while let Some(mut candidate) = old.pop() { |
8bb4bdeb XL |
231 | if candidate.is_empty() { |
232 | continue; | |
233 | } | |
234 | if new.lits.is_empty() { | |
235 | new.lits.push(candidate); | |
236 | continue; | |
237 | } | |
238 | for lit2 in &mut new.lits { | |
239 | if lit2.is_empty() { | |
240 | continue; | |
241 | } | |
242 | if &candidate == lit2 { | |
243 | // If the literal is already in the set, then we can | |
244 | // just drop it. But make sure that cut literals are | |
245 | // infectious! | |
246 | candidate.cut = candidate.cut || lit2.cut; | |
247 | lit2.cut = candidate.cut; | |
248 | continue 'OUTER; | |
249 | } | |
250 | if candidate.len() < lit2.len() { | |
251 | if let Some(i) = position(&candidate, &lit2) { | |
252 | candidate.cut(); | |
253 | let mut lit3 = lit2.clone(); | |
254 | lit3.truncate(i); | |
255 | lit3.cut(); | |
256 | old.push(lit3); | |
257 | lit2.clear(); | |
258 | } | |
259 | } else { | |
260 | if let Some(i) = position(&lit2, &candidate) { | |
261 | lit2.cut(); | |
262 | let mut new_candidate = candidate.clone(); | |
263 | new_candidate.truncate(i); | |
264 | new_candidate.cut(); | |
265 | old.push(new_candidate); | |
266 | candidate.clear(); | |
267 | } | |
268 | } | |
269 | // Oops, the candidate is already represented in the set. | |
270 | if candidate.is_empty() { | |
271 | continue 'OUTER; | |
272 | } | |
273 | } | |
274 | new.lits.push(candidate); | |
275 | } | |
276 | new.lits.retain(|lit| !lit.is_empty()); | |
277 | new.lits.sort(); | |
278 | new.lits.dedup(); | |
279 | new | |
280 | } | |
281 | ||
282 | /// Returns a new set of suffixes of this set of literals that are | |
283 | /// guaranteed to be unambiguous. | |
284 | /// | |
285 | /// Any substring match with a member of the set is returned is guaranteed | |
286 | /// to never overlap with a substring match of another member of the set | |
287 | /// at the same ending position. | |
288 | /// | |
289 | /// Given any two members of the returned set, neither is a substring of | |
290 | /// the other. | |
291 | pub fn unambiguous_suffixes(&self) -> Literals { | |
292 | // This is a touch wasteful... | |
293 | let mut lits = self.clone(); | |
294 | lits.reverse(); | |
295 | let mut unamb = lits.unambiguous_prefixes(); | |
296 | unamb.reverse(); | |
297 | unamb | |
298 | } | |
299 | ||
300 | /// Unions the prefixes from the given expression to this set. | |
301 | /// | |
302 | /// If prefixes could not be added (for example, this set would exceed its | |
303 | /// size limits or the set of prefixes from `expr` includes the empty | |
304 | /// string), then false is returned. | |
305 | /// | |
306 | /// Note that prefix literals extracted from `expr` are said to be complete | |
307 | /// if and only if the literal extends from the beginning of `expr` to the | |
308 | /// end of `expr`. | |
0531ce1d | 309 | pub fn union_prefixes(&mut self, expr: &Hir) -> bool { |
8bb4bdeb XL |
310 | let mut lits = self.to_empty(); |
311 | prefixes(expr, &mut lits); | |
312 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) | |
313 | } | |
314 | ||
315 | /// Unions the suffixes from the given expression to this set. | |
316 | /// | |
317 | /// If suffixes could not be added (for example, this set would exceed its | |
318 | /// size limits or the set of suffixes from `expr` includes the empty | |
319 | /// string), then false is returned. | |
320 | /// | |
321 | /// Note that prefix literals extracted from `expr` are said to be complete | |
322 | /// if and only if the literal extends from the end of `expr` to the | |
323 | /// beginning of `expr`. | |
0531ce1d | 324 | pub fn union_suffixes(&mut self, expr: &Hir) -> bool { |
8bb4bdeb XL |
325 | let mut lits = self.to_empty(); |
326 | suffixes(expr, &mut lits); | |
327 | lits.reverse(); | |
328 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) | |
329 | } | |
330 | ||
331 | /// Unions this set with another set. | |
332 | /// | |
333 | /// If the union would cause the set to exceed its limits, then the union | |
334 | /// is skipped and it returns false. Otherwise, if the union succeeds, it | |
335 | /// returns true. | |
336 | pub fn union(&mut self, lits: Literals) -> bool { | |
337 | if self.num_bytes() + lits.num_bytes() > self.limit_size { | |
338 | return false; | |
339 | } | |
340 | if lits.is_empty() { | |
0531ce1d | 341 | self.lits.push(Literal::empty()); |
8bb4bdeb XL |
342 | } else { |
343 | self.lits.extend(lits.lits); | |
344 | } | |
345 | true | |
346 | } | |
347 | ||
348 | /// Extends this set with another set. | |
349 | /// | |
350 | /// The set of literals is extended via a cross product. | |
351 | /// | |
352 | /// If a cross product would cause this set to exceed its limits, then the | |
353 | /// cross product is skipped and it returns false. Otherwise, if the cross | |
354 | /// product succeeds, it returns true. | |
355 | pub fn cross_product(&mut self, lits: &Literals) -> bool { | |
356 | if lits.is_empty() { | |
357 | return true; | |
358 | } | |
359 | // Check that we make sure we stay in our limits. | |
360 | let mut size_after; | |
361 | if self.is_empty() || !self.any_complete() { | |
362 | size_after = self.num_bytes(); | |
363 | for lits_lit in lits.literals() { | |
364 | size_after += lits_lit.len(); | |
365 | } | |
366 | } else { | |
367 | size_after = self.lits.iter().fold(0, |accum, lit| { | |
368 | accum + if lit.is_cut() { lit.len() } else { 0 } | |
369 | }); | |
370 | for lits_lit in lits.literals() { | |
371 | for self_lit in self.literals() { | |
372 | if !self_lit.is_cut() { | |
373 | size_after += self_lit.len() + lits_lit.len(); | |
374 | } | |
375 | } | |
376 | } | |
377 | } | |
378 | if size_after > self.limit_size { | |
379 | return false; | |
380 | } | |
381 | ||
382 | let mut base = self.remove_complete(); | |
383 | if base.is_empty() { | |
0531ce1d | 384 | base = vec![Literal::empty()]; |
8bb4bdeb XL |
385 | } |
386 | for lits_lit in lits.literals() { | |
387 | for mut self_lit in base.clone() { | |
388 | self_lit.extend(&**lits_lit); | |
389 | self_lit.cut = lits_lit.cut; | |
390 | self.lits.push(self_lit); | |
391 | } | |
392 | } | |
393 | true | |
394 | } | |
395 | ||
396 | /// Extends each literal in this set with the bytes given. | |
397 | /// | |
398 | /// If the set is empty, then the given literal is added to the set. | |
399 | /// | |
400 | /// If adding any number of bytes to all members of this set causes a limit | |
401 | /// to be exceeded, then no bytes are added and false is returned. If a | |
402 | /// prefix of `bytes` can be fit into this set, then it is used and all | |
403 | /// resulting literals are cut. | |
404 | pub fn cross_add(&mut self, bytes: &[u8]) -> bool { | |
405 | // N.B. This could be implemented by simply calling cross_product with | |
406 | // a literal set containing just `bytes`, but we can be smarter about | |
407 | // taking shorter prefixes of `bytes` if they'll fit. | |
408 | if bytes.is_empty() { | |
409 | return true; | |
410 | } | |
411 | if self.lits.is_empty() { | |
412 | let i = cmp::min(self.limit_size, bytes.len()); | |
0531ce1d | 413 | self.lits.push(Literal::new(bytes[..i].to_owned())); |
8bb4bdeb XL |
414 | self.lits[0].cut = i < bytes.len(); |
415 | return !self.lits[0].is_cut(); | |
416 | } | |
417 | let size = self.num_bytes(); | |
418 | if size + self.lits.len() >= self.limit_size { | |
419 | return false; | |
420 | } | |
421 | let mut i = 1; | |
422 | while size + (i * self.lits.len()) <= self.limit_size | |
f9f354fc XL |
423 | && i < bytes.len() |
424 | { | |
8bb4bdeb XL |
425 | i += 1; |
426 | } | |
427 | for lit in &mut self.lits { | |
428 | if !lit.is_cut() { | |
429 | lit.extend(&bytes[..i]); | |
430 | if i < bytes.len() { | |
431 | lit.cut(); | |
432 | } | |
433 | } | |
434 | } | |
435 | true | |
436 | } | |
437 | ||
438 | /// Adds the given literal to this set. | |
439 | /// | |
440 | /// Returns false if adding this literal would cause the class to be too | |
441 | /// big. | |
0531ce1d | 442 | pub fn add(&mut self, lit: Literal) -> bool { |
8bb4bdeb XL |
443 | if self.num_bytes() + lit.len() > self.limit_size { |
444 | return false; | |
445 | } | |
446 | self.lits.push(lit); | |
447 | true | |
448 | } | |
449 | ||
450 | /// Extends each literal in this set with the character class given. | |
451 | /// | |
452 | /// Returns false if the character class was too big to add. | |
0531ce1d | 453 | pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { |
8bb4bdeb XL |
454 | self._add_char_class(cls, false) |
455 | } | |
456 | ||
457 | /// Extends each literal in this set with the character class given, | |
458 | /// writing the bytes of each character in reverse. | |
459 | /// | |
460 | /// Returns false if the character class was too big to add. | |
0531ce1d | 461 | fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { |
8bb4bdeb XL |
462 | self._add_char_class(cls, true) |
463 | } | |
464 | ||
0531ce1d XL |
465 | fn _add_char_class( |
466 | &mut self, | |
467 | cls: &hir::ClassUnicode, | |
468 | reverse: bool, | |
469 | ) -> bool { | |
8bb4bdeb XL |
470 | use std::char; |
471 | ||
0531ce1d | 472 | if self.class_exceeds_limits(cls_char_count(cls)) { |
8bb4bdeb XL |
473 | return false; |
474 | } | |
475 | let mut base = self.remove_complete(); | |
476 | if base.is_empty() { | |
0531ce1d | 477 | base = vec![Literal::empty()]; |
8bb4bdeb | 478 | } |
0531ce1d | 479 | for r in cls.iter() { |
8bb4bdeb XL |
480 | let (s, e) = (r.start as u32, r.end as u32 + 1); |
481 | for c in (s..e).filter_map(char::from_u32) { | |
482 | for mut lit in base.clone() { | |
483 | let mut bytes = c.to_string().into_bytes(); | |
484 | if reverse { | |
485 | bytes.reverse(); | |
486 | } | |
487 | lit.extend(&bytes); | |
488 | self.lits.push(lit); | |
489 | } | |
490 | } | |
491 | } | |
492 | true | |
493 | } | |
494 | ||
495 | /// Extends each literal in this set with the byte class given. | |
496 | /// | |
497 | /// Returns false if the byte class was too big to add. | |
0531ce1d XL |
498 | pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { |
499 | if self.class_exceeds_limits(cls_byte_count(cls)) { | |
8bb4bdeb XL |
500 | return false; |
501 | } | |
502 | let mut base = self.remove_complete(); | |
503 | if base.is_empty() { | |
0531ce1d | 504 | base = vec![Literal::empty()]; |
8bb4bdeb | 505 | } |
0531ce1d | 506 | for r in cls.iter() { |
8bb4bdeb XL |
507 | let (s, e) = (r.start as u32, r.end as u32 + 1); |
508 | for b in (s..e).map(|b| b as u8) { | |
509 | for mut lit in base.clone() { | |
510 | lit.push(b); | |
511 | self.lits.push(lit); | |
512 | } | |
513 | } | |
514 | } | |
515 | true | |
516 | } | |
517 | ||
518 | /// Cuts every member of this set. When a member is cut, it can never | |
519 | /// be extended. | |
520 | pub fn cut(&mut self) { | |
521 | for lit in &mut self.lits { | |
522 | lit.cut(); | |
523 | } | |
524 | } | |
525 | ||
526 | /// Reverses all members in place. | |
527 | pub fn reverse(&mut self) { | |
528 | for lit in &mut self.lits { | |
529 | lit.reverse(); | |
530 | } | |
531 | } | |
532 | ||
533 | /// Clears this set of all members. | |
534 | pub fn clear(&mut self) { | |
535 | self.lits.clear(); | |
536 | } | |
537 | ||
538 | /// Pops all complete literals out of this set. | |
0531ce1d | 539 | fn remove_complete(&mut self) -> Vec<Literal> { |
8bb4bdeb XL |
540 | let mut base = vec![]; |
541 | for lit in mem::replace(&mut self.lits, vec![]) { | |
542 | if lit.is_cut() { | |
543 | self.lits.push(lit); | |
544 | } else { | |
545 | base.push(lit); | |
546 | } | |
547 | } | |
548 | base | |
549 | } | |
550 | ||
551 | /// Returns the total number of bytes in this set. | |
552 | fn num_bytes(&self) -> usize { | |
553 | self.lits.iter().fold(0, |accum, lit| accum + lit.len()) | |
554 | } | |
555 | ||
556 | /// Returns true if a character class with the given size would cause this | |
557 | /// set to exceed its limits. | |
558 | /// | |
559 | /// The size given should correspond to the number of items in the class. | |
560 | fn class_exceeds_limits(&self, size: usize) -> bool { | |
561 | if size > self.limit_class { | |
562 | return true; | |
563 | } | |
564 | // This is an approximation since codepoints in a char class can encode | |
565 | // to 1-4 bytes. | |
f9f354fc XL |
566 | let new_byte_count = if self.lits.is_empty() { |
567 | size | |
568 | } else { | |
569 | self.lits.iter().fold(0, |accum, lit| { | |
570 | accum | |
571 | + if lit.is_cut() { | |
572 | // If the literal is cut, then we'll never add | |
573 | // anything to it, so don't count it. | |
574 | 0 | |
575 | } else { | |
576 | (lit.len() + 1) * size | |
577 | } | |
578 | }) | |
579 | }; | |
8bb4bdeb XL |
580 | new_byte_count > self.limit_size |
581 | } | |
582 | } | |
583 | ||
0531ce1d XL |
584 | fn prefixes(expr: &Hir, lits: &mut Literals) { |
585 | match *expr.kind() { | |
586 | HirKind::Literal(hir::Literal::Unicode(c)) => { | |
94b46f34 XL |
587 | let mut buf = [0; 4]; |
588 | lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); | |
8bb4bdeb | 589 | } |
0531ce1d XL |
590 | HirKind::Literal(hir::Literal::Byte(b)) => { |
591 | lits.cross_add(&[b]); | |
8bb4bdeb | 592 | } |
0531ce1d | 593 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
8bb4bdeb XL |
594 | if !lits.add_char_class(cls) { |
595 | lits.cut(); | |
596 | } | |
597 | } | |
0531ce1d | 598 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
8bb4bdeb XL |
599 | if !lits.add_byte_class(cls) { |
600 | lits.cut(); | |
601 | } | |
602 | } | |
0531ce1d XL |
603 | HirKind::Group(hir::Group { ref hir, .. }) => { |
604 | prefixes(&**hir, lits); | |
8bb4bdeb | 605 | } |
f9f354fc XL |
606 | HirKind::Repetition(ref x) => match x.kind { |
607 | hir::RepetitionKind::ZeroOrOne => { | |
608 | repeat_zero_or_one_literals(&x.hir, lits, prefixes); | |
0531ce1d | 609 | } |
f9f354fc XL |
610 | hir::RepetitionKind::ZeroOrMore => { |
611 | repeat_zero_or_more_literals(&x.hir, lits, prefixes); | |
612 | } | |
613 | hir::RepetitionKind::OneOrMore => { | |
614 | repeat_one_or_more_literals(&x.hir, lits, prefixes); | |
615 | } | |
616 | hir::RepetitionKind::Range(ref rng) => { | |
617 | let (min, max) = match *rng { | |
618 | hir::RepetitionRange::Exactly(m) => (m, Some(m)), | |
619 | hir::RepetitionRange::AtLeast(m) => (m, None), | |
620 | hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), | |
621 | }; | |
622 | repeat_range_literals( | |
623 | &x.hir, min, max, x.greedy, lits, prefixes, | |
624 | ) | |
625 | } | |
626 | }, | |
0531ce1d XL |
627 | HirKind::Concat(ref es) if es.is_empty() => {} |
628 | HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), | |
629 | HirKind::Concat(ref es) => { | |
8bb4bdeb | 630 | for e in es { |
0531ce1d | 631 | if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { |
8bb4bdeb XL |
632 | if !lits.is_empty() { |
633 | lits.cut(); | |
634 | break; | |
635 | } | |
0531ce1d | 636 | lits.add(Literal::empty()); |
8bb4bdeb XL |
637 | continue; |
638 | } | |
639 | let mut lits2 = lits.to_empty(); | |
640 | prefixes(e, &mut lits2); | |
641 | if !lits.cross_product(&lits2) || !lits2.any_complete() { | |
642 | // If this expression couldn't yield any literal that | |
643 | // could be extended, then we need to quit. Since we're | |
644 | // short-circuiting, we also need to freeze every member. | |
645 | lits.cut(); | |
646 | break; | |
647 | } | |
648 | } | |
649 | } | |
0531ce1d | 650 | HirKind::Alternation(ref es) => { |
8bb4bdeb XL |
651 | alternate_literals(es, lits, prefixes); |
652 | } | |
653 | _ => lits.cut(), | |
654 | } | |
655 | } | |
656 | ||
0531ce1d XL |
657 | fn suffixes(expr: &Hir, lits: &mut Literals) { |
658 | match *expr.kind() { | |
659 | HirKind::Literal(hir::Literal::Unicode(c)) => { | |
660 | let mut buf = [0u8; 4]; | |
94b46f34 | 661 | let i = c.encode_utf8(&mut buf).len(); |
f9f354fc | 662 | let buf = &mut buf[..i]; |
0531ce1d XL |
663 | buf.reverse(); |
664 | lits.cross_add(buf); | |
8bb4bdeb | 665 | } |
0531ce1d XL |
666 | HirKind::Literal(hir::Literal::Byte(b)) => { |
667 | lits.cross_add(&[b]); | |
8bb4bdeb | 668 | } |
0531ce1d | 669 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
8bb4bdeb XL |
670 | if !lits.add_char_class_reverse(cls) { |
671 | lits.cut(); | |
672 | } | |
673 | } | |
0531ce1d | 674 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
8bb4bdeb XL |
675 | if !lits.add_byte_class(cls) { |
676 | lits.cut(); | |
677 | } | |
678 | } | |
0531ce1d XL |
679 | HirKind::Group(hir::Group { ref hir, .. }) => { |
680 | suffixes(&**hir, lits); | |
8bb4bdeb | 681 | } |
f9f354fc XL |
682 | HirKind::Repetition(ref x) => match x.kind { |
683 | hir::RepetitionKind::ZeroOrOne => { | |
684 | repeat_zero_or_one_literals(&x.hir, lits, suffixes); | |
0531ce1d | 685 | } |
f9f354fc XL |
686 | hir::RepetitionKind::ZeroOrMore => { |
687 | repeat_zero_or_more_literals(&x.hir, lits, suffixes); | |
688 | } | |
689 | hir::RepetitionKind::OneOrMore => { | |
690 | repeat_one_or_more_literals(&x.hir, lits, suffixes); | |
691 | } | |
692 | hir::RepetitionKind::Range(ref rng) => { | |
693 | let (min, max) = match *rng { | |
694 | hir::RepetitionRange::Exactly(m) => (m, Some(m)), | |
695 | hir::RepetitionRange::AtLeast(m) => (m, None), | |
696 | hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), | |
697 | }; | |
698 | repeat_range_literals( | |
699 | &x.hir, min, max, x.greedy, lits, suffixes, | |
700 | ) | |
701 | } | |
702 | }, | |
0531ce1d XL |
703 | HirKind::Concat(ref es) if es.is_empty() => {} |
704 | HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), | |
705 | HirKind::Concat(ref es) => { | |
8bb4bdeb | 706 | for e in es.iter().rev() { |
0531ce1d | 707 | if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { |
8bb4bdeb XL |
708 | if !lits.is_empty() { |
709 | lits.cut(); | |
710 | break; | |
711 | } | |
0531ce1d | 712 | lits.add(Literal::empty()); |
8bb4bdeb XL |
713 | continue; |
714 | } | |
715 | let mut lits2 = lits.to_empty(); | |
716 | suffixes(e, &mut lits2); | |
717 | if !lits.cross_product(&lits2) || !lits2.any_complete() { | |
718 | // If this expression couldn't yield any literal that | |
719 | // could be extended, then we need to quit. Since we're | |
720 | // short-circuiting, we also need to freeze every member. | |
721 | lits.cut(); | |
722 | break; | |
723 | } | |
724 | } | |
725 | } | |
0531ce1d | 726 | HirKind::Alternation(ref es) => { |
8bb4bdeb XL |
727 | alternate_literals(es, lits, suffixes); |
728 | } | |
729 | _ => lits.cut(), | |
730 | } | |
731 | } | |
732 | ||
0531ce1d XL |
733 | fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( |
734 | e: &Hir, | |
8bb4bdeb XL |
735 | lits: &mut Literals, |
736 | mut f: F, | |
737 | ) { | |
923072b8 FG |
738 | f( |
739 | &Hir::repetition(hir::Repetition { | |
740 | kind: hir::RepetitionKind::ZeroOrMore, | |
741 | // FIXME: Our literal extraction doesn't care about greediness. | |
742 | // Which is partially why we're treating 'e?' as 'e*'. Namely, | |
743 | // 'ab??' yields [Complete(ab), Complete(a)], but it should yield | |
744 | // [Complete(a), Complete(ab)] because of the non-greediness. | |
745 | greedy: true, | |
746 | hir: Box::new(e.clone()), | |
747 | }), | |
748 | lits, | |
749 | ); | |
8bb4bdeb XL |
750 | } |
751 | ||
0531ce1d XL |
752 | fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( |
753 | e: &Hir, | |
8bb4bdeb XL |
754 | lits: &mut Literals, |
755 | mut f: F, | |
756 | ) { | |
757 | let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); | |
758 | lits3.set_limit_size(lits.limit_size() / 2); | |
759 | f(e, &mut lits3); | |
760 | ||
761 | if lits3.is_empty() || !lits2.cross_product(&lits3) { | |
762 | lits.cut(); | |
763 | return; | |
764 | } | |
765 | lits2.cut(); | |
0531ce1d | 766 | lits2.add(Literal::empty()); |
8bb4bdeb XL |
767 | if !lits.union(lits2) { |
768 | lits.cut(); | |
769 | } | |
770 | } | |
771 | ||
0531ce1d XL |
772 | fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( |
773 | e: &Hir, | |
8bb4bdeb XL |
774 | lits: &mut Literals, |
775 | mut f: F, | |
776 | ) { | |
777 | f(e, lits); | |
778 | lits.cut(); | |
779 | } | |
780 | ||
0531ce1d XL |
781 | fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( |
782 | e: &Hir, | |
8bb4bdeb XL |
783 | min: u32, |
784 | max: Option<u32>, | |
785 | greedy: bool, | |
786 | lits: &mut Literals, | |
787 | mut f: F, | |
788 | ) { | |
8bb4bdeb XL |
789 | if min == 0 { |
790 | // This is a bit conservative. If `max` is set, then we could | |
791 | // treat this as a finite set of alternations. For now, we | |
792 | // just treat it as `e*`. | |
f9f354fc XL |
793 | f( |
794 | &Hir::repetition(hir::Repetition { | |
795 | kind: hir::RepetitionKind::ZeroOrMore, | |
796 | greedy: greedy, | |
797 | hir: Box::new(e.clone()), | |
798 | }), | |
799 | lits, | |
800 | ); | |
8bb4bdeb XL |
801 | } else { |
802 | if min > 0 { | |
803 | let n = cmp::min(lits.limit_size, min as usize); | |
804 | let es = iter::repeat(e.clone()).take(n).collect(); | |
0531ce1d | 805 | f(&Hir::concat(es), lits); |
041b39d2 | 806 | if n < min as usize || lits.contains_empty() { |
8bb4bdeb XL |
807 | lits.cut(); |
808 | } | |
809 | } | |
810 | if max.map_or(true, |max| min < max) { | |
811 | lits.cut(); | |
812 | } | |
813 | } | |
814 | } | |
815 | ||
0531ce1d XL |
816 | fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( |
817 | es: &[Hir], | |
8bb4bdeb XL |
818 | lits: &mut Literals, |
819 | mut f: F, | |
820 | ) { | |
821 | let mut lits2 = lits.to_empty(); | |
822 | for e in es { | |
823 | let mut lits3 = lits.to_empty(); | |
824 | lits3.set_limit_size(lits.limit_size() / 5); | |
825 | f(e, &mut lits3); | |
826 | if lits3.is_empty() || !lits2.union(lits3) { | |
827 | // If we couldn't find suffixes for *any* of the | |
828 | // alternates, then the entire alternation has to be thrown | |
829 | // away and any existing members must be frozen. Similarly, | |
830 | // if the union couldn't complete, stop and freeze. | |
831 | lits.cut(); | |
832 | return; | |
833 | } | |
834 | } | |
835 | if !lits.cross_product(&lits2) { | |
836 | lits.cut(); | |
837 | } | |
838 | } | |
839 | ||
840 | impl fmt::Debug for Literals { | |
17df50a5 | 841 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
8bb4bdeb | 842 | f.debug_struct("Literals") |
f9f354fc XL |
843 | .field("lits", &self.lits) |
844 | .field("limit_size", &self.limit_size) | |
845 | .field("limit_class", &self.limit_class) | |
846 | .finish() | |
8bb4bdeb XL |
847 | } |
848 | } | |
849 | ||
0531ce1d | 850 | impl Literal { |
8bb4bdeb | 851 | /// Returns a new complete literal with the bytes given. |
0531ce1d XL |
852 | pub fn new(bytes: Vec<u8>) -> Literal { |
853 | Literal { v: bytes, cut: false } | |
8bb4bdeb XL |
854 | } |
855 | ||
856 | /// Returns a new complete empty literal. | |
0531ce1d XL |
857 | pub fn empty() -> Literal { |
858 | Literal { v: vec![], cut: false } | |
8bb4bdeb XL |
859 | } |
860 | ||
861 | /// Returns true if this literal was "cut." | |
862 | pub fn is_cut(&self) -> bool { | |
863 | self.cut | |
864 | } | |
865 | ||
866 | /// Cuts this literal. | |
867 | pub fn cut(&mut self) { | |
868 | self.cut = true; | |
869 | } | |
870 | } | |
871 | ||
0531ce1d XL |
872 | impl PartialEq for Literal { |
873 | fn eq(&self, other: &Literal) -> bool { | |
8bb4bdeb XL |
874 | self.v == other.v |
875 | } | |
876 | } | |
877 | ||
0531ce1d XL |
878 | impl PartialOrd for Literal { |
879 | fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { | |
8bb4bdeb XL |
880 | self.v.partial_cmp(&other.v) |
881 | } | |
882 | } | |
883 | ||
0531ce1d | 884 | impl fmt::Debug for Literal { |
17df50a5 | 885 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
8bb4bdeb XL |
886 | if self.is_cut() { |
887 | write!(f, "Cut({})", escape_unicode(&self.v)) | |
888 | } else { | |
889 | write!(f, "Complete({})", escape_unicode(&self.v)) | |
890 | } | |
891 | } | |
892 | } | |
893 | ||
0531ce1d | 894 | impl AsRef<[u8]> for Literal { |
f9f354fc XL |
895 | fn as_ref(&self) -> &[u8] { |
896 | &self.v | |
897 | } | |
8bb4bdeb XL |
898 | } |
899 | ||
0531ce1d | 900 | impl ops::Deref for Literal { |
8bb4bdeb | 901 | type Target = Vec<u8>; |
f9f354fc XL |
902 | fn deref(&self) -> &Vec<u8> { |
903 | &self.v | |
904 | } | |
8bb4bdeb XL |
905 | } |
906 | ||
0531ce1d | 907 | impl ops::DerefMut for Literal { |
f9f354fc XL |
908 | fn deref_mut(&mut self) -> &mut Vec<u8> { |
909 | &mut self.v | |
910 | } | |
8bb4bdeb XL |
911 | } |
912 | ||
913 | fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { | |
914 | let mut i = 0; | |
915 | while haystack.len() >= needle.len() { | |
916 | if needle == &haystack[..needle.len()] { | |
917 | return Some(i); | |
918 | } | |
919 | i += 1; | |
920 | haystack = &haystack[1..]; | |
921 | } | |
922 | None | |
923 | } | |
924 | ||
925 | fn escape_unicode(bytes: &[u8]) -> String { | |
926 | let show = match ::std::str::from_utf8(bytes) { | |
927 | Ok(v) => v.to_string(), | |
928 | Err(_) => escape_bytes(bytes), | |
929 | }; | |
930 | let mut space_escaped = String::new(); | |
931 | for c in show.chars() { | |
932 | if c.is_whitespace() { | |
933 | let escaped = if c as u32 <= 0x7F { | |
934 | escape_byte(c as u8) | |
935 | } else { | |
936 | if c as u32 <= 0xFFFF { | |
937 | format!(r"\u{{{:04x}}}", c as u32) | |
938 | } else { | |
939 | format!(r"\U{{{:08x}}}", c as u32) | |
940 | } | |
941 | }; | |
942 | space_escaped.push_str(&escaped); | |
943 | } else { | |
944 | space_escaped.push(c); | |
945 | } | |
946 | } | |
947 | space_escaped | |
948 | } | |
949 | ||
950 | fn escape_bytes(bytes: &[u8]) -> String { | |
951 | let mut s = String::new(); | |
952 | for &b in bytes { | |
953 | s.push_str(&escape_byte(b)); | |
954 | } | |
955 | s | |
956 | } | |
957 | ||
958 | fn escape_byte(byte: u8) -> String { | |
959 | use std::ascii::escape_default; | |
960 | ||
961 | let escaped: Vec<u8> = escape_default(byte).collect(); | |
962 | String::from_utf8_lossy(&escaped).into_owned() | |
963 | } | |
964 | ||
0531ce1d | 965 | fn cls_char_count(cls: &hir::ClassUnicode) -> usize { |
f9f354fc XL |
966 | cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() |
967 | as usize | |
0531ce1d XL |
968 | } |
969 | ||
970 | fn cls_byte_count(cls: &hir::ClassBytes) -> usize { | |
f9f354fc XL |
971 | cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() |
972 | as usize | |
0531ce1d XL |
973 | } |
974 | ||
8bb4bdeb XL |
975 | #[cfg(test)] |
976 | mod tests { | |
977 | use std::fmt; | |
978 | ||
f9f354fc | 979 | use super::{escape_bytes, Literal, Literals}; |
17df50a5 XL |
980 | use crate::hir::Hir; |
981 | use crate::ParserBuilder; | |
8bb4bdeb XL |
982 | |
983 | // To make test failures easier to read. | |
984 | #[derive(Debug, Eq, PartialEq)] | |
0531ce1d | 985 | struct Bytes(Vec<ULiteral>); |
8bb4bdeb | 986 | #[derive(Debug, Eq, PartialEq)] |
0531ce1d | 987 | struct Unicode(Vec<ULiteral>); |
8bb4bdeb | 988 | |
0531ce1d | 989 | fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { |
8bb4bdeb XL |
990 | let mut ulits = vec![]; |
991 | for blit in blits { | |
f9f354fc XL |
992 | ulits |
993 | .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); | |
8bb4bdeb XL |
994 | } |
995 | ulits | |
996 | } | |
997 | ||
f9f354fc | 998 | fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { |
8bb4bdeb XL |
999 | Literals { |
1000 | lits: it.into_iter().collect(), | |
1001 | limit_size: 0, | |
1002 | limit_class: 0, | |
1003 | } | |
1004 | } | |
1005 | ||
1006 | // Needs to be pub for 1.3? | |
1007 | #[derive(Clone, Eq, PartialEq)] | |
0531ce1d | 1008 | pub struct ULiteral { |
8bb4bdeb XL |
1009 | v: String, |
1010 | cut: bool, | |
1011 | } | |
1012 | ||
0531ce1d | 1013 | impl ULiteral { |
f9f354fc XL |
1014 | fn is_cut(&self) -> bool { |
1015 | self.cut | |
1016 | } | |
8bb4bdeb XL |
1017 | } |
1018 | ||
0531ce1d | 1019 | impl fmt::Debug for ULiteral { |
17df50a5 | 1020 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
8bb4bdeb XL |
1021 | if self.is_cut() { |
1022 | write!(f, "Cut({})", self.v) | |
1023 | } else { | |
1024 | write!(f, "Complete({})", self.v) | |
1025 | } | |
1026 | } | |
1027 | } | |
1028 | ||
0531ce1d XL |
1029 | impl PartialEq<Literal> for ULiteral { |
1030 | fn eq(&self, other: &Literal) -> bool { | |
8bb4bdeb XL |
1031 | self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() |
1032 | } | |
1033 | } | |
1034 | ||
0531ce1d XL |
1035 | impl PartialEq<ULiteral> for Literal { |
1036 | fn eq(&self, other: &ULiteral) -> bool { | |
8bb4bdeb XL |
1037 | &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() |
1038 | } | |
1039 | } | |
1040 | ||
1041 | #[allow(non_snake_case)] | |
0531ce1d XL |
1042 | fn C(s: &'static str) -> ULiteral { |
1043 | ULiteral { v: s.to_owned(), cut: true } | |
1044 | } | |
8bb4bdeb | 1045 | #[allow(non_snake_case)] |
0531ce1d XL |
1046 | fn M(s: &'static str) -> ULiteral { |
1047 | ULiteral { v: s.to_owned(), cut: false } | |
1048 | } | |
8bb4bdeb | 1049 | |
0531ce1d | 1050 | fn prefixes(lits: &mut Literals, expr: &Hir) { |
8bb4bdeb XL |
1051 | lits.union_prefixes(expr); |
1052 | } | |
1053 | ||
0531ce1d | 1054 | fn suffixes(lits: &mut Literals, expr: &Hir) { |
8bb4bdeb XL |
1055 | lits.union_suffixes(expr); |
1056 | } | |
1057 | ||
1058 | macro_rules! assert_lit_eq { | |
1059 | ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ | |
0531ce1d | 1060 | let expected: Vec<ULiteral> = vec![$($expected_lit),*]; |
8bb4bdeb XL |
1061 | let lits = $got_lits; |
1062 | assert_eq!( | |
1063 | $which(expected.clone()), | |
1064 | $which(escape_lits(lits.literals()))); | |
1065 | assert_eq!( | |
1066 | !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), | |
1067 | lits.all_complete()); | |
1068 | assert_eq!( | |
1069 | expected.iter().any(|l| !l.is_cut()), | |
1070 | lits.any_complete()); | |
1071 | }}; | |
1072 | } | |
1073 | ||
1074 | macro_rules! test_lit { | |
1075 | ($name:ident, $which:ident, $re:expr) => { | |
1076 | test_lit!($name, $which, $re,); | |
1077 | }; | |
1078 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { | |
1079 | #[test] | |
1080 | fn $name() { | |
0531ce1d XL |
1081 | let expr = ParserBuilder::new() |
1082 | .build() | |
1083 | .parse($re) | |
1084 | .unwrap(); | |
1085 | let lits = Literals::$which(&expr); | |
8bb4bdeb XL |
1086 | assert_lit_eq!(Unicode, lits, $($lit),*); |
1087 | ||
0531ce1d XL |
1088 | let expr = ParserBuilder::new() |
1089 | .allow_invalid_utf8(true) | |
1090 | .unicode(false) | |
1091 | .build() | |
1092 | .parse($re) | |
1093 | .unwrap(); | |
1094 | let lits = Literals::$which(&expr); | |
8bb4bdeb XL |
1095 | assert_lit_eq!(Bytes, lits, $($lit),*); |
1096 | } | |
1097 | }; | |
1098 | } | |
1099 | ||
1100 | // ************************************************************************ | |
1101 | // Tests for prefix literal extraction. | |
1102 | // ************************************************************************ | |
1103 | ||
1104 | // Elementary tests. | |
1105 | test_lit!(pfx_one_lit1, prefixes, "a", M("a")); | |
1106 | test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); | |
1107 | test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); | |
f9f354fc | 1108 | #[cfg(feature = "unicode-case")] |
8bb4bdeb | 1109 | test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); |
f9f354fc XL |
1110 | test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); |
1111 | test_lit!( | |
1112 | pfx_class2, | |
1113 | prefixes, | |
1114 | "(?u)[â˜ƒâ… ]", | |
1115 | M("\\xe2\\x85\\xa0"), | |
1116 | M("\\xe2\\x98\\x83") | |
1117 | ); | |
1118 | #[cfg(feature = "unicode-case")] | |
1119 | test_lit!( | |
1120 | pfx_class3, | |
1121 | prefixes, | |
1122 | "(?ui)[â˜ƒâ… ]", | |
1123 | M("\\xe2\\x85\\xa0"), | |
1124 | M("\\xe2\\x85\\xb0"), | |
1125 | M("\\xe2\\x98\\x83") | |
1126 | ); | |
1127 | test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); | |
1128 | test_lit!( | |
1129 | pfx_one_lit_casei2, | |
1130 | prefixes, | |
1131 | "(?i-u)abc", | |
1132 | M("ABC"), | |
1133 | M("aBC"), | |
1134 | M("AbC"), | |
1135 | M("abC"), | |
1136 | M("ABc"), | |
1137 | M("aBc"), | |
1138 | M("Abc"), | |
1139 | M("abc") | |
1140 | ); | |
8bb4bdeb XL |
1141 | test_lit!(pfx_group1, prefixes, "(a)", M("a")); |
1142 | test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); | |
1143 | test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); | |
923072b8 FG |
1144 | test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); |
1145 | // FIXME: This should return [M("a"), M("ab")] because of the non-greedy | |
1146 | // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get | |
1147 | // a cut literal. | |
1148 | test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); | |
8bb4bdeb XL |
1149 | test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); |
1150 | test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); | |
1151 | test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); | |
1152 | test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); | |
1153 | test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); | |
1154 | test_lit!(pfx_rep_range1, prefixes, "a{0}"); | |
1155 | test_lit!(pfx_rep_range2, prefixes, "a{0,}"); | |
1156 | test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); | |
1157 | test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); | |
1158 | test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); | |
1159 | test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); | |
1160 | test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); | |
1161 | ||
1162 | // Test regexes with concatenations. | |
1163 | test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); | |
1164 | test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); | |
f9f354fc XL |
1165 | test_lit!( |
1166 | pfx_cat3, | |
1167 | prefixes, | |
1168 | "(?i-u)[ab]z", | |
1169 | M("AZ"), | |
1170 | M("BZ"), | |
1171 | M("aZ"), | |
1172 | M("bZ"), | |
1173 | M("Az"), | |
1174 | M("Bz"), | |
1175 | M("az"), | |
1176 | M("bz") | |
1177 | ); | |
1178 | test_lit!( | |
1179 | pfx_cat4, | |
1180 | prefixes, | |
1181 | "[ab][yz]", | |
1182 | M("ay"), | |
1183 | M("by"), | |
1184 | M("az"), | |
1185 | M("bz") | |
1186 | ); | |
8bb4bdeb XL |
1187 | test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); |
1188 | test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); | |
1189 | test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); | |
1190 | test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); | |
1191 | test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); | |
1192 | test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); | |
1193 | test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); | |
1194 | test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); | |
1195 | test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); | |
1196 | test_lit!(pfx_cat14, prefixes, "a^", C("a")); | |
1197 | test_lit!(pfx_cat15, prefixes, "$a"); | |
1198 | test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); | |
1199 | test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); | |
1200 | test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); | |
1201 | test_lit!(pfx_cat19, prefixes, "a.z", C("a")); | |
1202 | ||
1203 | // Test regexes with alternations. | |
1204 | test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); | |
1205 | test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); | |
1206 | test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); | |
1207 | test_lit!(pfx_alt4, prefixes, "a|b*"); | |
1208 | test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); | |
1209 | test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); | |
f9f354fc XL |
1210 | test_lit!( |
1211 | pfx_alt7, | |
1212 | prefixes, | |
1213 | "(a|b)*c|(a|ab)*c", | |
1214 | C("a"), | |
1215 | C("b"), | |
1216 | M("c"), | |
1217 | C("a"), | |
1218 | C("ab"), | |
1219 | M("c") | |
1220 | ); | |
8bb4bdeb XL |
1221 | test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); |
1222 | ||
1223 | // Test regexes with empty assertions. | |
1224 | test_lit!(pfx_empty1, prefixes, "^a", M("a")); | |
041b39d2 XL |
1225 | test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); |
1226 | test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); | |
1227 | test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); | |
8bb4bdeb XL |
1228 | |
1229 | // Make sure some curious regexes have no prefixes. | |
1230 | test_lit!(pfx_nothing1, prefixes, "."); | |
1231 | test_lit!(pfx_nothing2, prefixes, "(?s)."); | |
1232 | test_lit!(pfx_nothing3, prefixes, "^"); | |
1233 | test_lit!(pfx_nothing4, prefixes, "$"); | |
1234 | test_lit!(pfx_nothing6, prefixes, "(?m)$"); | |
1235 | test_lit!(pfx_nothing7, prefixes, r"\b"); | |
1236 | test_lit!(pfx_nothing8, prefixes, r"\B"); | |
1237 | ||
1238 | // Test a few regexes that defeat any prefix literal detection. | |
1239 | test_lit!(pfx_defeated1, prefixes, ".a"); | |
1240 | test_lit!(pfx_defeated2, prefixes, "(?s).a"); | |
1241 | test_lit!(pfx_defeated3, prefixes, "a*b*c*"); | |
1242 | test_lit!(pfx_defeated4, prefixes, "a|."); | |
1243 | test_lit!(pfx_defeated5, prefixes, ".|a"); | |
1244 | test_lit!(pfx_defeated6, prefixes, "a|^"); | |
1245 | test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); | |
1246 | test_lit!(pfx_defeated8, prefixes, "$a"); | |
1247 | test_lit!(pfx_defeated9, prefixes, "(?m)$a"); | |
1248 | test_lit!(pfx_defeated10, prefixes, r"\ba"); | |
1249 | test_lit!(pfx_defeated11, prefixes, r"\Ba"); | |
1250 | test_lit!(pfx_defeated12, prefixes, "^*a"); | |
1251 | test_lit!(pfx_defeated13, prefixes, "^+a"); | |
1252 | ||
1253 | test_lit!( | |
1254 | pfx_crazy1, | |
1255 | prefixes, | |
1256 | r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", | |
923072b8 FG |
1257 | C("Mo\\'"), |
1258 | C("Mu\\'"), | |
f9f354fc XL |
1259 | C("Moam"), |
1260 | C("Muam") | |
1261 | ); | |
8bb4bdeb XL |
1262 | |
1263 | // ************************************************************************ | |
1264 | // Tests for quiting prefix literal search. | |
1265 | // ************************************************************************ | |
1266 | ||
1267 | macro_rules! test_exhausted { | |
1268 | ($name:ident, $which:ident, $re:expr) => { | |
1269 | test_exhausted!($name, $which, $re,); | |
1270 | }; | |
1271 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { | |
1272 | #[test] | |
1273 | fn $name() { | |
0531ce1d XL |
1274 | let expr = ParserBuilder::new() |
1275 | .build() | |
1276 | .parse($re) | |
1277 | .unwrap(); | |
8bb4bdeb XL |
1278 | let mut lits = Literals::empty(); |
1279 | lits.set_limit_size(20).set_limit_class(10); | |
1280 | $which(&mut lits, &expr); | |
1281 | assert_lit_eq!(Unicode, lits, $($lit),*); | |
1282 | ||
0531ce1d XL |
1283 | let expr = ParserBuilder::new() |
1284 | .allow_invalid_utf8(true) | |
1285 | .unicode(false) | |
1286 | .build() | |
1287 | .parse($re) | |
1288 | .unwrap(); | |
8bb4bdeb XL |
1289 | let mut lits = Literals::empty(); |
1290 | lits.set_limit_size(20).set_limit_class(10); | |
1291 | $which(&mut lits, &expr); | |
1292 | assert_lit_eq!(Bytes, lits, $($lit),*); | |
1293 | } | |
1294 | }; | |
1295 | } | |
1296 | ||
1297 | // These test use a much lower limit than the default so that we can | |
1298 | // write test cases of reasonable size. | |
1299 | test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); | |
1300 | test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); | |
1301 | test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); | |
f9f354fc XL |
1302 | test_exhausted!( |
1303 | pfx_exhausted4, | |
1304 | prefixes, | |
1305 | "(?i-u)foobar", | |
1306 | C("FO"), | |
1307 | C("fO"), | |
1308 | C("Fo"), | |
1309 | C("fo") | |
1310 | ); | |
1311 | test_exhausted!( | |
1312 | pfx_exhausted5, | |
1313 | prefixes, | |
1314 | "(?:ab){100}", | |
1315 | C("abababababababababab") | |
1316 | ); | |
1317 | test_exhausted!( | |
1318 | pfx_exhausted6, | |
1319 | prefixes, | |
1320 | "(?:(?:ab){100})*cd", | |
1321 | C("ababababab"), | |
1322 | M("cd") | |
1323 | ); | |
1324 | test_exhausted!( | |
1325 | pfx_exhausted7, | |
1326 | prefixes, | |
1327 | "z(?:(?:ab){100})*cd", | |
1328 | C("zababababab"), | |
1329 | M("zcd") | |
1330 | ); | |
1331 | test_exhausted!( | |
1332 | pfx_exhausted8, | |
1333 | prefixes, | |
1334 | "aaaaaaaaaaaaaaaaaaaaz", | |
1335 | C("aaaaaaaaaaaaaaaaaaaa") | |
1336 | ); | |
8bb4bdeb XL |
1337 | |
1338 | // ************************************************************************ | |
1339 | // Tests for suffix literal extraction. | |
1340 | // ************************************************************************ | |
1341 | ||
1342 | // Elementary tests. | |
1343 | test_lit!(sfx_one_lit1, suffixes, "a", M("a")); | |
1344 | test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); | |
1345 | test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); | |
f9f354fc | 1346 | #[cfg(feature = "unicode-case")] |
8bb4bdeb | 1347 | test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); |
f9f354fc XL |
1348 | test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); |
1349 | test_lit!( | |
1350 | sfx_class2, | |
1351 | suffixes, | |
1352 | "(?u)[â˜ƒâ… ]", | |
1353 | M("\\xe2\\x85\\xa0"), | |
1354 | M("\\xe2\\x98\\x83") | |
1355 | ); | |
1356 | #[cfg(feature = "unicode-case")] | |
1357 | test_lit!( | |
1358 | sfx_class3, | |
1359 | suffixes, | |
1360 | "(?ui)[â˜ƒâ… ]", | |
1361 | M("\\xe2\\x85\\xa0"), | |
1362 | M("\\xe2\\x85\\xb0"), | |
1363 | M("\\xe2\\x98\\x83") | |
1364 | ); | |
1365 | test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); | |
1366 | test_lit!( | |
1367 | sfx_one_lit_casei2, | |
1368 | suffixes, | |
1369 | "(?i-u)abc", | |
1370 | M("ABC"), | |
1371 | M("ABc"), | |
1372 | M("AbC"), | |
1373 | M("Abc"), | |
1374 | M("aBC"), | |
1375 | M("aBc"), | |
1376 | M("abC"), | |
1377 | M("abc") | |
1378 | ); | |
8bb4bdeb XL |
1379 | test_lit!(sfx_group1, suffixes, "(a)", M("a")); |
1380 | test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); | |
1381 | test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); | |
1382 | test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); | |
1383 | test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); | |
1384 | test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); | |
1385 | test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); | |
1386 | test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); | |
1387 | test_lit!(sfx_rep_range1, suffixes, "a{0}"); | |
1388 | test_lit!(sfx_rep_range2, suffixes, "a{0,}"); | |
1389 | test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); | |
1390 | test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); | |
1391 | test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); | |
1392 | test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); | |
1393 | test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); | |
1394 | ||
1395 | // Test regexes with concatenations. | |
1396 | test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); | |
1397 | test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); | |
f9f354fc XL |
1398 | test_lit!( |
1399 | sfx_cat3, | |
1400 | suffixes, | |
1401 | "(?i-u)[ab]z", | |
1402 | M("AZ"), | |
1403 | M("Az"), | |
1404 | M("BZ"), | |
1405 | M("Bz"), | |
1406 | M("aZ"), | |
1407 | M("az"), | |
1408 | M("bZ"), | |
1409 | M("bz") | |
1410 | ); | |
1411 | test_lit!( | |
1412 | sfx_cat4, | |
1413 | suffixes, | |
1414 | "[ab][yz]", | |
1415 | M("ay"), | |
1416 | M("az"), | |
1417 | M("by"), | |
1418 | M("bz") | |
1419 | ); | |
8bb4bdeb XL |
1420 | test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); |
1421 | test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); | |
1422 | test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); | |
1423 | test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); | |
1424 | test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); | |
1425 | test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); | |
1426 | test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); | |
1427 | test_lit!(sfx_cat12, suffixes, "ab+", C("b")); | |
1428 | test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); | |
1429 | test_lit!(sfx_cat14, suffixes, "a^"); | |
1430 | test_lit!(sfx_cat15, suffixes, "$a", C("a")); | |
1431 | test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); | |
1432 | test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); | |
1433 | test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); | |
1434 | test_lit!(sfx_cat19, suffixes, "a.z", C("z")); | |
1435 | ||
1436 | // Test regexes with alternations. | |
1437 | test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); | |
1438 | test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); | |
1439 | test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); | |
1440 | test_lit!(sfx_alt4, suffixes, "a|b*"); | |
1441 | test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); | |
1442 | test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); | |
f9f354fc XL |
1443 | test_lit!( |
1444 | sfx_alt7, | |
1445 | suffixes, | |
1446 | "(a|b)*c|(a|ab)*c", | |
1447 | C("ac"), | |
1448 | C("bc"), | |
1449 | M("c"), | |
1450 | C("ac"), | |
1451 | C("abc"), | |
1452 | M("c") | |
1453 | ); | |
8bb4bdeb XL |
1454 | test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); |
1455 | ||
1456 | // Test regexes with empty assertions. | |
1457 | test_lit!(sfx_empty1, suffixes, "a$", M("a")); | |
041b39d2 | 1458 | test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); |
8bb4bdeb XL |
1459 | |
1460 | // Make sure some curious regexes have no suffixes. | |
1461 | test_lit!(sfx_nothing1, suffixes, "."); | |
1462 | test_lit!(sfx_nothing2, suffixes, "(?s)."); | |
1463 | test_lit!(sfx_nothing3, suffixes, "^"); | |
1464 | test_lit!(sfx_nothing4, suffixes, "$"); | |
1465 | test_lit!(sfx_nothing6, suffixes, "(?m)$"); | |
1466 | test_lit!(sfx_nothing7, suffixes, r"\b"); | |
1467 | test_lit!(sfx_nothing8, suffixes, r"\B"); | |
1468 | ||
1469 | // Test a few regexes that defeat any suffix literal detection. | |
1470 | test_lit!(sfx_defeated1, suffixes, "a."); | |
1471 | test_lit!(sfx_defeated2, suffixes, "(?s)a."); | |
1472 | test_lit!(sfx_defeated3, suffixes, "a*b*c*"); | |
1473 | test_lit!(sfx_defeated4, suffixes, "a|."); | |
1474 | test_lit!(sfx_defeated5, suffixes, ".|a"); | |
1475 | test_lit!(sfx_defeated6, suffixes, "a|^"); | |
1476 | test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); | |
1477 | test_lit!(sfx_defeated8, suffixes, "a^"); | |
1478 | test_lit!(sfx_defeated9, suffixes, "(?m)a$"); | |
1479 | test_lit!(sfx_defeated10, suffixes, r"a\b"); | |
1480 | test_lit!(sfx_defeated11, suffixes, r"a\B"); | |
1481 | test_lit!(sfx_defeated12, suffixes, "a^*"); | |
1482 | test_lit!(sfx_defeated13, suffixes, "a^+"); | |
1483 | ||
1484 | // These test use a much lower limit than the default so that we can | |
1485 | // write test cases of reasonable size. | |
1486 | test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); | |
1487 | test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); | |
1488 | test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); | |
f9f354fc XL |
1489 | test_exhausted!( |
1490 | sfx_exhausted4, | |
1491 | suffixes, | |
1492 | "(?i-u)foobar", | |
1493 | C("AR"), | |
1494 | C("Ar"), | |
1495 | C("aR"), | |
1496 | C("ar") | |
1497 | ); | |
1498 | test_exhausted!( | |
1499 | sfx_exhausted5, | |
1500 | suffixes, | |
1501 | "(?:ab){100}", | |
1502 | C("abababababababababab") | |
1503 | ); | |
1504 | test_exhausted!( | |
1505 | sfx_exhausted6, | |
1506 | suffixes, | |
1507 | "cd(?:(?:ab){100})*", | |
1508 | C("ababababab"), | |
1509 | M("cd") | |
1510 | ); | |
1511 | test_exhausted!( | |
1512 | sfx_exhausted7, | |
1513 | suffixes, | |
1514 | "cd(?:(?:ab){100})*z", | |
1515 | C("abababababz"), | |
1516 | M("cdz") | |
1517 | ); | |
1518 | test_exhausted!( | |
1519 | sfx_exhausted8, | |
1520 | suffixes, | |
1521 | "zaaaaaaaaaaaaaaaaaaaa", | |
1522 | C("aaaaaaaaaaaaaaaaaaaa") | |
1523 | ); | |
8bb4bdeb XL |
1524 | |
1525 | // ************************************************************************ | |
1526 | // Tests for generating unambiguous literal sets. | |
1527 | // ************************************************************************ | |
1528 | ||
1529 | macro_rules! test_unamb { | |
1530 | ($name:ident, $given:expr, $expected:expr) => { | |
1531 | #[test] | |
1532 | fn $name() { | |
f9f354fc | 1533 | let given: Vec<Literal> = $given |
8bb4bdeb XL |
1534 | .into_iter() |
1535 | .map(|ul| { | |
1536 | let cut = ul.is_cut(); | |
0531ce1d | 1537 | Literal { v: ul.v.into_bytes(), cut: cut } |
8bb4bdeb XL |
1538 | }) |
1539 | .collect(); | |
1540 | let lits = create_lits(given); | |
1541 | let got = lits.unambiguous_prefixes(); | |
1542 | assert_eq!($expected, escape_lits(got.literals())); | |
1543 | } | |
1544 | }; | |
1545 | } | |
1546 | ||
1547 | test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); | |
f9f354fc XL |
1548 | test_unamb!( |
1549 | unambiguous2, | |
1550 | vec![M("zaaaaaa"), M("aa")], | |
1551 | vec![C("aa"), C("z")] | |
1552 | ); | |
1553 | test_unamb!( | |
1554 | unambiguous3, | |
1555 | vec![M("Sherlock"), M("Watson")], | |
1556 | vec![M("Sherlock"), M("Watson")] | |
1557 | ); | |
8bb4bdeb XL |
1558 | test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); |
1559 | test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); | |
1560 | test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); | |
1561 | test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); | |
1562 | test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); | |
f9f354fc XL |
1563 | test_unamb!( |
1564 | unambiguous9, | |
1565 | vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], | |
1566 | vec![C("a"), C("b"), C("c")] | |
1567 | ); | |
1568 | test_unamb!( | |
1569 | unambiguous10, | |
1570 | vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], | |
1571 | vec![C("Mo"), C("Mu")] | |
1572 | ); | |
1573 | test_unamb!( | |
1574 | unambiguous11, | |
1575 | vec![M("zazb"), M("azb")], | |
1576 | vec![C("a"), C("z")] | |
1577 | ); | |
8bb4bdeb | 1578 | test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); |
f9f354fc XL |
1579 | test_unamb!( |
1580 | unambiguous13, | |
1581 | vec![M("ABCX"), M("CDAX"), M("BCX")], | |
1582 | vec![C("A"), C("BCX"), C("CD")] | |
1583 | ); | |
1584 | test_unamb!( | |
1585 | unambiguous14, | |
1586 | vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], | |
1587 | vec![M("DSX"), C("I"), C("MGX"), C("MV")] | |
1588 | ); | |
1589 | test_unamb!( | |
1590 | unambiguous15, | |
1591 | vec![M("IMG_"), M("MG_"), M("CIMG")], | |
1592 | vec![C("C"), C("I"), C("MG_")] | |
1593 | ); | |
8bb4bdeb XL |
1594 | |
1595 | // ************************************************************************ | |
1596 | // Tests for suffix trimming. | |
1597 | // ************************************************************************ | |
1598 | macro_rules! test_trim { | |
1599 | ($name:ident, $trim:expr, $given:expr, $expected:expr) => { | |
1600 | #[test] | |
1601 | fn $name() { | |
f9f354fc | 1602 | let given: Vec<Literal> = $given |
8bb4bdeb XL |
1603 | .into_iter() |
1604 | .map(|ul| { | |
1605 | let cut = ul.is_cut(); | |
0531ce1d | 1606 | Literal { v: ul.v.into_bytes(), cut: cut } |
8bb4bdeb XL |
1607 | }) |
1608 | .collect(); | |
1609 | let lits = create_lits(given); | |
1610 | let got = lits.trim_suffix($trim).unwrap(); | |
1611 | assert_eq!($expected, escape_lits(got.literals())); | |
1612 | } | |
f9f354fc | 1613 | }; |
8bb4bdeb XL |
1614 | } |
1615 | ||
1616 | test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); | |
1617 | test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); | |
1618 | test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); | |
1619 | test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); | |
1620 | ||
1621 | // ************************************************************************ | |
1622 | // Tests for longest common prefix. | |
1623 | // ************************************************************************ | |
1624 | ||
1625 | macro_rules! test_lcp { | |
1626 | ($name:ident, $given:expr, $expected:expr) => { | |
1627 | #[test] | |
1628 | fn $name() { | |
f9f354fc | 1629 | let given: Vec<Literal> = $given |
8bb4bdeb | 1630 | .into_iter() |
0531ce1d | 1631 | .map(|s: &str| Literal { |
8bb4bdeb XL |
1632 | v: s.to_owned().into_bytes(), |
1633 | cut: false, | |
1634 | }) | |
1635 | .collect(); | |
1636 | let lits = create_lits(given); | |
1637 | let got = lits.longest_common_prefix(); | |
1638 | assert_eq!($expected, escape_bytes(got)); | |
1639 | } | |
1640 | }; | |
1641 | } | |
1642 | ||
1643 | test_lcp!(lcp1, vec!["a"], "a"); | |
1644 | test_lcp!(lcp2, vec![], ""); | |
1645 | test_lcp!(lcp3, vec!["a", "b"], ""); | |
1646 | test_lcp!(lcp4, vec!["ab", "ab"], "ab"); | |
1647 | test_lcp!(lcp5, vec!["ab", "a"], "a"); | |
1648 | test_lcp!(lcp6, vec!["a", "ab"], "a"); | |
1649 | test_lcp!(lcp7, vec!["ab", "b"], ""); | |
1650 | test_lcp!(lcp8, vec!["b", "ab"], ""); | |
1651 | test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); | |
1652 | test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); | |
1653 | test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); | |
1654 | test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); | |
1655 | ||
1656 | // ************************************************************************ | |
1657 | // Tests for longest common suffix. | |
1658 | // ************************************************************************ | |
1659 | ||
1660 | macro_rules! test_lcs { | |
1661 | ($name:ident, $given:expr, $expected:expr) => { | |
1662 | #[test] | |
1663 | fn $name() { | |
f9f354fc | 1664 | let given: Vec<Literal> = $given |
8bb4bdeb | 1665 | .into_iter() |
0531ce1d | 1666 | .map(|s: &str| Literal { |
8bb4bdeb XL |
1667 | v: s.to_owned().into_bytes(), |
1668 | cut: false, | |
1669 | }) | |
1670 | .collect(); | |
1671 | let lits = create_lits(given); | |
1672 | let got = lits.longest_common_suffix(); | |
1673 | assert_eq!($expected, escape_bytes(got)); | |
1674 | } | |
1675 | }; | |
1676 | } | |
1677 | ||
1678 | test_lcs!(lcs1, vec!["a"], "a"); | |
1679 | test_lcs!(lcs2, vec![], ""); | |
1680 | test_lcs!(lcs3, vec!["a", "b"], ""); | |
1681 | test_lcs!(lcs4, vec!["ab", "ab"], "ab"); | |
1682 | test_lcs!(lcs5, vec!["ab", "a"], ""); | |
1683 | test_lcs!(lcs6, vec!["a", "ab"], ""); | |
1684 | test_lcs!(lcs7, vec!["ab", "b"], "b"); | |
1685 | test_lcs!(lcs8, vec!["b", "ab"], "b"); | |
1686 | test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); | |
1687 | test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); | |
1688 | test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); | |
1689 | test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); | |
1690 | } |