]>
Commit | Line | Data |
---|---|---|
8bb4bdeb XL |
1 | // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use std::cmp; | |
12 | use std::fmt; | |
13 | use std::iter; | |
14 | use std::mem; | |
15 | use std::ops; | |
16 | ||
17 | use {Expr, CharClass, ClassRange, ByteClass, ByteRange, Repeater}; | |
18 | ||
19 | /// A set of literal byte strings extracted from a regular expression. | |
20 | /// | |
21 | /// Every member of the set is a `Lit`, which is represented by a `Vec<u8>`. | |
22 | /// (Notably, it may contain invalid UTF-8.) Every member is said to be either | |
23 | /// *complete* or *cut*. A complete literal means that it extends until the | |
24 | /// beginning (or end) of the regular expression. In some circumstances, this | |
25 | /// can be used to indicate a match in the regular expression. | |
26 | /// | |
27 | /// Note that a key aspect of literal extraction is knowing when to stop. It is | |
28 | /// not feasible to blindly extract all literals from a regular expression, | |
29 | /// even if there are finitely many. For example, the regular expression | |
30 | /// `[0-9]{10}` has `10^10` distinct literals. For this reason, literal | |
31 | /// extraction is bounded to some low number by default using heuristics, but | |
32 | /// the limits can be tweaked. | |
33 | #[derive(Clone, Eq, PartialEq)] | |
34 | pub struct Literals { | |
35 | lits: Vec<Lit>, | |
36 | limit_size: usize, | |
37 | limit_class: usize, | |
38 | } | |
39 | ||
40 | /// A single member of a set of literals extracted from a regular expression. | |
41 | /// | |
42 | /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice | |
43 | /// and `Vec` operations are available. | |
44 | #[derive(Clone, Eq, Ord)] | |
45 | pub struct Lit { | |
46 | v: Vec<u8>, | |
47 | cut: bool, | |
48 | } | |
49 | ||
50 | impl Literals { | |
51 | /// Returns a new empty set of literals using default limits. | |
52 | pub fn empty() -> Literals { | |
53 | Literals { | |
54 | lits: vec![], | |
55 | limit_size: 250, | |
56 | limit_class: 10, | |
57 | } | |
58 | } | |
59 | ||
60 | /// Get the approximate size limit (in bytes) of this set. | |
61 | pub fn limit_size(&self) -> usize { | |
62 | self.limit_size | |
63 | } | |
64 | ||
65 | /// Set the approximate size limit (in bytes) of this set. | |
66 | /// | |
67 | /// If extracting a literal would put the set over this limit, then | |
68 | /// extraction stops. | |
69 | /// | |
70 | /// The new limits will only apply to additions to this set. Existing | |
71 | /// members remain unchanged, even if the set exceeds the new limit. | |
72 | pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { | |
73 | self.limit_size = size; | |
74 | self | |
75 | } | |
76 | ||
77 | /// Get the character class size limit for this set. | |
78 | pub fn limit_class(&self) -> usize { | |
79 | self.limit_class | |
80 | } | |
81 | ||
82 | /// Limits the size of character(or byte) classes considered. | |
83 | /// | |
84 | /// A value of `0` prevents all character classes from being considered. | |
85 | /// | |
86 | /// This limit also applies to case insensitive literals, since each | |
87 | /// character in the case insensitive literal is converted to a class, and | |
88 | /// then case folded. | |
89 | /// | |
90 | /// The new limits will only apply to additions to this set. Existing | |
91 | /// members remain unchanged, even if the set exceeds the new limit. | |
92 | pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { | |
93 | self.limit_class = size; | |
94 | self | |
95 | } | |
96 | ||
97 | /// Returns the set of literals as a slice. Its order is unspecified. | |
98 | pub fn literals(&self) -> &[Lit] { | |
99 | &self.lits | |
100 | } | |
101 | ||
102 | /// Returns the length of the smallest literal. | |
103 | /// | |
104 | /// Returns None is there are no literals in the set. | |
105 | pub fn min_len(&self) -> Option<usize> { | |
106 | let mut min = None; | |
107 | for lit in &self.lits { | |
108 | match min { | |
109 | None => min = Some(lit.len()), | |
110 | Some(m) if lit.len() < m => min = Some(lit.len()), | |
111 | _ => {} | |
112 | } | |
113 | } | |
114 | min | |
115 | } | |
116 | ||
117 | /// Returns true if all members in this set are complete. | |
118 | pub fn all_complete(&self) -> bool { | |
119 | !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) | |
120 | } | |
121 | ||
122 | /// Returns true if any member in this set is complete. | |
123 | pub fn any_complete(&self) -> bool { | |
124 | self.lits.iter().any(|lit| !lit.is_cut()) | |
125 | } | |
126 | ||
127 | /// Returns true if this set contains an empty literal. | |
128 | pub fn contains_empty(&self) -> bool { | |
129 | self.lits.iter().any(|lit| lit.is_empty()) | |
130 | } | |
131 | ||
132 | /// Returns true if this set is empty or if all of its members is empty. | |
133 | pub fn is_empty(&self) -> bool { | |
134 | self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) | |
135 | } | |
136 | ||
137 | /// Returns a new empty set of literals using this set's limits. | |
138 | pub fn to_empty(&self) -> Literals { | |
139 | let mut lits = Literals::empty(); | |
140 | lits.set_limit_size(self.limit_size) | |
141 | .set_limit_class(self.limit_class); | |
142 | lits | |
143 | } | |
144 | ||
145 | /// Returns the longest common prefix of all members in this set. | |
146 | pub fn longest_common_prefix(&self) -> &[u8] { | |
147 | if self.is_empty() { | |
148 | return &[]; | |
149 | } | |
150 | let lit0 = &*self.lits[0]; | |
151 | let mut len = lit0.len(); | |
152 | for lit in &self.lits[1..] { | |
153 | len = cmp::min( | |
154 | len, | |
155 | lit.iter() | |
156 | .zip(lit0) | |
157 | .take_while(|&(a, b)| a == b) | |
158 | .count()); | |
159 | } | |
160 | &self.lits[0][..len] | |
161 | } | |
162 | ||
163 | /// Returns the longest common suffix of all members in this set. | |
164 | pub fn longest_common_suffix(&self) -> &[u8] { | |
165 | if self.is_empty() { | |
166 | return &[]; | |
167 | } | |
168 | let lit0 = &*self.lits[0]; | |
169 | let mut len = lit0.len(); | |
170 | for lit in &self.lits[1..] { | |
171 | len = cmp::min( | |
172 | len, | |
173 | lit.iter() | |
174 | .rev() | |
175 | .zip(lit0.iter().rev()) | |
176 | .take_while(|&(a, b)| a == b) | |
177 | .count()); | |
178 | } | |
179 | &self.lits[0][self.lits[0].len() - len..] | |
180 | } | |
181 | ||
182 | /// Returns a new set of literals with the given number of bytes trimmed | |
183 | /// from the suffix of each literal. | |
184 | /// | |
185 | /// If any literal would be cut out completely by trimming, then None is | |
186 | /// returned. | |
187 | /// | |
188 | /// Any duplicates that are created as a result of this transformation are | |
189 | /// removed. | |
190 | pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { | |
191 | if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { | |
192 | return None; | |
193 | } | |
194 | let mut new = self.to_empty(); | |
195 | for mut lit in self.lits.iter().cloned() { | |
196 | let new_len = lit.len() - num_bytes; | |
197 | lit.truncate(new_len); | |
198 | lit.cut(); | |
199 | new.lits.push(lit); | |
200 | } | |
201 | new.lits.sort(); | |
202 | new.lits.dedup(); | |
203 | Some(new) | |
204 | } | |
205 | ||
206 | /// Returns a new set of prefixes of this set of literals that are | |
207 | /// guaranteed to be unambiguous. | |
208 | /// | |
209 | /// Any substring match with a member of the set is returned is guaranteed | |
210 | /// to never overlap with a substring match of another member of the set | |
211 | /// at the same starting position. | |
212 | /// | |
213 | /// Given any two members of the returned set, neither is a substring of | |
214 | /// the other. | |
215 | pub fn unambiguous_prefixes(&self) -> Literals { | |
216 | if self.lits.is_empty() { | |
217 | return self.to_empty(); | |
218 | } | |
219 | let mut old: Vec<Lit> = self.lits.iter().cloned().collect(); | |
220 | let mut new = self.to_empty(); | |
221 | 'OUTER: | |
222 | while let Some(mut candidate) = old.pop() { | |
223 | if candidate.is_empty() { | |
224 | continue; | |
225 | } | |
226 | if new.lits.is_empty() { | |
227 | new.lits.push(candidate); | |
228 | continue; | |
229 | } | |
230 | for lit2 in &mut new.lits { | |
231 | if lit2.is_empty() { | |
232 | continue; | |
233 | } | |
234 | if &candidate == lit2 { | |
235 | // If the literal is already in the set, then we can | |
236 | // just drop it. But make sure that cut literals are | |
237 | // infectious! | |
238 | candidate.cut = candidate.cut || lit2.cut; | |
239 | lit2.cut = candidate.cut; | |
240 | continue 'OUTER; | |
241 | } | |
242 | if candidate.len() < lit2.len() { | |
243 | if let Some(i) = position(&candidate, &lit2) { | |
244 | candidate.cut(); | |
245 | let mut lit3 = lit2.clone(); | |
246 | lit3.truncate(i); | |
247 | lit3.cut(); | |
248 | old.push(lit3); | |
249 | lit2.clear(); | |
250 | } | |
251 | } else { | |
252 | if let Some(i) = position(&lit2, &candidate) { | |
253 | lit2.cut(); | |
254 | let mut new_candidate = candidate.clone(); | |
255 | new_candidate.truncate(i); | |
256 | new_candidate.cut(); | |
257 | old.push(new_candidate); | |
258 | candidate.clear(); | |
259 | } | |
260 | } | |
261 | // Oops, the candidate is already represented in the set. | |
262 | if candidate.is_empty() { | |
263 | continue 'OUTER; | |
264 | } | |
265 | } | |
266 | new.lits.push(candidate); | |
267 | } | |
268 | new.lits.retain(|lit| !lit.is_empty()); | |
269 | new.lits.sort(); | |
270 | new.lits.dedup(); | |
271 | new | |
272 | } | |
273 | ||
274 | /// Returns a new set of suffixes of this set of literals that are | |
275 | /// guaranteed to be unambiguous. | |
276 | /// | |
277 | /// Any substring match with a member of the set is returned is guaranteed | |
278 | /// to never overlap with a substring match of another member of the set | |
279 | /// at the same ending position. | |
280 | /// | |
281 | /// Given any two members of the returned set, neither is a substring of | |
282 | /// the other. | |
283 | pub fn unambiguous_suffixes(&self) -> Literals { | |
284 | // This is a touch wasteful... | |
285 | let mut lits = self.clone(); | |
286 | lits.reverse(); | |
287 | let mut unamb = lits.unambiguous_prefixes(); | |
288 | unamb.reverse(); | |
289 | unamb | |
290 | } | |
291 | ||
292 | /// Unions the prefixes from the given expression to this set. | |
293 | /// | |
294 | /// If prefixes could not be added (for example, this set would exceed its | |
295 | /// size limits or the set of prefixes from `expr` includes the empty | |
296 | /// string), then false is returned. | |
297 | /// | |
298 | /// Note that prefix literals extracted from `expr` are said to be complete | |
299 | /// if and only if the literal extends from the beginning of `expr` to the | |
300 | /// end of `expr`. | |
301 | pub fn union_prefixes(&mut self, expr: &Expr) -> bool { | |
302 | let mut lits = self.to_empty(); | |
303 | prefixes(expr, &mut lits); | |
304 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) | |
305 | } | |
306 | ||
307 | /// Unions the suffixes from the given expression to this set. | |
308 | /// | |
309 | /// If suffixes could not be added (for example, this set would exceed its | |
310 | /// size limits or the set of suffixes from `expr` includes the empty | |
311 | /// string), then false is returned. | |
312 | /// | |
313 | /// Note that prefix literals extracted from `expr` are said to be complete | |
314 | /// if and only if the literal extends from the end of `expr` to the | |
315 | /// beginning of `expr`. | |
316 | pub fn union_suffixes(&mut self, expr: &Expr) -> bool { | |
317 | let mut lits = self.to_empty(); | |
318 | suffixes(expr, &mut lits); | |
319 | lits.reverse(); | |
320 | !lits.is_empty() && !lits.contains_empty() && self.union(lits) | |
321 | } | |
322 | ||
323 | /// Unions this set with another set. | |
324 | /// | |
325 | /// If the union would cause the set to exceed its limits, then the union | |
326 | /// is skipped and it returns false. Otherwise, if the union succeeds, it | |
327 | /// returns true. | |
328 | pub fn union(&mut self, lits: Literals) -> bool { | |
329 | if self.num_bytes() + lits.num_bytes() > self.limit_size { | |
330 | return false; | |
331 | } | |
332 | if lits.is_empty() { | |
333 | self.lits.push(Lit::empty()); | |
334 | } else { | |
335 | self.lits.extend(lits.lits); | |
336 | } | |
337 | true | |
338 | } | |
339 | ||
340 | /// Extends this set with another set. | |
341 | /// | |
342 | /// The set of literals is extended via a cross product. | |
343 | /// | |
344 | /// If a cross product would cause this set to exceed its limits, then the | |
345 | /// cross product is skipped and it returns false. Otherwise, if the cross | |
346 | /// product succeeds, it returns true. | |
347 | pub fn cross_product(&mut self, lits: &Literals) -> bool { | |
348 | if lits.is_empty() { | |
349 | return true; | |
350 | } | |
351 | // Check that we make sure we stay in our limits. | |
352 | let mut size_after; | |
353 | if self.is_empty() || !self.any_complete() { | |
354 | size_after = self.num_bytes(); | |
355 | for lits_lit in lits.literals() { | |
356 | size_after += lits_lit.len(); | |
357 | } | |
358 | } else { | |
359 | size_after = self.lits.iter().fold(0, |accum, lit| { | |
360 | accum + if lit.is_cut() { lit.len() } else { 0 } | |
361 | }); | |
362 | for lits_lit in lits.literals() { | |
363 | for self_lit in self.literals() { | |
364 | if !self_lit.is_cut() { | |
365 | size_after += self_lit.len() + lits_lit.len(); | |
366 | } | |
367 | } | |
368 | } | |
369 | } | |
370 | if size_after > self.limit_size { | |
371 | return false; | |
372 | } | |
373 | ||
374 | let mut base = self.remove_complete(); | |
375 | if base.is_empty() { | |
376 | base = vec![Lit::empty()]; | |
377 | } | |
378 | for lits_lit in lits.literals() { | |
379 | for mut self_lit in base.clone() { | |
380 | self_lit.extend(&**lits_lit); | |
381 | self_lit.cut = lits_lit.cut; | |
382 | self.lits.push(self_lit); | |
383 | } | |
384 | } | |
385 | true | |
386 | } | |
387 | ||
388 | /// Extends each literal in this set with the bytes given. | |
389 | /// | |
390 | /// If the set is empty, then the given literal is added to the set. | |
391 | /// | |
392 | /// If adding any number of bytes to all members of this set causes a limit | |
393 | /// to be exceeded, then no bytes are added and false is returned. If a | |
394 | /// prefix of `bytes` can be fit into this set, then it is used and all | |
395 | /// resulting literals are cut. | |
396 | pub fn cross_add(&mut self, bytes: &[u8]) -> bool { | |
397 | // N.B. This could be implemented by simply calling cross_product with | |
398 | // a literal set containing just `bytes`, but we can be smarter about | |
399 | // taking shorter prefixes of `bytes` if they'll fit. | |
400 | if bytes.is_empty() { | |
401 | return true; | |
402 | } | |
403 | if self.lits.is_empty() { | |
404 | let i = cmp::min(self.limit_size, bytes.len()); | |
405 | self.lits.push(Lit::new(bytes[..i].to_owned())); | |
406 | self.lits[0].cut = i < bytes.len(); | |
407 | return !self.lits[0].is_cut(); | |
408 | } | |
409 | let size = self.num_bytes(); | |
410 | if size + self.lits.len() >= self.limit_size { | |
411 | return false; | |
412 | } | |
413 | let mut i = 1; | |
414 | while size + (i * self.lits.len()) <= self.limit_size | |
415 | && i < bytes.len() { | |
416 | i += 1; | |
417 | } | |
418 | for lit in &mut self.lits { | |
419 | if !lit.is_cut() { | |
420 | lit.extend(&bytes[..i]); | |
421 | if i < bytes.len() { | |
422 | lit.cut(); | |
423 | } | |
424 | } | |
425 | } | |
426 | true | |
427 | } | |
428 | ||
429 | /// Adds the given literal to this set. | |
430 | /// | |
431 | /// Returns false if adding this literal would cause the class to be too | |
432 | /// big. | |
433 | pub fn add(&mut self, lit: Lit) -> bool { | |
434 | if self.num_bytes() + lit.len() > self.limit_size { | |
435 | return false; | |
436 | } | |
437 | self.lits.push(lit); | |
438 | true | |
439 | } | |
440 | ||
441 | /// Extends each literal in this set with the character class given. | |
442 | /// | |
443 | /// Returns false if the character class was too big to add. | |
444 | pub fn add_char_class(&mut self, cls: &CharClass) -> bool { | |
445 | self._add_char_class(cls, false) | |
446 | } | |
447 | ||
448 | /// Extends each literal in this set with the character class given, | |
449 | /// writing the bytes of each character in reverse. | |
450 | /// | |
451 | /// Returns false if the character class was too big to add. | |
452 | fn add_char_class_reverse(&mut self, cls: &CharClass) -> bool { | |
453 | self._add_char_class(cls, true) | |
454 | } | |
455 | ||
456 | fn _add_char_class(&mut self, cls: &CharClass, reverse: bool) -> bool { | |
457 | use std::char; | |
458 | ||
459 | if self.class_exceeds_limits(cls.num_chars()) { | |
460 | return false; | |
461 | } | |
462 | let mut base = self.remove_complete(); | |
463 | if base.is_empty() { | |
464 | base = vec![Lit::empty()]; | |
465 | } | |
466 | for r in cls { | |
467 | let (s, e) = (r.start as u32, r.end as u32 + 1); | |
468 | for c in (s..e).filter_map(char::from_u32) { | |
469 | for mut lit in base.clone() { | |
470 | let mut bytes = c.to_string().into_bytes(); | |
471 | if reverse { | |
472 | bytes.reverse(); | |
473 | } | |
474 | lit.extend(&bytes); | |
475 | self.lits.push(lit); | |
476 | } | |
477 | } | |
478 | } | |
479 | true | |
480 | } | |
481 | ||
482 | /// Extends each literal in this set with the byte class given. | |
483 | /// | |
484 | /// Returns false if the byte class was too big to add. | |
485 | pub fn add_byte_class(&mut self, cls: &ByteClass) -> bool { | |
486 | if self.class_exceeds_limits(cls.num_bytes()) { | |
487 | return false; | |
488 | } | |
489 | let mut base = self.remove_complete(); | |
490 | if base.is_empty() { | |
491 | base = vec![Lit::empty()]; | |
492 | } | |
493 | for r in cls { | |
494 | let (s, e) = (r.start as u32, r.end as u32 + 1); | |
495 | for b in (s..e).map(|b| b as u8) { | |
496 | for mut lit in base.clone() { | |
497 | lit.push(b); | |
498 | self.lits.push(lit); | |
499 | } | |
500 | } | |
501 | } | |
502 | true | |
503 | } | |
504 | ||
505 | /// Cuts every member of this set. When a member is cut, it can never | |
506 | /// be extended. | |
507 | pub fn cut(&mut self) { | |
508 | for lit in &mut self.lits { | |
509 | lit.cut(); | |
510 | } | |
511 | } | |
512 | ||
513 | /// Reverses all members in place. | |
514 | pub fn reverse(&mut self) { | |
515 | for lit in &mut self.lits { | |
516 | lit.reverse(); | |
517 | } | |
518 | } | |
519 | ||
520 | /// Clears this set of all members. | |
521 | pub fn clear(&mut self) { | |
522 | self.lits.clear(); | |
523 | } | |
524 | ||
525 | /// Pops all complete literals out of this set. | |
526 | fn remove_complete(&mut self) -> Vec<Lit> { | |
527 | let mut base = vec![]; | |
528 | for lit in mem::replace(&mut self.lits, vec![]) { | |
529 | if lit.is_cut() { | |
530 | self.lits.push(lit); | |
531 | } else { | |
532 | base.push(lit); | |
533 | } | |
534 | } | |
535 | base | |
536 | } | |
537 | ||
538 | /// Returns the total number of bytes in this set. | |
539 | fn num_bytes(&self) -> usize { | |
540 | self.lits.iter().fold(0, |accum, lit| accum + lit.len()) | |
541 | } | |
542 | ||
543 | /// Returns true if a character class with the given size would cause this | |
544 | /// set to exceed its limits. | |
545 | /// | |
546 | /// The size given should correspond to the number of items in the class. | |
547 | fn class_exceeds_limits(&self, size: usize) -> bool { | |
548 | if size > self.limit_class { | |
549 | return true; | |
550 | } | |
551 | // This is an approximation since codepoints in a char class can encode | |
552 | // to 1-4 bytes. | |
553 | let new_byte_count = | |
554 | if self.lits.is_empty() { | |
555 | size | |
556 | } else { | |
557 | self.lits | |
558 | .iter() | |
559 | .fold(0, |accum, lit| { | |
560 | accum + if lit.is_cut() { | |
561 | // If the literal is cut, then we'll never add | |
562 | // anything to it, so don't count it. | |
563 | 0 | |
564 | } else { | |
565 | (lit.len() + 1) * size | |
566 | } | |
567 | }) | |
568 | }; | |
569 | new_byte_count > self.limit_size | |
570 | } | |
571 | } | |
572 | ||
573 | fn prefixes(expr: &Expr, lits: &mut Literals) { | |
574 | use Expr::*; | |
575 | match *expr { | |
576 | Literal { ref chars, casei: false } => { | |
577 | let s: String = chars.iter().cloned().collect(); | |
578 | lits.cross_add(s.as_bytes()); | |
579 | } | |
580 | Literal { ref chars, casei: true } => { | |
581 | for &c in chars { | |
582 | let cls = CharClass::new(vec![ | |
583 | ClassRange { start: c, end: c }, | |
584 | ]).case_fold(); | |
585 | if !lits.add_char_class(&cls) { | |
586 | lits.cut(); | |
587 | return; | |
588 | } | |
589 | } | |
590 | } | |
591 | LiteralBytes { ref bytes, casei: false } => { | |
592 | lits.cross_add(bytes); | |
593 | } | |
594 | LiteralBytes { ref bytes, casei: true } => { | |
595 | for &b in bytes { | |
596 | let cls = ByteClass::new(vec![ | |
597 | ByteRange { start: b, end: b }, | |
598 | ]).case_fold(); | |
599 | if !lits.add_byte_class(&cls) { | |
600 | lits.cut(); | |
601 | return; | |
602 | } | |
603 | } | |
604 | } | |
605 | Class(ref cls) => { | |
606 | if !lits.add_char_class(cls) { | |
607 | lits.cut(); | |
608 | } | |
609 | } | |
610 | ClassBytes(ref cls) => { | |
611 | if !lits.add_byte_class(cls) { | |
612 | lits.cut(); | |
613 | } | |
614 | } | |
615 | Group { ref e, .. } => { | |
616 | prefixes(&**e, lits); | |
617 | } | |
618 | Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { | |
619 | repeat_zero_or_one_literals(&**e, lits, prefixes); | |
620 | } | |
621 | Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { | |
622 | repeat_zero_or_more_literals(&**e, lits, prefixes); | |
623 | } | |
624 | Repeat { ref e, r: Repeater::OneOrMore, .. } => { | |
625 | repeat_one_or_more_literals(&**e, lits, prefixes); | |
626 | } | |
627 | Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { | |
628 | repeat_range_literals(&**e, min, max, greedy, lits, prefixes); | |
629 | } | |
630 | Concat(ref es) if es.is_empty() => {} | |
631 | Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), | |
632 | Concat(ref es) => { | |
633 | for e in es { | |
634 | if let StartText = *e { | |
635 | if !lits.is_empty() { | |
636 | lits.cut(); | |
637 | break; | |
638 | } | |
639 | lits.add(Lit::empty()); | |
640 | continue; | |
641 | } | |
642 | let mut lits2 = lits.to_empty(); | |
643 | prefixes(e, &mut lits2); | |
644 | if !lits.cross_product(&lits2) || !lits2.any_complete() { | |
645 | // If this expression couldn't yield any literal that | |
646 | // could be extended, then we need to quit. Since we're | |
647 | // short-circuiting, we also need to freeze every member. | |
648 | lits.cut(); | |
649 | break; | |
650 | } | |
651 | } | |
652 | } | |
653 | Alternate(ref es) => { | |
654 | alternate_literals(es, lits, prefixes); | |
655 | } | |
656 | _ => lits.cut(), | |
657 | } | |
658 | } | |
659 | ||
660 | fn suffixes(expr: &Expr, lits: &mut Literals) { | |
661 | use Expr::*; | |
662 | match *expr { | |
663 | Literal { ref chars, casei: false } => { | |
664 | let s: String = chars.iter().cloned().collect(); | |
665 | let mut bytes = s.into_bytes(); | |
666 | bytes.reverse(); | |
667 | lits.cross_add(&bytes); | |
668 | } | |
669 | Literal { ref chars, casei: true } => { | |
670 | for &c in chars.iter().rev() { | |
671 | let cls = CharClass::new(vec![ | |
672 | ClassRange { start: c, end: c }, | |
673 | ]).case_fold(); | |
674 | if !lits.add_char_class_reverse(&cls) { | |
675 | lits.cut(); | |
676 | return; | |
677 | } | |
678 | } | |
679 | } | |
680 | LiteralBytes { ref bytes, casei: false } => { | |
681 | let b: Vec<u8> = bytes.iter().rev().cloned().collect(); | |
682 | lits.cross_add(&b); | |
683 | } | |
684 | LiteralBytes { ref bytes, casei: true } => { | |
685 | for &b in bytes.iter().rev() { | |
686 | let cls = ByteClass::new(vec![ | |
687 | ByteRange { start: b, end: b }, | |
688 | ]).case_fold(); | |
689 | if !lits.add_byte_class(&cls) { | |
690 | lits.cut(); | |
691 | return; | |
692 | } | |
693 | } | |
694 | } | |
695 | Class(ref cls) => { | |
696 | if !lits.add_char_class_reverse(cls) { | |
697 | lits.cut(); | |
698 | } | |
699 | } | |
700 | ClassBytes(ref cls) => { | |
701 | if !lits.add_byte_class(cls) { | |
702 | lits.cut(); | |
703 | } | |
704 | } | |
705 | Group { ref e, .. } => { | |
706 | suffixes(&**e, lits); | |
707 | } | |
708 | Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { | |
709 | repeat_zero_or_one_literals(&**e, lits, suffixes); | |
710 | } | |
711 | Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { | |
712 | repeat_zero_or_more_literals(&**e, lits, suffixes); | |
713 | } | |
714 | Repeat { ref e, r: Repeater::OneOrMore, .. } => { | |
715 | repeat_one_or_more_literals(&**e, lits, suffixes); | |
716 | } | |
717 | Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { | |
718 | repeat_range_literals(&**e, min, max, greedy, lits, suffixes); | |
719 | } | |
720 | Concat(ref es) if es.is_empty() => {} | |
721 | Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), | |
722 | Concat(ref es) => { | |
723 | for e in es.iter().rev() { | |
724 | if let EndText = *e { | |
725 | if !lits.is_empty() { | |
726 | lits.cut(); | |
727 | break; | |
728 | } | |
729 | lits.add(Lit::empty()); | |
730 | continue; | |
731 | } | |
732 | let mut lits2 = lits.to_empty(); | |
733 | suffixes(e, &mut lits2); | |
734 | if !lits.cross_product(&lits2) || !lits2.any_complete() { | |
735 | // If this expression couldn't yield any literal that | |
736 | // could be extended, then we need to quit. Since we're | |
737 | // short-circuiting, we also need to freeze every member. | |
738 | lits.cut(); | |
739 | break; | |
740 | } | |
741 | } | |
742 | } | |
743 | Alternate(ref es) => { | |
744 | alternate_literals(es, lits, suffixes); | |
745 | } | |
746 | _ => lits.cut(), | |
747 | } | |
748 | } | |
749 | ||
750 | fn repeat_zero_or_one_literals<F: FnMut(&Expr, &mut Literals)>( | |
751 | e: &Expr, | |
752 | lits: &mut Literals, | |
753 | mut f: F, | |
754 | ) { | |
755 | let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); | |
756 | lits3.set_limit_size(lits.limit_size() / 2); | |
757 | f(e, &mut lits3); | |
758 | ||
759 | if lits3.is_empty() || !lits2.cross_product(&lits3) { | |
760 | lits.cut(); | |
761 | return; | |
762 | } | |
763 | lits2.add(Lit::empty()); | |
764 | if !lits.union(lits2) { | |
765 | lits.cut(); | |
766 | } | |
767 | } | |
768 | ||
769 | fn repeat_zero_or_more_literals<F: FnMut(&Expr, &mut Literals)>( | |
770 | e: &Expr, | |
771 | lits: &mut Literals, | |
772 | mut f: F, | |
773 | ) { | |
774 | let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); | |
775 | lits3.set_limit_size(lits.limit_size() / 2); | |
776 | f(e, &mut lits3); | |
777 | ||
778 | if lits3.is_empty() || !lits2.cross_product(&lits3) { | |
779 | lits.cut(); | |
780 | return; | |
781 | } | |
782 | lits2.cut(); | |
783 | lits2.add(Lit::empty()); | |
784 | if !lits.union(lits2) { | |
785 | lits.cut(); | |
786 | } | |
787 | } | |
788 | ||
789 | fn repeat_one_or_more_literals<F: FnMut(&Expr, &mut Literals)>( | |
790 | e: &Expr, | |
791 | lits: &mut Literals, | |
792 | mut f: F, | |
793 | ) { | |
794 | f(e, lits); | |
795 | lits.cut(); | |
796 | } | |
797 | ||
798 | fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>( | |
799 | e: &Expr, | |
800 | min: u32, | |
801 | max: Option<u32>, | |
802 | greedy: bool, | |
803 | lits: &mut Literals, | |
804 | mut f: F, | |
805 | ) { | |
806 | use Expr::*; | |
807 | ||
808 | if min == 0 { | |
809 | // This is a bit conservative. If `max` is set, then we could | |
810 | // treat this as a finite set of alternations. For now, we | |
811 | // just treat it as `e*`. | |
812 | f(&Repeat { | |
813 | e: Box::new(e.clone()), | |
814 | r: Repeater::ZeroOrMore, | |
815 | greedy: greedy, | |
816 | }, lits); | |
817 | } else { | |
818 | if min > 0 { | |
819 | let n = cmp::min(lits.limit_size, min as usize); | |
820 | let es = iter::repeat(e.clone()).take(n).collect(); | |
821 | f(&Concat(es), lits); | |
822 | if n < min as usize { | |
823 | lits.cut(); | |
824 | } | |
825 | } | |
826 | if max.map_or(true, |max| min < max) { | |
827 | lits.cut(); | |
828 | } | |
829 | } | |
830 | } | |
831 | ||
832 | fn alternate_literals<F: FnMut(&Expr, &mut Literals)>( | |
833 | es: &[Expr], | |
834 | lits: &mut Literals, | |
835 | mut f: F, | |
836 | ) { | |
837 | let mut lits2 = lits.to_empty(); | |
838 | for e in es { | |
839 | let mut lits3 = lits.to_empty(); | |
840 | lits3.set_limit_size(lits.limit_size() / 5); | |
841 | f(e, &mut lits3); | |
842 | if lits3.is_empty() || !lits2.union(lits3) { | |
843 | // If we couldn't find suffixes for *any* of the | |
844 | // alternates, then the entire alternation has to be thrown | |
845 | // away and any existing members must be frozen. Similarly, | |
846 | // if the union couldn't complete, stop and freeze. | |
847 | lits.cut(); | |
848 | return; | |
849 | } | |
850 | } | |
851 | if !lits.cross_product(&lits2) { | |
852 | lits.cut(); | |
853 | } | |
854 | } | |
855 | ||
856 | impl fmt::Debug for Literals { | |
857 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
858 | f.debug_struct("Literals") | |
859 | .field("lits", &self.lits) | |
860 | .field("limit_size", &self.limit_size) | |
861 | .field("limit_class", &self.limit_class) | |
862 | .finish() | |
863 | } | |
864 | } | |
865 | ||
866 | impl Lit { | |
867 | /// Returns a new complete literal with the bytes given. | |
868 | pub fn new(bytes: Vec<u8>) -> Lit { | |
869 | Lit { v: bytes, cut: false } | |
870 | } | |
871 | ||
872 | /// Returns a new complete empty literal. | |
873 | pub fn empty() -> Lit { | |
874 | Lit { v: vec![], cut: false } | |
875 | } | |
876 | ||
877 | /// Returns true if this literal was "cut." | |
878 | pub fn is_cut(&self) -> bool { | |
879 | self.cut | |
880 | } | |
881 | ||
882 | /// Cuts this literal. | |
883 | pub fn cut(&mut self) { | |
884 | self.cut = true; | |
885 | } | |
886 | } | |
887 | ||
888 | impl PartialEq for Lit { | |
889 | fn eq(&self, other: &Lit) -> bool { | |
890 | self.v == other.v | |
891 | } | |
892 | } | |
893 | ||
894 | impl PartialOrd for Lit { | |
895 | fn partial_cmp(&self, other: &Lit) -> Option<cmp::Ordering> { | |
896 | self.v.partial_cmp(&other.v) | |
897 | } | |
898 | } | |
899 | ||
900 | impl fmt::Debug for Lit { | |
901 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
902 | if self.is_cut() { | |
903 | write!(f, "Cut({})", escape_unicode(&self.v)) | |
904 | } else { | |
905 | write!(f, "Complete({})", escape_unicode(&self.v)) | |
906 | } | |
907 | } | |
908 | } | |
909 | ||
910 | impl AsRef<[u8]> for Lit { | |
911 | fn as_ref(&self) -> &[u8] { &self.v } | |
912 | } | |
913 | ||
914 | impl ops::Deref for Lit { | |
915 | type Target = Vec<u8>; | |
916 | fn deref(&self) -> &Vec<u8> { &self.v } | |
917 | } | |
918 | ||
919 | impl ops::DerefMut for Lit { | |
920 | fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v } | |
921 | } | |
922 | ||
923 | fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { | |
924 | let mut i = 0; | |
925 | while haystack.len() >= needle.len() { | |
926 | if needle == &haystack[..needle.len()] { | |
927 | return Some(i); | |
928 | } | |
929 | i += 1; | |
930 | haystack = &haystack[1..]; | |
931 | } | |
932 | None | |
933 | } | |
934 | ||
935 | fn escape_unicode(bytes: &[u8]) -> String { | |
936 | let show = match ::std::str::from_utf8(bytes) { | |
937 | Ok(v) => v.to_string(), | |
938 | Err(_) => escape_bytes(bytes), | |
939 | }; | |
940 | let mut space_escaped = String::new(); | |
941 | for c in show.chars() { | |
942 | if c.is_whitespace() { | |
943 | let escaped = if c as u32 <= 0x7F { | |
944 | escape_byte(c as u8) | |
945 | } else { | |
946 | if c as u32 <= 0xFFFF { | |
947 | format!(r"\u{{{:04x}}}", c as u32) | |
948 | } else { | |
949 | format!(r"\U{{{:08x}}}", c as u32) | |
950 | } | |
951 | }; | |
952 | space_escaped.push_str(&escaped); | |
953 | } else { | |
954 | space_escaped.push(c); | |
955 | } | |
956 | } | |
957 | space_escaped | |
958 | } | |
959 | ||
960 | fn escape_bytes(bytes: &[u8]) -> String { | |
961 | let mut s = String::new(); | |
962 | for &b in bytes { | |
963 | s.push_str(&escape_byte(b)); | |
964 | } | |
965 | s | |
966 | } | |
967 | ||
968 | fn escape_byte(byte: u8) -> String { | |
969 | use std::ascii::escape_default; | |
970 | ||
971 | let escaped: Vec<u8> = escape_default(byte).collect(); | |
972 | String::from_utf8_lossy(&escaped).into_owned() | |
973 | } | |
974 | ||
975 | #[cfg(test)] | |
976 | mod tests { | |
977 | use std::fmt; | |
978 | ||
979 | use {Expr, ExprBuilder}; | |
980 | use super::{Literals, Lit, escape_bytes}; | |
981 | ||
982 | // To make test failures easier to read. | |
983 | #[derive(Debug, Eq, PartialEq)] | |
984 | struct Bytes(Vec<ULit>); | |
985 | #[derive(Debug, Eq, PartialEq)] | |
986 | struct Unicode(Vec<ULit>); | |
987 | ||
988 | fn escape_lits(blits: &[Lit]) -> Vec<ULit> { | |
989 | let mut ulits = vec![]; | |
990 | for blit in blits { | |
991 | ulits.push(ULit { v: escape_bytes(&blit), cut: blit.is_cut() }); | |
992 | } | |
993 | ulits | |
994 | } | |
995 | ||
996 | fn create_lits<I: IntoIterator<Item=Lit>>(it: I) -> Literals { | |
997 | Literals { | |
998 | lits: it.into_iter().collect(), | |
999 | limit_size: 0, | |
1000 | limit_class: 0, | |
1001 | } | |
1002 | } | |
1003 | ||
1004 | // Needs to be pub for 1.3? | |
1005 | #[derive(Clone, Eq, PartialEq)] | |
1006 | pub struct ULit { | |
1007 | v: String, | |
1008 | cut: bool, | |
1009 | } | |
1010 | ||
1011 | impl ULit { | |
1012 | fn is_cut(&self) -> bool { self.cut } | |
1013 | } | |
1014 | ||
1015 | impl fmt::Debug for ULit { | |
1016 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
1017 | if self.is_cut() { | |
1018 | write!(f, "Cut({})", self.v) | |
1019 | } else { | |
1020 | write!(f, "Complete({})", self.v) | |
1021 | } | |
1022 | } | |
1023 | } | |
1024 | ||
1025 | impl PartialEq<Lit> for ULit { | |
1026 | fn eq(&self, other: &Lit) -> bool { | |
1027 | self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() | |
1028 | } | |
1029 | } | |
1030 | ||
1031 | impl PartialEq<ULit> for Lit { | |
1032 | fn eq(&self, other: &ULit) -> bool { | |
1033 | &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() | |
1034 | } | |
1035 | } | |
1036 | ||
1037 | #[allow(non_snake_case)] | |
1038 | fn C(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: true } } | |
1039 | #[allow(non_snake_case)] | |
1040 | fn M(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: false } } | |
1041 | ||
1042 | fn prefixes(lits: &mut Literals, expr: &Expr) { | |
1043 | lits.union_prefixes(expr); | |
1044 | } | |
1045 | ||
1046 | fn suffixes(lits: &mut Literals, expr: &Expr) { | |
1047 | lits.union_suffixes(expr); | |
1048 | } | |
1049 | ||
1050 | macro_rules! assert_lit_eq { | |
1051 | ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ | |
1052 | let expected: Vec<ULit> = vec![$($expected_lit),*]; | |
1053 | let lits = $got_lits; | |
1054 | assert_eq!( | |
1055 | $which(expected.clone()), | |
1056 | $which(escape_lits(lits.literals()))); | |
1057 | assert_eq!( | |
1058 | !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), | |
1059 | lits.all_complete()); | |
1060 | assert_eq!( | |
1061 | expected.iter().any(|l| !l.is_cut()), | |
1062 | lits.any_complete()); | |
1063 | }}; | |
1064 | } | |
1065 | ||
1066 | macro_rules! test_lit { | |
1067 | ($name:ident, $which:ident, $re:expr) => { | |
1068 | test_lit!($name, $which, $re,); | |
1069 | }; | |
1070 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { | |
1071 | #[test] | |
1072 | fn $name() { | |
1073 | let expr = Expr::parse($re).unwrap(); | |
1074 | let lits = expr.$which(); | |
1075 | assert_lit_eq!(Unicode, lits, $($lit),*); | |
1076 | ||
1077 | let expr = ExprBuilder::new().allow_bytes(true).unicode(false) | |
1078 | .parse($re).unwrap(); | |
1079 | let lits = expr.$which(); | |
1080 | assert_lit_eq!(Bytes, lits, $($lit),*); | |
1081 | } | |
1082 | }; | |
1083 | } | |
1084 | ||
1085 | // ************************************************************************ | |
1086 | // Tests for prefix literal extraction. | |
1087 | // ************************************************************************ | |
1088 | ||
1089 | // Elementary tests. | |
1090 | test_lit!(pfx_one_lit1, prefixes, "a", M("a")); | |
1091 | test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); | |
1092 | test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); | |
1093 | test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); | |
1094 | test_lit!(pfx_class1, prefixes, "[1-4]", | |
1095 | M("1"), M("2"), M("3"), M("4")); | |
1096 | test_lit!(pfx_class2, prefixes, "(?u)[â˜ƒâ… ]", | |
1097 | M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); | |
1098 | test_lit!(pfx_class3, prefixes, "(?ui)[â˜ƒâ… ]", | |
1099 | M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), | |
1100 | M("\\xe2\\x98\\x83")); | |
1101 | test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", | |
1102 | M("A"), M("a")); | |
1103 | test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc", | |
1104 | M("ABC"), M("aBC"), M("AbC"), M("abC"), | |
1105 | M("ABc"), M("aBc"), M("Abc"), M("abc")); | |
1106 | test_lit!(pfx_group1, prefixes, "(a)", M("a")); | |
1107 | test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); | |
1108 | test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); | |
1109 | test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); | |
1110 | test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); | |
1111 | test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); | |
1112 | test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); | |
1113 | test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); | |
1114 | test_lit!(pfx_rep_range1, prefixes, "a{0}"); | |
1115 | test_lit!(pfx_rep_range2, prefixes, "a{0,}"); | |
1116 | test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); | |
1117 | test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); | |
1118 | test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); | |
1119 | test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); | |
1120 | test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); | |
1121 | ||
1122 | // Test regexes with concatenations. | |
1123 | test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); | |
1124 | test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); | |
1125 | test_lit!(pfx_cat3, prefixes, "(?i)[ab]z", | |
1126 | M("AZ"), M("BZ"), M("aZ"), M("bZ"), | |
1127 | M("Az"), M("Bz"), M("az"), M("bz")); | |
1128 | test_lit!(pfx_cat4, prefixes, "[ab][yz]", | |
1129 | M("ay"), M("by"), M("az"), M("bz")); | |
1130 | test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); | |
1131 | test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); | |
1132 | test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); | |
1133 | test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); | |
1134 | test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); | |
1135 | test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); | |
1136 | test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); | |
1137 | test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); | |
1138 | test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); | |
1139 | test_lit!(pfx_cat14, prefixes, "a^", C("a")); | |
1140 | test_lit!(pfx_cat15, prefixes, "$a"); | |
1141 | test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); | |
1142 | test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); | |
1143 | test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); | |
1144 | test_lit!(pfx_cat19, prefixes, "a.z", C("a")); | |
1145 | ||
1146 | // Test regexes with alternations. | |
1147 | test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); | |
1148 | test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); | |
1149 | test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); | |
1150 | test_lit!(pfx_alt4, prefixes, "a|b*"); | |
1151 | test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); | |
1152 | test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); | |
1153 | test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c", | |
1154 | C("a"), C("b"), M("c"), C("a"), C("ab"), M("c")); | |
1155 | test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); | |
1156 | ||
1157 | // Test regexes with empty assertions. | |
1158 | test_lit!(pfx_empty1, prefixes, "^a", M("a")); | |
1159 | test_lit!(pfx_empty2, prefixes, "^abc", M("abc")); | |
1160 | test_lit!(pfx_empty3, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); | |
1161 | ||
1162 | // Make sure some curious regexes have no prefixes. | |
1163 | test_lit!(pfx_nothing1, prefixes, "."); | |
1164 | test_lit!(pfx_nothing2, prefixes, "(?s)."); | |
1165 | test_lit!(pfx_nothing3, prefixes, "^"); | |
1166 | test_lit!(pfx_nothing4, prefixes, "$"); | |
1167 | test_lit!(pfx_nothing6, prefixes, "(?m)$"); | |
1168 | test_lit!(pfx_nothing7, prefixes, r"\b"); | |
1169 | test_lit!(pfx_nothing8, prefixes, r"\B"); | |
1170 | ||
1171 | // Test a few regexes that defeat any prefix literal detection. | |
1172 | test_lit!(pfx_defeated1, prefixes, ".a"); | |
1173 | test_lit!(pfx_defeated2, prefixes, "(?s).a"); | |
1174 | test_lit!(pfx_defeated3, prefixes, "a*b*c*"); | |
1175 | test_lit!(pfx_defeated4, prefixes, "a|."); | |
1176 | test_lit!(pfx_defeated5, prefixes, ".|a"); | |
1177 | test_lit!(pfx_defeated6, prefixes, "a|^"); | |
1178 | test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); | |
1179 | test_lit!(pfx_defeated8, prefixes, "$a"); | |
1180 | test_lit!(pfx_defeated9, prefixes, "(?m)$a"); | |
1181 | test_lit!(pfx_defeated10, prefixes, r"\ba"); | |
1182 | test_lit!(pfx_defeated11, prefixes, r"\Ba"); | |
1183 | test_lit!(pfx_defeated12, prefixes, "^*a"); | |
1184 | test_lit!(pfx_defeated13, prefixes, "^+a"); | |
1185 | ||
1186 | test_lit!( | |
1187 | pfx_crazy1, | |
1188 | prefixes, | |
1189 | r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", | |
1190 | C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam")); | |
1191 | ||
1192 | // ************************************************************************ | |
1193 | // Tests for quiting prefix literal search. | |
1194 | // ************************************************************************ | |
1195 | ||
1196 | macro_rules! test_exhausted { | |
1197 | ($name:ident, $which:ident, $re:expr) => { | |
1198 | test_exhausted!($name, $which, $re,); | |
1199 | }; | |
1200 | ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { | |
1201 | #[test] | |
1202 | fn $name() { | |
1203 | let expr = Expr::parse($re).unwrap(); | |
1204 | let mut lits = Literals::empty(); | |
1205 | lits.set_limit_size(20).set_limit_class(10); | |
1206 | $which(&mut lits, &expr); | |
1207 | assert_lit_eq!(Unicode, lits, $($lit),*); | |
1208 | ||
1209 | let expr = ExprBuilder::new().allow_bytes(true).unicode(false) | |
1210 | .parse($re).unwrap(); | |
1211 | let mut lits = Literals::empty(); | |
1212 | lits.set_limit_size(20).set_limit_class(10); | |
1213 | $which(&mut lits, &expr); | |
1214 | assert_lit_eq!(Bytes, lits, $($lit),*); | |
1215 | } | |
1216 | }; | |
1217 | } | |
1218 | ||
1219 | // These test use a much lower limit than the default so that we can | |
1220 | // write test cases of reasonable size. | |
1221 | test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); | |
1222 | test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); | |
1223 | test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); | |
1224 | test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar", | |
1225 | C("FO"), C("fO"), C("Fo"), C("fo")); | |
1226 | test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}", | |
1227 | C("abababababababababab")); | |
1228 | test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd", | |
1229 | C("ababababab"), M("cd")); | |
1230 | test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd", | |
1231 | C("zababababab"), M("zcd")); | |
1232 | test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz", | |
1233 | C("aaaaaaaaaaaaaaaaaaaa")); | |
1234 | ||
1235 | // ************************************************************************ | |
1236 | // Tests for suffix literal extraction. | |
1237 | // ************************************************************************ | |
1238 | ||
1239 | // Elementary tests. | |
1240 | test_lit!(sfx_one_lit1, suffixes, "a", M("a")); | |
1241 | test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); | |
1242 | test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); | |
1243 | test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); | |
1244 | test_lit!(sfx_class1, suffixes, "[1-4]", | |
1245 | M("1"), M("2"), M("3"), M("4")); | |
1246 | test_lit!(sfx_class2, suffixes, "(?u)[â˜ƒâ… ]", | |
1247 | M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); | |
1248 | test_lit!(sfx_class3, suffixes, "(?ui)[â˜ƒâ… ]", | |
1249 | M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), | |
1250 | M("\\xe2\\x98\\x83")); | |
1251 | test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", | |
1252 | M("A"), M("a")); | |
1253 | test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc", | |
1254 | M("ABC"), M("ABc"), M("AbC"), M("Abc"), | |
1255 | M("aBC"), M("aBc"), M("abC"), M("abc")); | |
1256 | test_lit!(sfx_group1, suffixes, "(a)", M("a")); | |
1257 | test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); | |
1258 | test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); | |
1259 | test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); | |
1260 | test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); | |
1261 | test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); | |
1262 | test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); | |
1263 | test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); | |
1264 | test_lit!(sfx_rep_range1, suffixes, "a{0}"); | |
1265 | test_lit!(sfx_rep_range2, suffixes, "a{0,}"); | |
1266 | test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); | |
1267 | test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); | |
1268 | test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); | |
1269 | test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); | |
1270 | test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); | |
1271 | ||
1272 | // Test regexes with concatenations. | |
1273 | test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); | |
1274 | test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); | |
1275 | test_lit!(sfx_cat3, suffixes, "(?i)[ab]z", | |
1276 | M("AZ"), M("Az"), M("BZ"), M("Bz"), | |
1277 | M("aZ"), M("az"), M("bZ"), M("bz")); | |
1278 | test_lit!(sfx_cat4, suffixes, "[ab][yz]", | |
1279 | M("ay"), M("az"), M("by"), M("bz")); | |
1280 | test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); | |
1281 | test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); | |
1282 | test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); | |
1283 | test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); | |
1284 | test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); | |
1285 | test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); | |
1286 | test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); | |
1287 | test_lit!(sfx_cat12, suffixes, "ab+", C("b")); | |
1288 | test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); | |
1289 | test_lit!(sfx_cat14, suffixes, "a^"); | |
1290 | test_lit!(sfx_cat15, suffixes, "$a", C("a")); | |
1291 | test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); | |
1292 | test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); | |
1293 | test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); | |
1294 | test_lit!(sfx_cat19, suffixes, "a.z", C("z")); | |
1295 | ||
1296 | // Test regexes with alternations. | |
1297 | test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); | |
1298 | test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); | |
1299 | test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); | |
1300 | test_lit!(sfx_alt4, suffixes, "a|b*"); | |
1301 | test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); | |
1302 | test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); | |
1303 | test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c", | |
1304 | C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c")); | |
1305 | test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); | |
1306 | ||
1307 | // Test regexes with empty assertions. | |
1308 | test_lit!(sfx_empty1, suffixes, "a$", M("a")); | |
1309 | ||
1310 | // Make sure some curious regexes have no suffixes. | |
1311 | test_lit!(sfx_nothing1, suffixes, "."); | |
1312 | test_lit!(sfx_nothing2, suffixes, "(?s)."); | |
1313 | test_lit!(sfx_nothing3, suffixes, "^"); | |
1314 | test_lit!(sfx_nothing4, suffixes, "$"); | |
1315 | test_lit!(sfx_nothing6, suffixes, "(?m)$"); | |
1316 | test_lit!(sfx_nothing7, suffixes, r"\b"); | |
1317 | test_lit!(sfx_nothing8, suffixes, r"\B"); | |
1318 | ||
1319 | // Test a few regexes that defeat any suffix literal detection. | |
1320 | test_lit!(sfx_defeated1, suffixes, "a."); | |
1321 | test_lit!(sfx_defeated2, suffixes, "(?s)a."); | |
1322 | test_lit!(sfx_defeated3, suffixes, "a*b*c*"); | |
1323 | test_lit!(sfx_defeated4, suffixes, "a|."); | |
1324 | test_lit!(sfx_defeated5, suffixes, ".|a"); | |
1325 | test_lit!(sfx_defeated6, suffixes, "a|^"); | |
1326 | test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); | |
1327 | test_lit!(sfx_defeated8, suffixes, "a^"); | |
1328 | test_lit!(sfx_defeated9, suffixes, "(?m)a$"); | |
1329 | test_lit!(sfx_defeated10, suffixes, r"a\b"); | |
1330 | test_lit!(sfx_defeated11, suffixes, r"a\B"); | |
1331 | test_lit!(sfx_defeated12, suffixes, "a^*"); | |
1332 | test_lit!(sfx_defeated13, suffixes, "a^+"); | |
1333 | ||
1334 | // These test use a much lower limit than the default so that we can | |
1335 | // write test cases of reasonable size. | |
1336 | test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); | |
1337 | test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); | |
1338 | test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); | |
1339 | test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar", | |
1340 | C("AR"), C("Ar"), C("aR"), C("ar")); | |
1341 | test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}", | |
1342 | C("abababababababababab")); | |
1343 | test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*", | |
1344 | C("ababababab"), M("cd")); | |
1345 | test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z", | |
1346 | C("abababababz"), M("cdz")); | |
1347 | test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa", | |
1348 | C("aaaaaaaaaaaaaaaaaaaa")); | |
1349 | ||
1350 | // ************************************************************************ | |
1351 | // Tests for generating unambiguous literal sets. | |
1352 | // ************************************************************************ | |
1353 | ||
1354 | macro_rules! test_unamb { | |
1355 | ($name:ident, $given:expr, $expected:expr) => { | |
1356 | #[test] | |
1357 | fn $name() { | |
1358 | let given: Vec<Lit> = | |
1359 | $given | |
1360 | .into_iter() | |
1361 | .map(|ul| { | |
1362 | let cut = ul.is_cut(); | |
1363 | Lit { v: ul.v.into_bytes(), cut: cut } | |
1364 | }) | |
1365 | .collect(); | |
1366 | let lits = create_lits(given); | |
1367 | let got = lits.unambiguous_prefixes(); | |
1368 | assert_eq!($expected, escape_lits(got.literals())); | |
1369 | } | |
1370 | }; | |
1371 | } | |
1372 | ||
1373 | test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); | |
1374 | test_unamb!(unambiguous2, | |
1375 | vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]); | |
1376 | test_unamb!(unambiguous3, | |
1377 | vec![M("Sherlock"), M("Watson")], | |
1378 | vec![M("Sherlock"), M("Watson")]); | |
1379 | test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); | |
1380 | test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); | |
1381 | test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); | |
1382 | test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); | |
1383 | test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); | |
1384 | test_unamb!(unambiguous9, | |
1385 | vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], | |
1386 | vec![C("a"), C("b"), C("c")]); | |
1387 | test_unamb!(unambiguous10, | |
1388 | vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], | |
1389 | vec![C("Mo"), C("Mu")]); | |
1390 | test_unamb!(unambiguous11, | |
1391 | vec![M("zazb"), M("azb")], vec![C("a"), C("z")]); | |
1392 | test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); | |
1393 | test_unamb!(unambiguous13, | |
1394 | vec![M("ABCX"), M("CDAX"), M("BCX")], | |
1395 | vec![C("A"), C("BCX"), C("CD")]); | |
1396 | test_unamb!(unambiguous14, | |
1397 | vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], | |
1398 | vec![M("DSX"), C("I"), C("MGX"), C("MV")]); | |
1399 | test_unamb!(unambiguous15, | |
1400 | vec![M("IMG_"), M("MG_"), M("CIMG")], | |
1401 | vec![C("C"), C("I"), C("MG_")]); | |
1402 | ||
1403 | ||
1404 | // ************************************************************************ | |
1405 | // Tests for suffix trimming. | |
1406 | // ************************************************************************ | |
1407 | macro_rules! test_trim { | |
1408 | ($name:ident, $trim:expr, $given:expr, $expected:expr) => { | |
1409 | #[test] | |
1410 | fn $name() { | |
1411 | let given: Vec<Lit> = | |
1412 | $given | |
1413 | .into_iter() | |
1414 | .map(|ul| { | |
1415 | let cut = ul.is_cut(); | |
1416 | Lit { v: ul.v.into_bytes(), cut: cut } | |
1417 | }) | |
1418 | .collect(); | |
1419 | let lits = create_lits(given); | |
1420 | let got = lits.trim_suffix($trim).unwrap(); | |
1421 | assert_eq!($expected, escape_lits(got.literals())); | |
1422 | } | |
1423 | } | |
1424 | } | |
1425 | ||
1426 | test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); | |
1427 | test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); | |
1428 | test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); | |
1429 | test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); | |
1430 | ||
1431 | // ************************************************************************ | |
1432 | // Tests for longest common prefix. | |
1433 | // ************************************************************************ | |
1434 | ||
1435 | macro_rules! test_lcp { | |
1436 | ($name:ident, $given:expr, $expected:expr) => { | |
1437 | #[test] | |
1438 | fn $name() { | |
1439 | let given: Vec<Lit> = | |
1440 | $given | |
1441 | .into_iter() | |
1442 | .map(|s: &str| Lit { | |
1443 | v: s.to_owned().into_bytes(), | |
1444 | cut: false, | |
1445 | }) | |
1446 | .collect(); | |
1447 | let lits = create_lits(given); | |
1448 | let got = lits.longest_common_prefix(); | |
1449 | assert_eq!($expected, escape_bytes(got)); | |
1450 | } | |
1451 | }; | |
1452 | } | |
1453 | ||
1454 | test_lcp!(lcp1, vec!["a"], "a"); | |
1455 | test_lcp!(lcp2, vec![], ""); | |
1456 | test_lcp!(lcp3, vec!["a", "b"], ""); | |
1457 | test_lcp!(lcp4, vec!["ab", "ab"], "ab"); | |
1458 | test_lcp!(lcp5, vec!["ab", "a"], "a"); | |
1459 | test_lcp!(lcp6, vec!["a", "ab"], "a"); | |
1460 | test_lcp!(lcp7, vec!["ab", "b"], ""); | |
1461 | test_lcp!(lcp8, vec!["b", "ab"], ""); | |
1462 | test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); | |
1463 | test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); | |
1464 | test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); | |
1465 | test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); | |
1466 | ||
1467 | // ************************************************************************ | |
1468 | // Tests for longest common suffix. | |
1469 | // ************************************************************************ | |
1470 | ||
1471 | macro_rules! test_lcs { | |
1472 | ($name:ident, $given:expr, $expected:expr) => { | |
1473 | #[test] | |
1474 | fn $name() { | |
1475 | let given: Vec<Lit> = | |
1476 | $given | |
1477 | .into_iter() | |
1478 | .map(|s: &str| Lit { | |
1479 | v: s.to_owned().into_bytes(), | |
1480 | cut: false, | |
1481 | }) | |
1482 | .collect(); | |
1483 | let lits = create_lits(given); | |
1484 | let got = lits.longest_common_suffix(); | |
1485 | assert_eq!($expected, escape_bytes(got)); | |
1486 | } | |
1487 | }; | |
1488 | } | |
1489 | ||
1490 | test_lcs!(lcs1, vec!["a"], "a"); | |
1491 | test_lcs!(lcs2, vec![], ""); | |
1492 | test_lcs!(lcs3, vec!["a", "b"], ""); | |
1493 | test_lcs!(lcs4, vec!["ab", "ab"], "ab"); | |
1494 | test_lcs!(lcs5, vec!["ab", "a"], ""); | |
1495 | test_lcs!(lcs6, vec!["a", "ab"], ""); | |
1496 | test_lcs!(lcs7, vec!["ab", "b"], "b"); | |
1497 | test_lcs!(lcs8, vec!["b", "ab"], "b"); | |
1498 | test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); | |
1499 | test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); | |
1500 | test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); | |
1501 | test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); | |
1502 | } |