]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex-syntax-0.3.9/src/lib.rs
New upstream version 1.19.0+dfsg1
[rustc.git] / src / vendor / regex-syntax-0.3.9 / src / lib.rs
1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 This crate provides a regular expression parser and an abstract syntax for
13 regular expressions. The abstract syntax is defined by the `Expr` type. The
14 concrete syntax is enumerated in the
15 [`regex`](../regex/index.html#syntax)
16 crate documentation.
17
18 Note that since this crate is first and foremost an implementation detail for
19 the `regex` crate, it may experience more frequent breaking changes. It is
20 exposed as a separate crate so that others may use it to do analysis on regular
21 expressions or even build their own matching engine.
22
23 # Example: parsing an expression
24
25 Parsing a regular expression can be done with the `Expr::parse` function.
26
27 ```rust
28 use regex_syntax::Expr;
29
30 assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
31 Expr::Literal { chars: vec!['a', 'b'], casei: false },
32 Expr::Literal { chars: vec!['y', 'z'], casei: false },
33 ]));
34 ```
35
36 # Example: inspecting an error
37
38 The parser in this crate provides very detailed error values. For example,
39 if an invalid character class range is given:
40
41 ```rust
42 use regex_syntax::{Expr, ErrorKind};
43
44 let err = Expr::parse(r"[z-a]").unwrap_err();
45 assert_eq!(err.position(), 4);
46 assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
47 start: 'z',
48 end: 'a',
49 });
50 ```
51
52 Or unbalanced parentheses:
53
54 ```rust
55 use regex_syntax::{Expr, ErrorKind};
56
57 let err = Expr::parse(r"ab(cd").unwrap_err();
58 assert_eq!(err.position(), 2);
59 assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
60 ```
61 */
62
63 #![deny(missing_docs)]
64 #![cfg_attr(test, deny(warnings))]
65
66 #[cfg(test)] extern crate quickcheck;
67 #[cfg(test)] extern crate rand;
68
69 mod literals;
70 mod parser;
71 mod unicode;
72
73 use std::ascii;
74 use std::char;
75 use std::cmp::{Ordering, max, min};
76 use std::fmt;
77 use std::iter::IntoIterator;
78 use std::ops::Deref;
79 use std::result;
80 use std::slice;
81 use std::u8;
82 use std::vec;
83
84 use unicode::case_folding;
85
86 use self::Expr::*;
87 use self::Repeater::*;
88
89 use parser::{Flags, Parser};
90
91 pub use literals::{Literals, Lit};
92
93 /// A regular expression abstract syntax tree.
94 ///
95 /// An `Expr` represents the abstract syntax of a regular expression.
96 #[derive(Clone, Debug, PartialEq, Eq)]
97 pub enum Expr {
98 /// An empty regex (which never matches any text).
99 Empty,
100 /// A sequence of one or more literal characters to be matched.
101 Literal {
102 /// The characters.
103 chars: Vec<char>,
104 /// Whether to match case insensitively.
105 casei: bool,
106 },
107 /// A sequence of one or more literal bytes to be matched.
108 LiteralBytes {
109 /// The bytes.
110 bytes: Vec<u8>,
111 /// Whether to match case insensitively.
112 ///
113 /// The interpretation of "case insensitive" in this context is
114 /// ambiguous since `bytes` can be arbitrary. However, a good heuristic
115 /// is to assume that the bytes are ASCII-compatible and do simple
116 /// ASCII case folding.
117 casei: bool,
118 },
119 /// Match any character.
120 AnyChar,
121 /// Match any character, excluding new line (`0xA`).
122 AnyCharNoNL,
123 /// Match any byte.
124 AnyByte,
125 /// Match any byte, excluding new line (`0xA`).
126 AnyByteNoNL,
127 /// A character class.
128 Class(CharClass),
129 /// A character class with byte ranges only.
130 ClassBytes(ByteClass),
131 /// Match the start of a line or beginning of input.
132 StartLine,
133 /// Match the end of a line or end of input.
134 EndLine,
135 /// Match the beginning of input.
136 StartText,
137 /// Match the end of input.
138 EndText,
139 /// Match a word boundary (word character on one side and a non-word
140 /// character on the other).
141 WordBoundary,
142 /// Match a position that is not a word boundary (word or non-word
143 /// characters on both sides).
144 NotWordBoundary,
145 /// Match an ASCII word boundary.
146 WordBoundaryAscii,
147 /// Match a position that is not an ASCII word boundary.
148 NotWordBoundaryAscii,
149 /// A group, possibly non-capturing.
150 Group {
151 /// The expression inside the group.
152 e: Box<Expr>,
153 /// The capture index (starting at `1`) only for capturing groups.
154 i: Option<usize>,
155 /// The capture name, only for capturing named groups.
156 name: Option<String>,
157 },
158 /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
159 Repeat {
160 /// The expression to be repeated. Limited to literals, `.`, classes
161 /// or grouped expressions.
162 e: Box<Expr>,
163 /// The type of repeat operator used.
164 r: Repeater,
165 /// Whether the repeat is greedy (match the most) or not (match the
166 /// least).
167 greedy: bool,
168 },
169 /// A concatenation of expressions. Must be matched one after the other.
170 ///
171 /// N.B. A concat expression can only appear at the top-level or
172 /// immediately inside a group expression.
173 Concat(Vec<Expr>),
174 /// An alternation of expressions. Only one must match.
175 ///
176 /// N.B. An alternate expression can only appear at the top-level or
177 /// immediately inside a group expression.
178 Alternate(Vec<Expr>),
179 }
180
181 type CaptureIndex = Option<usize>;
182
183 type CaptureName = Option<String>;
184
185 /// The type of a repeat operator expression.
186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
187 pub enum Repeater {
188 /// Match zero or one (`?`).
189 ZeroOrOne,
190 /// Match zero or more (`*`).
191 ZeroOrMore,
192 /// Match one or more (`+`).
193 OneOrMore,
194 /// Match for at least `min` and at most `max` (`{m,n}`).
195 ///
196 /// When `max` is `None`, there is no upper bound on the number of matches.
197 Range {
198 /// Lower bound on the number of matches.
199 min: u32,
200 /// Optional upper bound on the number of matches.
201 max: Option<u32>,
202 },
203 }
204
205 impl Repeater {
206 /// Returns true if and only if this repetition can match the empty string.
207 fn matches_empty(&self) -> bool {
208 use self::Repeater::*;
209 match *self {
210 ZeroOrOne => true,
211 ZeroOrMore => true,
212 OneOrMore => false,
213 Range { min, .. } => min == 0,
214 }
215 }
216 }
217
218 /// A character class.
219 ///
220 /// A character class has a canonical format that the parser guarantees. Its
221 /// canonical format is defined by the following invariants:
222 ///
223 /// 1. Given any Unicode scalar value, it is matched by *at most* one character
224 /// range in a canonical character class.
225 /// 2. Every adjacent character range is separated by at least one Unicode
226 /// scalar value.
227 /// 3. Given any pair of character ranges `r1` and `r2`, if
228 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
229 /// character class.
230 ///
231 /// In sum, any `CharClass` produced by this crate's parser is a sorted
232 /// sequence of non-overlapping ranges. This makes it possible to test whether
233 /// a character is matched by a class with a binary search.
234 ///
235 /// If the case insensitive flag was set when parsing a character class, then
236 /// simple case folding is done automatically. For example, `(?i)[a-c]` is
237 /// automatically translated to `[a-cA-C]`.
238 #[derive(Clone, Debug, PartialEq, Eq)]
239 pub struct CharClass {
240 ranges: Vec<ClassRange>,
241 }
242
243 /// A single inclusive range in a character class.
244 ///
245 /// Since range boundaries are defined by Unicode scalar values, the boundaries
246 /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
247 /// *cover* codepoints that are not scalar values.
248 ///
249 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
250 /// for testing whether a character is contained inside a given range.
251 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
252 pub struct ClassRange {
253 /// The start character of the range.
254 ///
255 /// This must be less than or equal to `end`.
256 pub start: char,
257
258 /// The end character of the range.
259 ///
260 /// This must be greater than or equal to `start`.
261 pub end: char,
262 }
263
264 /// A byte class for byte ranges only.
265 ///
266 /// A byte class has a canonical format that the parser guarantees. Its
267 /// canonical format is defined by the following invariants:
268 ///
269 /// 1. Given any byte, it is matched by *at most* one byte range in a canonical
270 /// character class.
271 /// 2. Every adjacent byte range is separated by at least one byte.
272 /// 3. Given any pair of byte ranges `r1` and `r2`, if
273 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
274 /// character class.
275 ///
276 /// In sum, any `ByteClass` produced by this crate's parser is a sorted
277 /// sequence of non-overlapping ranges. This makes it possible to test whether
278 /// a byte is matched by a class with a binary search.
279 ///
280 /// If the case insensitive flag was set when parsing a character class,
281 /// then simple ASCII-only case folding is done automatically. For example,
282 /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`.
283 #[derive(Clone, Debug, PartialEq, Eq)]
284 pub struct ByteClass {
285 ranges: Vec<ByteRange>,
286 }
287
288 /// A single inclusive range in a byte class.
289 ///
290 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
291 /// for testing whether a byte is contained inside a given range.
292 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
293 pub struct ByteRange {
294 /// The start byte of the range.
295 ///
296 /// This must be less than or equal to `end`.
297 pub start: u8,
298
299 /// The end byte of the range.
300 ///
301 /// This must be greater than or equal to `end`.
302 pub end: u8,
303 }
304
305 /// A builder for configuring regular expression parsing.
306 ///
307 /// This allows setting the default values of flags and other options, such
308 /// as the maximum nesting depth.
309 #[derive(Clone, Debug)]
310 pub struct ExprBuilder {
311 flags: Flags,
312 nest_limit: usize,
313 }
314
315 impl ExprBuilder {
316 /// Create a new builder for configuring expression parsing.
317 ///
318 /// Note that all flags are disabled by default.
319 pub fn new() -> ExprBuilder {
320 ExprBuilder {
321 flags: Flags::default(),
322 nest_limit: 200,
323 }
324 }
325
326 /// Set the default value for the case insensitive (`i`) flag.
327 pub fn case_insensitive(mut self, yes: bool) -> ExprBuilder {
328 self.flags.casei = yes;
329 self
330 }
331
332 /// Set the default value for the multi-line matching (`m`) flag.
333 pub fn multi_line(mut self, yes: bool) -> ExprBuilder {
334 self.flags.multi = yes;
335 self
336 }
337
338 /// Set the default value for the any character (`s`) flag.
339 pub fn dot_matches_new_line(mut self, yes: bool) -> ExprBuilder {
340 self.flags.dotnl = yes;
341 self
342 }
343
344 /// Set the default value for the greedy swap (`U`) flag.
345 pub fn swap_greed(mut self, yes: bool) -> ExprBuilder {
346 self.flags.swap_greed = yes;
347 self
348 }
349
350 /// Set the default value for the ignore whitespace (`x`) flag.
351 pub fn ignore_whitespace(mut self, yes: bool) -> ExprBuilder {
352 self.flags.ignore_space = yes;
353 self
354 }
355
356 /// Set the default value for the Unicode (`u`) flag.
357 ///
358 /// If `yes` is false, then `allow_bytes` is set to true.
359 pub fn unicode(mut self, yes: bool) -> ExprBuilder {
360 self.flags.unicode = yes;
361 if !yes {
362 self.allow_bytes(true)
363 } else {
364 self
365 }
366 }
367
368 /// Whether the parser allows matching arbitrary bytes or not.
369 ///
370 /// When the `u` flag is disabled (either with this builder or in the
371 /// expression itself), the parser switches to interpreting the expression
372 /// as matching arbitrary bytes instead of Unicode codepoints. For example,
373 /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which
374 /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely,
375 /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8.
376 ///
377 /// When `allow_bytes` is disabled (the default), an expression like
378 /// `(?-u:\xFF)` will cause the parser to return an error, since it would
379 /// otherwise match invalid UTF-8. When enabled, it will be allowed.
380 pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder {
381 self.flags.allow_bytes = yes;
382 self
383 }
384
385 /// Set the nesting limit for regular expression parsing.
386 ///
387 /// Regular expressions that nest more than this limit will result in a
388 /// `StackExhausted` error.
389 pub fn nest_limit(mut self, limit: usize) -> ExprBuilder {
390 self.nest_limit = limit;
391 self
392 }
393
394 /// Parse a string as a regular expression using the current configuraiton.
395 pub fn parse(self, s: &str) -> Result<Expr> {
396 Parser::parse(s, self.flags).and_then(|e| e.simplify(self.nest_limit))
397 }
398 }
399
400 impl Expr {
401 /// Parses a string in a regular expression syntax tree.
402 ///
403 /// This is a convenience method for parsing an expression using the
404 /// default configuration. To tweak parsing options (such as which flags
405 /// are enabled by default), use the `ExprBuilder` type.
406 pub fn parse(s: &str) -> Result<Expr> {
407 ExprBuilder::new().parse(s)
408 }
409
410 /// Returns true iff the expression can be repeated by a quantifier.
411 fn can_repeat(&self) -> bool {
412 match *self {
413 Literal{..} | LiteralBytes{..}
414 | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL
415 | Class(_) | ClassBytes(_)
416 | StartLine | EndLine | StartText | EndText
417 | WordBoundary | NotWordBoundary
418 | WordBoundaryAscii | NotWordBoundaryAscii
419 | Group{..}
420 => true,
421 _ => false,
422 }
423 }
424
425 fn simplify(self, nest_limit: usize) -> Result<Expr> {
426 fn combine_literals(es: &mut Vec<Expr>, e: Expr) {
427 match (es.pop(), e) {
428 (None, e) => es.push(e),
429 (Some(Literal { chars: mut chars1, casei: casei1 }),
430 Literal { chars: chars2, casei: casei2 }) => {
431 if casei1 == casei2 {
432 chars1.extend(chars2);
433 es.push(Literal { chars: chars1, casei: casei1 });
434 } else {
435 es.push(Literal { chars: chars1, casei: casei1 });
436 es.push(Literal { chars: chars2, casei: casei2 });
437 }
438 }
439 (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }),
440 LiteralBytes { bytes: bytes2, casei: casei2 }) => {
441 if casei1 == casei2 {
442 bytes1.extend(bytes2);
443 es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
444 } else {
445 es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
446 es.push(LiteralBytes { bytes: bytes2, casei: casei2 });
447 }
448 }
449 (Some(e1), e2) => {
450 es.push(e1);
451 es.push(e2);
452 }
453 }
454 }
455 fn simp(expr: Expr, recurse: usize, limit: usize) -> Result<Expr> {
456 if recurse > limit {
457 return Err(Error {
458 pos: 0,
459 surround: "".to_owned(),
460 kind: ErrorKind::StackExhausted,
461 });
462 }
463 let simplify = |e| simp(e, recurse + 1, limit);
464 Ok(match expr {
465 Repeat { e, r, greedy } => Repeat {
466 e: Box::new(try!(simplify(*e))),
467 r: r,
468 greedy: greedy,
469 },
470 Group { e, i, name } => {
471 let e = try!(simplify(*e));
472 if i.is_none() && name.is_none() && e.can_repeat() {
473 e
474 } else {
475 Group { e: Box::new(e), i: i, name: name }
476 }
477 }
478 Concat(es) => {
479 let mut new_es = Vec::with_capacity(es.len());
480 for e in es {
481 combine_literals(&mut new_es, try!(simplify(e)));
482 }
483 if new_es.len() == 1 {
484 new_es.pop().unwrap()
485 } else {
486 Concat(new_es)
487 }
488 }
489 Alternate(es) => {
490 let mut new_es = Vec::with_capacity(es.len());
491 for e in es {
492 new_es.push(try!(simplify(e)));
493 }
494 Alternate(new_es)
495 }
496 e => e,
497 })
498 }
499 simp(self, 0, nest_limit)
500 }
501
502 /// Returns a set of literal prefixes extracted from this expression.
503 pub fn prefixes(&self) -> Literals {
504 let mut lits = Literals::empty();
505 lits.union_prefixes(self);
506 lits
507 }
508
509 /// Returns a set of literal suffixes extracted from this expression.
510 pub fn suffixes(&self) -> Literals {
511 let mut lits = Literals::empty();
512 lits.union_suffixes(self);
513 lits
514 }
515
516 /// Returns true if and only if the expression is required to match from
517 /// the beginning of text.
518 pub fn is_anchored_start(&self) -> bool {
519 match *self {
520 Repeat { ref e, r, .. } => {
521 !r.matches_empty() && e.is_anchored_start()
522 }
523 Group { ref e, .. } => e.is_anchored_start(),
524 Concat(ref es) => es[0].is_anchored_start(),
525 Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
526 StartText => true,
527 _ => false,
528 }
529 }
530
531 /// Returns true if and only if the expression has at least one matchable
532 /// sub-expression that must match the beginning of text.
533 pub fn has_anchored_start(&self) -> bool {
534 match *self {
535 Repeat { ref e, r, .. } => {
536 !r.matches_empty() && e.has_anchored_start()
537 }
538 Group { ref e, .. } => e.has_anchored_start(),
539 Concat(ref es) => es[0].has_anchored_start(),
540 Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
541 StartText => true,
542 _ => false,
543 }
544 }
545
546 /// Returns true if and only if the expression is required to match at the
547 /// end of the text.
548 pub fn is_anchored_end(&self) -> bool {
549 match *self {
550 Repeat { ref e, r, .. } => {
551 !r.matches_empty() && e.is_anchored_end()
552 }
553 Group { ref e, .. } => e.is_anchored_end(),
554 Concat(ref es) => es[es.len() - 1].is_anchored_end(),
555 Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
556 EndText => true,
557 _ => false,
558 }
559 }
560
561 /// Returns true if and only if the expression has at least one matchable
562 /// sub-expression that must match the beginning of text.
563 pub fn has_anchored_end(&self) -> bool {
564 match *self {
565 Repeat { ref e, r, .. } => {
566 !r.matches_empty() && e.has_anchored_end()
567 }
568 Group { ref e, .. } => e.has_anchored_end(),
569 Concat(ref es) => es[es.len() - 1].has_anchored_end(),
570 Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
571 EndText => true,
572 _ => false,
573 }
574 }
575
576 /// Returns true if and only if the expression contains sub-expressions
577 /// that can match arbitrary bytes.
578 pub fn has_bytes(&self) -> bool {
579 match *self {
580 Repeat { ref e, .. } => e.has_bytes(),
581 Group { ref e, .. } => e.has_bytes(),
582 Concat(ref es) => es.iter().any(|e| e.has_bytes()),
583 Alternate(ref es) => es.iter().any(|e| e.has_bytes()),
584 LiteralBytes{..} => true,
585 AnyByte | AnyByteNoNL => true,
586 ClassBytes(_) => true,
587 WordBoundaryAscii | NotWordBoundaryAscii => true,
588 _ => false,
589 }
590 }
591 }
592
593 impl Deref for CharClass {
594 type Target = Vec<ClassRange>;
595 fn deref(&self) -> &Vec<ClassRange> { &self.ranges }
596 }
597
598 impl IntoIterator for CharClass {
599 type Item = ClassRange;
600 type IntoIter = vec::IntoIter<ClassRange>;
601 fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() }
602 }
603
604 impl<'a> IntoIterator for &'a CharClass {
605 type Item = &'a ClassRange;
606 type IntoIter = slice::Iter<'a, ClassRange>;
607 fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() }
608 }
609
610 impl CharClass {
611 /// Create a new class from an existing set of ranges.
612 pub fn new(ranges: Vec<ClassRange>) -> CharClass {
613 CharClass { ranges: ranges }
614 }
615
616 /// Create an empty class.
617 fn empty() -> CharClass {
618 CharClass::new(Vec::new())
619 }
620
621 /// Returns true if `c` is matched by this character class.
622 pub fn matches(&self, c: char) -> bool {
623 self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok()
624 }
625
626 /// Removes the given character from the class if it exists.
627 ///
628 /// Note that this takes `O(n)` time in the number of ranges.
629 pub fn remove(&mut self, c: char) {
630 let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) {
631 Ok(i) => i,
632 Err(_) => return,
633 };
634 let mut r = self.ranges.remove(i);
635 if r.start == c {
636 r.start = inc_char(c);
637 if r.start > r.end || c == char::MAX {
638 return;
639 }
640 self.ranges.insert(i, r);
641 } else if r.end == c {
642 r.end = dec_char(c);
643 if r.end < r.start || c == '\x00' {
644 return;
645 }
646 self.ranges.insert(0, r);
647 } else {
648 let (mut r1, mut r2) = (r.clone(), r.clone());
649 r1.end = dec_char(c);
650 if r1.start <= r1.end {
651 self.ranges.insert(i, r1);
652 i += 1;
653 }
654 r2.start = inc_char(c);
655 if r2.start <= r2.end {
656 self.ranges.insert(i, r2);
657 }
658 }
659 }
660
661 /// Create a new empty class from this one.
662 fn to_empty(&self) -> CharClass {
663 CharClass { ranges: Vec::with_capacity(self.len()) }
664 }
665
666 /// Create a byte class from this character class.
667 ///
668 /// Codepoints above 0xFF are removed.
669 fn to_byte_class(self) -> ByteClass {
670 ByteClass::new(
671 self.ranges.into_iter()
672 .filter_map(|r| r.to_byte_range())
673 .collect()).canonicalize()
674 }
675
676 /// Merge two classes and canonicalize them.
677 #[cfg(test)]
678 fn merge(mut self, other: CharClass) -> CharClass {
679 self.ranges.extend(other);
680 self.canonicalize()
681 }
682
683 /// Canonicalze any sequence of ranges.
684 ///
685 /// This is responsible for enforcing the canonical format invariants
686 /// as described on the docs for the `CharClass` type.
687 fn canonicalize(mut self) -> CharClass {
688 // TODO: Save some cycles here by checking if already canonicalized.
689 self.ranges.sort();
690 let mut ordered = self.to_empty(); // TODO: Do this in place?
691 for candidate in self {
692 // If the candidate overlaps with an existing range, then it must
693 // be the most recent range added because we process the candidates
694 // in order.
695 if let Some(or) = ordered.ranges.last_mut() {
696 if or.overlapping(candidate) {
697 *or = or.merge(candidate);
698 continue;
699 }
700 }
701 ordered.ranges.push(candidate);
702 }
703 ordered
704 }
705
706 /// Negates the character class.
707 ///
708 /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
709 /// if and only if `c` does not match `self.negate()`.
710 pub fn negate(mut self) -> CharClass {
711 fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) }
712
713 if self.is_empty() {
714 // Inverting an empty range yields all of Unicode.
715 return CharClass {
716 ranges: vec![ClassRange { start: '\x00', end: '\u{10ffff}' }],
717 };
718 }
719 self = self.canonicalize();
720 let mut inv = self.to_empty();
721 if self[0].start > '\x00' {
722 inv.ranges.push(range('\x00', dec_char(self[0].start)));
723 }
724 for win in self.windows(2) {
725 inv.ranges.push(range(inc_char(win[0].end),
726 dec_char(win[1].start)));
727 }
728 if self[self.len() - 1].end < char::MAX {
729 inv.ranges.push(range(inc_char(self[self.len() - 1].end),
730 char::MAX));
731 }
732 inv
733 }
734
735 /// Apply case folding to this character class.
736 ///
737 /// N.B. Applying case folding to a negated character class probably
738 /// won't produce the expected result. e.g., `(?i)[^x]` really should
739 /// match any character sans `x` and `X`, but if `[^x]` is negated
740 /// before being case folded, you'll end up matching any character.
741 pub fn case_fold(self) -> CharClass {
742 let mut folded = self.to_empty();
743 for r in self {
744 // Applying case folding to a range is expensive because *every*
745 // character needs to be examined. Thus, we avoid that drudgery
746 // if no character in the current range is in our case folding
747 // table.
748 if r.needs_case_folding() {
749 folded.ranges.extend(r.case_fold());
750 }
751 folded.ranges.push(r);
752 }
753 folded.canonicalize()
754 }
755
756 /// Returns the number of characters that match this class.
757 fn num_chars(&self) -> usize {
758 self.ranges.iter()
759 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
760 .fold(0, |acc, len| acc + len)
761 as usize
762 }
763 }
764
765 impl ClassRange {
766 /// Create a new class range.
767 ///
768 /// If `end < start`, then the two values are swapped so that
769 /// the invariant `start <= end` is preserved.
770 fn new(start: char, end: char) -> ClassRange {
771 if start <= end {
772 ClassRange { start: start, end: end }
773 } else {
774 ClassRange { start: end, end: start }
775 }
776 }
777
778 /// Translate this to a byte class.
779 ///
780 /// If the start codepoint exceeds 0xFF, then this returns `None`.
781 ///
782 /// If the end codepoint exceeds 0xFF, then it is set to 0xFF.
783 fn to_byte_range(self) -> Option<ByteRange> {
784 if self.start > '\u{FF}' {
785 None
786 } else {
787 let s = self.start as u8;
788 let e = min('\u{FF}', self.end) as u8;
789 Some(ByteRange::new(s, e))
790 }
791 }
792
793 /// Create a range of one character.
794 fn one(c: char) -> ClassRange {
795 ClassRange { start: c, end: c }
796 }
797
798 /// Returns true if and only if the two ranges are overlapping. Note that
799 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
800 fn overlapping(self, other: ClassRange) -> bool {
801 max(self.start, other.start) <= inc_char(min(self.end, other.end))
802 }
803
804 /// Creates a new range representing the union of `self` and `other.
805 fn merge(self, other: ClassRange) -> ClassRange {
806 ClassRange {
807 start: min(self.start, other.start),
808 end: max(self.end, other.end),
809 }
810 }
811
812 /// Returns true if and only if this range contains a character that is
813 /// in the case folding table.
814 fn needs_case_folding(self) -> bool {
815 case_folding::C_plus_S_both_table
816 .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok()
817 }
818
819 /// Apply case folding to this range.
820 ///
821 /// Since case folding might add characters such that the range is no
822 /// longer contiguous, this returns multiple class ranges. They are in
823 /// canonical order.
824 fn case_fold(self) -> Vec<ClassRange> {
825 let table = &case_folding::C_plus_S_both_table;
826 let (s, e) = (self.start as u32, self.end as u32 + 1);
827 let mut start = self.start;
828 let mut end = start;
829 let mut next_case_fold = '\x00';
830 let mut ranges = Vec::with_capacity(10);
831 for mut c in (s..e).filter_map(char::from_u32) {
832 if c >= next_case_fold {
833 c = match simple_case_fold_both_result(c) {
834 Ok(i) => {
835 for &(c1, c2) in &table[i..] {
836 if c1 != c {
837 break;
838 }
839 if c2 != inc_char(end) {
840 ranges.push(ClassRange::new(start, end));
841 start = c2;
842 }
843 end = c2;
844 }
845 continue;
846 }
847 Err(i) => {
848 if i < table.len() {
849 next_case_fold = table[i].0;
850 } else {
851 next_case_fold = '\u{10FFFF}';
852 }
853 c
854 }
855 };
856 }
857 // The fast path. We know this character doesn't have an entry
858 // in the case folding table.
859 if c != inc_char(end) {
860 ranges.push(ClassRange::new(start, end));
861 start = c;
862 }
863 end = c;
864 }
865 ranges.push(ClassRange::new(start, end));
866 ranges
867 }
868 }
869
870 impl PartialEq<char> for ClassRange {
871 #[inline]
872 fn eq(&self, other: &char) -> bool {
873 self.start <= *other && *other <= self.end
874 }
875 }
876
877 impl PartialEq<ClassRange> for char {
878 #[inline]
879 fn eq(&self, other: &ClassRange) -> bool {
880 other.eq(self)
881 }
882 }
883
884 impl PartialOrd<char> for ClassRange {
885 #[inline]
886 fn partial_cmp(&self, other: &char) -> Option<Ordering> {
887 Some(if self == other {
888 Ordering::Equal
889 } else if *other > self.end {
890 Ordering::Greater
891 } else {
892 Ordering::Less
893 })
894 }
895 }
896
897 impl PartialOrd<ClassRange> for char {
898 #[inline]
899 fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> {
900 other.partial_cmp(self).map(|o| o.reverse())
901 }
902 }
903
904 impl ByteClass {
905 /// Create a new class from an existing set of ranges.
906 pub fn new(ranges: Vec<ByteRange>) -> ByteClass {
907 ByteClass { ranges: ranges }
908 }
909
910 /// Returns true if `b` is matched by this byte class.
911 pub fn matches(&self, b: u8) -> bool {
912 self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok()
913 }
914
915 /// Removes the given byte from the class if it exists.
916 ///
917 /// Note that this takes `O(n)` time in the number of ranges.
918 pub fn remove(&mut self, b: u8) {
919 let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) {
920 Ok(i) => i,
921 Err(_) => return,
922 };
923 let mut r = self.ranges.remove(i);
924 if r.start == b {
925 r.start = b.saturating_add(1);
926 if r.start > r.end || b == u8::MAX {
927 return;
928 }
929 self.ranges.insert(i, r);
930 } else if r.end == b {
931 r.end = b.saturating_sub(1);
932 if r.end < r.start || b == b'\x00' {
933 return;
934 }
935 self.ranges.insert(0, r);
936 } else {
937 let (mut r1, mut r2) = (r.clone(), r.clone());
938 r1.end = b.saturating_sub(1);
939 if r1.start <= r1.end {
940 self.ranges.insert(i, r1);
941 i += 1;
942 }
943 r2.start = b.saturating_add(1);
944 if r2.start <= r2.end {
945 self.ranges.insert(i, r2);
946 }
947 }
948 }
949
950 /// Create a new empty class from this one.
951 fn to_empty(&self) -> ByteClass {
952 ByteClass { ranges: Vec::with_capacity(self.len()) }
953 }
954
955 /// Canonicalze any sequence of ranges.
956 ///
957 /// This is responsible for enforcing the canonical format invariants
958 /// as described on the docs for the `ByteClass` type.
959 fn canonicalize(mut self) -> ByteClass {
960 // TODO: Save some cycles here by checking if already canonicalized.
961 self.ranges.sort();
962 let mut ordered = self.to_empty(); // TODO: Do this in place?
963 for candidate in self {
964 // If the candidate overlaps with an existing range, then it must
965 // be the most recent range added because we process the candidates
966 // in order.
967 if let Some(or) = ordered.ranges.last_mut() {
968 if or.overlapping(candidate) {
969 *or = or.merge(candidate);
970 continue;
971 }
972 }
973 ordered.ranges.push(candidate);
974 }
975 ordered
976 }
977
978 /// Negates the byte class.
979 ///
980 /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b`
981 /// does not match `self.negate()`.
982 pub fn negate(mut self) -> ByteClass {
983 fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) }
984
985 if self.is_empty() {
986 // Inverting an empty range yields all bytes.
987 return ByteClass {
988 ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }],
989 };
990 }
991 self = self.canonicalize();
992 let mut inv = self.to_empty();
993 if self[0].start > b'\x00' {
994 inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1)));
995 }
996 for win in self.windows(2) {
997 inv.ranges.push(range(win[0].end.saturating_add(1),
998 win[1].start.saturating_sub(1)));
999 }
1000 if self[self.len() - 1].end < u8::MAX {
1001 inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1),
1002 u8::MAX));
1003 }
1004 inv
1005 }
1006
1007 /// Apply case folding to this byte class.
1008 ///
1009 /// This assumes that the bytes in the ranges are ASCII compatible.
1010 ///
1011 /// N.B. Applying case folding to a negated character class probably
1012 /// won't produce the expected result. e.g., `(?i)[^x]` really should
1013 /// match any character sans `x` and `X`, but if `[^x]` is negated
1014 /// before being case folded, you'll end up matching any character.
1015 pub fn case_fold(self) -> ByteClass {
1016 let mut folded = self.to_empty();
1017 for r in self {
1018 folded.ranges.extend(r.case_fold());
1019 }
1020 folded.canonicalize()
1021 }
1022
1023 /// Returns the number of bytes that match this class.
1024 fn num_bytes(&self) -> usize {
1025 self.ranges.iter()
1026 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
1027 .fold(0, |acc, len| acc + len)
1028 as usize
1029 }
1030 }
1031
1032 impl ByteRange {
1033 /// Create a new class range.
1034 ///
1035 /// If `end < start`, then the two values are swapped so that
1036 /// the invariant `start <= end` is preserved.
1037 fn new(start: u8, end: u8) -> ByteRange {
1038 if start <= end {
1039 ByteRange { start: start, end: end }
1040 } else {
1041 ByteRange { start: end, end: start }
1042 }
1043 }
1044
1045 /// Returns true if and only if the two ranges are overlapping. Note that
1046 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
1047 fn overlapping(self, other: ByteRange) -> bool {
1048 max(self.start, other.start)
1049 <= min(self.end, other.end).saturating_add(1)
1050 }
1051
1052 /// Returns true if and only if the intersection of self and other is non
1053 /// empty.
1054 fn is_intersect_empty(self, other: ByteRange) -> bool {
1055 max(self.start, other.start) > min(self.end, other.end)
1056 }
1057
1058 /// Creates a new range representing the union of `self` and `other.
1059 fn merge(self, other: ByteRange) -> ByteRange {
1060 ByteRange {
1061 start: min(self.start, other.start),
1062 end: max(self.end, other.end),
1063 }
1064 }
1065
1066 /// Apply case folding to this range.
1067 ///
1068 /// Since case folding might add bytes such that the range is no
1069 /// longer contiguous, this returns multiple byte ranges.
1070 ///
1071 /// This assumes that the bytes in this range are ASCII compatible.
1072 fn case_fold(self) -> Vec<ByteRange> {
1073 // So much easier than Unicode case folding!
1074 let mut ranges = vec![self];
1075 if !ByteRange::new(b'a', b'z').is_intersect_empty(self) {
1076 let lower = max(self.start, b'a');
1077 let upper = min(self.end, b'z');
1078 ranges.push(ByteRange::new(lower - 32, upper - 32));
1079 }
1080 if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) {
1081 let lower = max(self.start, b'A');
1082 let upper = min(self.end, b'Z');
1083 ranges.push(ByteRange::new(lower + 32, upper + 32));
1084 }
1085 ranges
1086 }
1087 }
1088
1089 impl Deref for ByteClass {
1090 type Target = Vec<ByteRange>;
1091 fn deref(&self) -> &Vec<ByteRange> { &self.ranges }
1092 }
1093
1094 impl IntoIterator for ByteClass {
1095 type Item = ByteRange;
1096 type IntoIter = vec::IntoIter<ByteRange>;
1097 fn into_iter(self) -> vec::IntoIter<ByteRange> { self.ranges.into_iter() }
1098 }
1099
1100 impl<'a> IntoIterator for &'a ByteClass {
1101 type Item = &'a ByteRange;
1102 type IntoIter = slice::Iter<'a, ByteRange>;
1103 fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() }
1104 }
1105
1106 impl PartialEq<u8> for ByteRange {
1107 #[inline]
1108 fn eq(&self, other: &u8) -> bool {
1109 self.start <= *other && *other <= self.end
1110 }
1111 }
1112
1113 impl PartialEq<ByteRange> for u8 {
1114 #[inline]
1115 fn eq(&self, other: &ByteRange) -> bool {
1116 other.eq(self)
1117 }
1118 }
1119
1120 impl PartialOrd<u8> for ByteRange {
1121 #[inline]
1122 fn partial_cmp(&self, other: &u8) -> Option<Ordering> {
1123 Some(if self == other {
1124 Ordering::Equal
1125 } else if *other > self.end {
1126 Ordering::Greater
1127 } else {
1128 Ordering::Less
1129 })
1130 }
1131 }
1132
1133 impl PartialOrd<ByteRange> for u8 {
1134 #[inline]
1135 fn partial_cmp(&self, other: &ByteRange) -> Option<Ordering> {
1136 other.partial_cmp(self).map(|o| o.reverse())
1137 }
1138 }
1139
1140 /// This implementation of `Display` will write a regular expression from the
1141 /// syntax tree. It does not write the original string parsed.
1142 impl fmt::Display for Expr {
1143 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1144 match *self {
1145 Empty => write!(f, ""),
1146 Literal { ref chars, casei } => {
1147 if casei {
1148 try!(write!(f, "(?iu:"));
1149 } else {
1150 try!(write!(f, "(?u:"));
1151 }
1152 for &c in chars {
1153 try!(write!(f, "{}", quote_char(c)));
1154 }
1155 try!(write!(f, ")"));
1156 Ok(())
1157 }
1158 LiteralBytes { ref bytes, casei } => {
1159 if casei {
1160 try!(write!(f, "(?i-u:"));
1161 } else {
1162 try!(write!(f, "(?-u:"));
1163 }
1164 for &b in bytes {
1165 try!(write!(f, "{}", quote_byte(b)));
1166 }
1167 try!(write!(f, ")"));
1168 Ok(())
1169 }
1170 AnyChar => write!(f, "(?su:.)"),
1171 AnyCharNoNL => write!(f, "(?u:.)"),
1172 AnyByte => write!(f, "(?s-u:.)"),
1173 AnyByteNoNL => write!(f, "(?-u:.)"),
1174 Class(ref cls) => write!(f, "{}", cls),
1175 ClassBytes(ref cls) => write!(f, "{}", cls),
1176 StartLine => write!(f, "(?m:^)"),
1177 EndLine => write!(f, "(?m:$)"),
1178 StartText => write!(f, r"^"),
1179 EndText => write!(f, r"$"),
1180 WordBoundary => write!(f, r"(?u:\b)"),
1181 NotWordBoundary => write!(f, r"(?u:\B)"),
1182 WordBoundaryAscii => write!(f, r"(?-u:\b)"),
1183 NotWordBoundaryAscii => write!(f, r"(?-u:\B)"),
1184 Group { ref e, i: None, name: None } => write!(f, "(?:{})", e),
1185 Group { ref e, name: None, .. } => write!(f, "({})", e),
1186 Group { ref e, name: Some(ref n), .. } => {
1187 write!(f, "(?P<{}>{})", n, e)
1188 }
1189 Repeat { ref e, r, greedy } => {
1190 match &**e {
1191 &Literal { ref chars, .. } if chars.len() > 1 => {
1192 try!(write!(f, "(?:{}){}", e, r))
1193 }
1194 _ => try!(write!(f, "{}{}", e, r)),
1195 }
1196 if !greedy { try!(write!(f, "?")); }
1197 Ok(())
1198 }
1199 Concat(ref es) => {
1200 for e in es {
1201 try!(write!(f, "{}", e));
1202 }
1203 Ok(())
1204 }
1205 Alternate(ref es) => {
1206 for (i, e) in es.iter().enumerate() {
1207 if i > 0 { try!(write!(f, "|")); }
1208 try!(write!(f, "{}", e));
1209 }
1210 Ok(())
1211 }
1212 }
1213 }
1214 }
1215
1216 impl fmt::Display for Repeater {
1217 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1218 match *self {
1219 ZeroOrOne => write!(f, "?"),
1220 ZeroOrMore => write!(f, "*"),
1221 OneOrMore => write!(f, "+"),
1222 Range { min: s, max: None } => write!(f, "{{{},}}", s),
1223 Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s),
1224 Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e),
1225 }
1226 }
1227 }
1228
1229 impl fmt::Display for CharClass {
1230 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1231 try!(write!(f, "(?u:["));
1232 for range in self.iter() {
1233 if range.start == '-' || range.end == '-' {
1234 try!(write!(f, "-"));
1235 break;
1236 }
1237 }
1238 for range in self.iter() {
1239 let mut range = *range;
1240 if range.start == '-' {
1241 range.start = ((range.start as u8) + 1) as char;
1242 }
1243 if range.end == '-' {
1244 range.end = ((range.end as u8) - 1) as char;
1245 }
1246 if range.start > range.end {
1247 continue;
1248 }
1249 try!(write!(f, "{}", range));
1250 }
1251 try!(write!(f, "])"));
1252 Ok(())
1253 }
1254 }
1255
1256 impl fmt::Display for ClassRange {
1257 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1258 write!(f, "{}-{}", quote_char(self.start), quote_char(self.end))
1259 }
1260 }
1261
1262 impl fmt::Display for ByteClass {
1263 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1264 try!(write!(f, "(?-u:["));
1265 for range in self.iter() {
1266 if range.start == b'-' || range.end == b'-' {
1267 try!(write!(f, "-"));
1268 break;
1269 }
1270 }
1271 for range in self.iter() {
1272 let mut range = *range;
1273 if range.start == b'-' {
1274 range.start += 1;
1275 }
1276 if range.end == b'-' {
1277 range.start -= 1;
1278 }
1279 if range.start > range.end {
1280 continue;
1281 }
1282 try!(write!(f, "{}", range));
1283 }
1284 try!(write!(f, "])"));
1285 Ok(())
1286 }
1287 }
1288
1289 impl fmt::Display for ByteRange {
1290 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1291 write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end))
1292 }
1293 }
1294
1295 /// An alias for computations that can return a `Error`.
1296 pub type Result<T> = ::std::result::Result<T, Error>;
1297
1298 /// A parse error.
1299 ///
1300 /// This includes details about the specific type of error and a rough
1301 /// approximation of where it occurred.
1302 #[derive(Clone, Debug, PartialEq)]
1303 pub struct Error {
1304 pos: usize,
1305 surround: String,
1306 kind: ErrorKind,
1307 }
1308
1309 /// The specific type of parse error that can occur.
1310 #[derive(Clone, Debug, PartialEq)]
1311 pub enum ErrorKind {
1312 /// A negation symbol is used twice in flag settings.
1313 /// e.g., `(?-i-s)`.
1314 DoubleFlagNegation,
1315 /// The same capture name was used more than once.
1316 /// e.g., `(?P<a>.)(?P<a>.)`.
1317 DuplicateCaptureName(String),
1318 /// An alternate is empty. e.g., `(|a)`.
1319 EmptyAlternate,
1320 /// A capture group name is empty. e.g., `(?P<>a)`.
1321 EmptyCaptureName,
1322 /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
1323 EmptyFlagNegation,
1324 /// A group is empty. e.g., `()`.
1325 EmptyGroup,
1326 /// An invalid number was used in a counted repetition. e.g., `a{b}`.
1327 InvalidBase10(String),
1328 /// An invalid hexadecimal number was used in an escape sequence.
1329 /// e.g., `\xAG`.
1330 InvalidBase16(String),
1331 /// An invalid capture name was used. e.g., `(?P<0a>b)`.
1332 InvalidCaptureName(String),
1333 /// An invalid class range was givien. Specifically, when the start of the
1334 /// range is greater than the end. e.g., `[z-a]`.
1335 InvalidClassRange {
1336 /// The first character specified in the range.
1337 start: char,
1338 /// The second character specified in the range.
1339 end: char,
1340 },
1341 /// An escape sequence was used in a character class where it is not
1342 /// allowed. e.g., `[a-\pN]` or `[\A]`.
1343 InvalidClassEscape(Expr),
1344 /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
1345 InvalidRepeatRange {
1346 /// The first number specified in the repetition.
1347 min: u32,
1348 /// The second number specified in the repetition.
1349 max: u32,
1350 },
1351 /// An invalid Unicode scalar value was used in a long hexadecimal
1352 /// sequence. e.g., `\x{D800}`.
1353 InvalidScalarValue(u32),
1354 /// An empty counted repetition operator. e.g., `a{}`.
1355 MissingBase10,
1356 /// A repetition operator was not applied to an expression. e.g., `*`.
1357 RepeaterExpectsExpr,
1358 /// A repetition operator was applied to an expression that cannot be
1359 /// repeated. e.g., `a+*` or `a|*`.
1360 RepeaterUnexpectedExpr(Expr),
1361 /// A capture group name that is never closed. e.g., `(?P<a`.
1362 UnclosedCaptureName(String),
1363 /// An unclosed hexadecimal literal. e.g., `\x{a`.
1364 UnclosedHex,
1365 /// An unclosed parenthesis. e.g., `(a`.
1366 UnclosedParen,
1367 /// An unclosed counted repetition operator. e.g., `a{2`.
1368 UnclosedRepeat,
1369 /// An unclosed named Unicode class. e.g., `\p{Yi`.
1370 UnclosedUnicodeName,
1371 /// Saw end of regex before class was closed. e.g., `[a`.
1372 UnexpectedClassEof,
1373 /// Saw end of regex before escape sequence was closed. e.g., `\`.
1374 UnexpectedEscapeEof,
1375 /// Saw end of regex before flags were closed. e.g., `(?i`.
1376 UnexpectedFlagEof,
1377 /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
1378 UnexpectedTwoDigitHexEof,
1379 /// Unopened parenthesis. e.g., `)`.
1380 UnopenedParen,
1381 /// Unrecognized escape sequence. e.g., `\q`.
1382 UnrecognizedEscape(char),
1383 /// Unrecognized flag. e.g., `(?a)`.
1384 UnrecognizedFlag(char),
1385 /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
1386 UnrecognizedUnicodeClass(String),
1387 /// Indicates that the regex uses too much nesting.
1388 ///
1389 /// (N.B. This error exists because traversing the Expr is recursive and
1390 /// an explicit heap allocated stack is not (yet?) used. Regardless, some
1391 /// sort of limit must be applied to avoid unbounded memory growth.
1392 StackExhausted,
1393 /// A disallowed flag was found (e.g., `u`).
1394 FlagNotAllowed(char),
1395 /// A Unicode class was used when the Unicode (`u`) flag was disabled.
1396 UnicodeNotAllowed,
1397 /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes.
1398 /// This never returned if the parser is permitted to allow expressions
1399 /// that match arbitrary bytes.
1400 InvalidUtf8,
1401 /// A character class was constructed such that it is empty.
1402 /// e.g., `[^\d\D]`.
1403 EmptyClass,
1404 /// Hints that destructuring should not be exhaustive.
1405 ///
1406 /// This enum may grow additional variants, so this makes sure clients
1407 /// don't count on exhaustive matching. (Otherwise, adding a new variant
1408 /// could break existing code.)
1409 #[doc(hidden)]
1410 __Nonexhaustive,
1411 }
1412
1413 impl Error {
1414 /// Returns an approximate *character* offset at which the error occurred.
1415 ///
1416 /// The character offset may be equal to the number of characters in the
1417 /// string, in which case it should be interpreted as pointing to the end
1418 /// of the regex.
1419 pub fn position(&self) -> usize {
1420 self.pos
1421 }
1422
1423 /// Returns the type of the regex parse error.
1424 pub fn kind(&self) -> &ErrorKind {
1425 &self.kind
1426 }
1427 }
1428
1429 impl ErrorKind {
1430 fn description(&self) -> &str {
1431 use ErrorKind::*;
1432 match *self {
1433 DoubleFlagNegation => "double flag negation",
1434 DuplicateCaptureName(_) => "duplicate capture name",
1435 EmptyAlternate => "empty alternate",
1436 EmptyCaptureName => "empty capture name",
1437 EmptyFlagNegation => "flag negation without any flags",
1438 EmptyGroup => "empty group (e.g., '()')",
1439 InvalidBase10(_) => "invalid base 10 number",
1440 InvalidBase16(_) => "invalid base 16 number",
1441 InvalidCaptureName(_) => "invalid capture name",
1442 InvalidClassRange{..} => "invalid character class range",
1443 InvalidClassEscape(_) => "invalid escape sequence in class",
1444 InvalidRepeatRange{..} => "invalid counted repetition range",
1445 InvalidScalarValue(_) => "invalid Unicode scalar value",
1446 MissingBase10 => "missing count in repetition operator",
1447 RepeaterExpectsExpr => "repetition operator missing expression",
1448 RepeaterUnexpectedExpr(_) => "expression cannot be repeated",
1449 UnclosedCaptureName(_) => "unclosed capture group name",
1450 UnclosedHex => "unclosed hexadecimal literal",
1451 UnclosedParen => "unclosed parenthesis",
1452 UnclosedRepeat => "unclosed counted repetition operator",
1453 UnclosedUnicodeName => "unclosed Unicode class literal",
1454 UnexpectedClassEof => "unexpected EOF in character class",
1455 UnexpectedEscapeEof => "unexpected EOF in escape sequence",
1456 UnexpectedFlagEof => "unexpected EOF in flags",
1457 UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal",
1458 UnopenedParen => "unopened parenthesis",
1459 UnrecognizedEscape(_) => "unrecognized escape sequence",
1460 UnrecognizedFlag(_) => "unrecognized flag",
1461 UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name",
1462 StackExhausted => "stack exhausted, too much nesting",
1463 FlagNotAllowed(_) => "flag not allowed",
1464 UnicodeNotAllowed => "Unicode features not allowed",
1465 InvalidUtf8 => "matching arbitrary bytes is not allowed",
1466 EmptyClass => "empty character class",
1467 __Nonexhaustive => unreachable!(),
1468 }
1469 }
1470 }
1471
1472 impl ::std::error::Error for Error {
1473 fn description(&self) -> &str {
1474 self.kind.description()
1475 }
1476 }
1477
1478 impl fmt::Display for Error {
1479 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1480 if let ErrorKind::StackExhausted = self.kind {
1481 write!(f, "Error parsing regex: {}", self.kind)
1482 } else {
1483 write!(
1484 f, "Error parsing regex near '{}' at character offset {}: {}",
1485 self.surround, self.pos, self.kind)
1486 }
1487 }
1488 }
1489
1490 impl fmt::Display for ErrorKind {
1491 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1492 use ErrorKind::*;
1493 match *self {
1494 DoubleFlagNegation =>
1495 write!(f, "Only one negation symbol is allowed in flags."),
1496 DuplicateCaptureName(ref s) =>
1497 write!(f, "Capture name '{}' is used more than once.", s),
1498 EmptyAlternate =>
1499 write!(f, "Alternations cannot be empty."),
1500 EmptyCaptureName =>
1501 write!(f, "Capture names cannot be empty."),
1502 EmptyFlagNegation =>
1503 write!(f, "Flag negation requires setting at least one flag."),
1504 EmptyGroup =>
1505 write!(f, "Empty regex groups (e.g., '()') are not allowed."),
1506 InvalidBase10(ref s) =>
1507 write!(f, "Not a valid base 10 number: '{}'", s),
1508 InvalidBase16(ref s) =>
1509 write!(f, "Not a valid base 16 number: '{}'", s),
1510 InvalidCaptureName(ref s) =>
1511 write!(f, "Invalid capture name: '{}'. Capture names must \
1512 consist of [_a-zA-Z0-9] and are not allowed to \
1513 start with with a number.", s),
1514 InvalidClassRange { start, end } =>
1515 write!(f, "Invalid character class range '{}-{}'. \
1516 Character class ranges must start with the smaller \
1517 character, but {} > {}", start, end, start, end),
1518 InvalidClassEscape(ref e) =>
1519 write!(f, "Invalid escape sequence in character \
1520 class: '{}'.", e),
1521 InvalidRepeatRange { min, max } =>
1522 write!(f, "Invalid counted repetition range: {{{}, {}}}. \
1523 Counted repetition ranges must start with the \
1524 minimum, but {} > {}", min, max, min, max),
1525 InvalidScalarValue(c) =>
1526 write!(f, "Number does not correspond to a Unicode scalar \
1527 value: '{}'.", c),
1528 MissingBase10 =>
1529 write!(f, "Missing maximum in counted reptition operator."),
1530 RepeaterExpectsExpr =>
1531 write!(f, "Missing expression for reptition operator."),
1532 RepeaterUnexpectedExpr(ref e) =>
1533 write!(f, "Invalid application of reptition operator to: \
1534 '{}'.", e),
1535 UnclosedCaptureName(ref s) =>
1536 write!(f, "Capture name group for '{}' is not closed. \
1537 (Missing a '>'.)", s),
1538 UnclosedHex =>
1539 write!(f, "Unclosed hexadecimal literal (missing a '}}')."),
1540 UnclosedParen =>
1541 write!(f, "Unclosed parenthesis."),
1542 UnclosedRepeat =>
1543 write!(f, "Unclosed counted repetition (missing a '}}')."),
1544 UnclosedUnicodeName =>
1545 write!(f, "Unclosed Unicode literal (missing a '}}')."),
1546 UnexpectedClassEof =>
1547 write!(f, "Character class was not closed before the end of \
1548 the regex (missing a ']')."),
1549 UnexpectedEscapeEof =>
1550 write!(f, "Started an escape sequence that didn't finish \
1551 before the end of the regex."),
1552 UnexpectedFlagEof =>
1553 write!(f, "Inline flag settings was not closed before the end \
1554 of the regex (missing a ')' or ':')."),
1555 UnexpectedTwoDigitHexEof =>
1556 write!(f, "Unexpected end of two digit hexadecimal literal."),
1557 UnopenedParen =>
1558 write!(f, "Unopened parenthesis."),
1559 UnrecognizedEscape(c) =>
1560 write!(f, "Unrecognized escape sequence: '\\{}'.", c),
1561 UnrecognizedFlag(c) =>
1562 write!(f, "Unrecognized flag: '{}'. \
1563 (Allowed flags: i, m, s, U, u, x.)", c),
1564 UnrecognizedUnicodeClass(ref s) =>
1565 write!(f, "Unrecognized Unicode class name: '{}'.", s),
1566 StackExhausted =>
1567 write!(f, "Exhausted space required to parse regex with too \
1568 much nesting."),
1569 FlagNotAllowed(flag) =>
1570 write!(f, "Use of the flag '{}' is not allowed.", flag),
1571 UnicodeNotAllowed =>
1572 write!(f, "Unicode features are not allowed when the Unicode \
1573 (u) flag is not set."),
1574 InvalidUtf8 =>
1575 write!(f, "Matching arbitrary bytes is not allowed."),
1576 EmptyClass =>
1577 write!(f, "Empty character classes are not allowed."),
1578 __Nonexhaustive => unreachable!(),
1579 }
1580 }
1581 }
1582
1583 /// The result of binary search on the simple case folding table.
1584 ///
1585 /// Note that this binary search is done on the "both" table, such that
1586 /// the index returned corresponds to the *first* location of `c1` in the
1587 /// table. The table can then be scanned linearly starting from the position
1588 /// returned to find other case mappings for `c1`.
1589 fn simple_case_fold_both_result(c1: char) -> result::Result<usize, usize> {
1590 let table = &case_folding::C_plus_S_both_table;
1591 let i = binary_search(table, |&(c2, _)| c1 <= c2);
1592 if i >= table.len() || table[i].0 != c1 {
1593 Err(i)
1594 } else {
1595 Ok(i)
1596 }
1597 }
1598
1599 /// Binary search to find first element such that `pred(T) == true`.
1600 ///
1601 /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`.
1602 ///
1603 /// If all elements yield `pred(T) == false`, then `xs.len()` is returned.
1604 fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
1605 where F: FnMut(&T) -> bool {
1606 let (mut left, mut right) = (0, xs.len());
1607 while left < right {
1608 let mid = (left + right) / 2;
1609 if pred(&xs[mid]) {
1610 right = mid;
1611 } else {
1612 left = mid + 1;
1613 }
1614 }
1615 left
1616 }
1617
1618 /// Escapes all regular expression meta characters in `text`.
1619 ///
1620 /// The string returned may be safely used as a literal in a regular
1621 /// expression.
1622 pub fn quote(text: &str) -> String {
1623 let mut quoted = String::with_capacity(text.len());
1624 for c in text.chars() {
1625 if parser::is_punct(c) {
1626 quoted.push('\\');
1627 }
1628 quoted.push(c);
1629 }
1630 quoted
1631 }
1632
1633 fn quote_char(c: char) -> String {
1634 let mut s = String::new();
1635 if parser::is_punct(c) {
1636 s.push('\\');
1637 }
1638 s.push(c);
1639 s
1640 }
1641
1642 fn quote_byte(b: u8) -> String {
1643 if parser::is_punct(b as char) || b == b'\'' || b == b'"' {
1644 quote_char(b as char)
1645 } else {
1646 let escaped: Vec<u8> = ascii::escape_default(b).collect();
1647 String::from_utf8(escaped).unwrap()
1648 }
1649 }
1650
1651 fn inc_char(c: char) -> char {
1652 match c {
1653 char::MAX => char::MAX,
1654 '\u{D7FF}' => '\u{E000}',
1655 c => char::from_u32(c as u32 + 1).unwrap(),
1656 }
1657 }
1658
1659 fn dec_char(c: char) -> char {
1660 match c {
1661 '\x00' => '\x00',
1662 '\u{E000}' => '\u{D7FF}',
1663 c => char::from_u32(c as u32 - 1).unwrap(),
1664 }
1665 }
1666
1667 /// Returns true if and only if `c` is a word character.
1668 #[doc(hidden)]
1669 pub fn is_word_char(c: char) -> bool {
1670 match c {
1671 '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
1672 _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
1673 if c >= start && c <= end {
1674 Ordering::Equal
1675 } else if start > c {
1676 Ordering::Greater
1677 } else {
1678 Ordering::Less
1679 }
1680 }).is_ok(),
1681 }
1682 }
1683
1684 /// Returns true if and only if `c` is an ASCII word byte.
1685 #[doc(hidden)]
1686 pub fn is_word_byte(b: u8) -> bool {
1687 match b {
1688 b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' => true,
1689 _ => false,
1690 }
1691 }
1692
1693 #[cfg(test)]
1694 mod properties;
1695
1696 #[cfg(test)]
1697 mod tests {
1698 use {CharClass, ClassRange, ByteClass, ByteRange, Expr};
1699
1700 fn class(ranges: &[(char, char)]) -> CharClass {
1701 let ranges = ranges.iter().cloned()
1702 .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1703 CharClass::new(ranges)
1704 }
1705
1706 fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1707 let ranges = ranges.iter().cloned()
1708 .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1709 ByteClass::new(ranges)
1710 }
1711
1712 fn e(re: &str) -> Expr { Expr::parse(re).unwrap() }
1713
1714 #[test]
1715 fn stack_exhaustion() {
1716 use std::iter::repeat;
1717
1718 let open: String = repeat('(').take(200).collect();
1719 let close: String = repeat(')').take(200).collect();
1720 assert!(Expr::parse(&format!("{}a{}", open, close)).is_ok());
1721
1722 let open: String = repeat('(').take(200 + 1).collect();
1723 let close: String = repeat(')').take(200 + 1).collect();
1724 assert!(Expr::parse(&format!("{}a{}", open, close)).is_err());
1725 }
1726
1727 #[test]
1728 fn anchored_start() {
1729 assert!(e("^a").is_anchored_start());
1730 assert!(e("(^a)").is_anchored_start());
1731 assert!(e("^a|^b").is_anchored_start());
1732 assert!(e("(^a)|(^b)").is_anchored_start());
1733 assert!(e("(^(a|b))").is_anchored_start());
1734
1735 assert!(!e("^a|b").is_anchored_start());
1736 assert!(!e("a|^b").is_anchored_start());
1737 }
1738
1739 #[test]
1740 fn anchored_end() {
1741 assert!(e("a$").is_anchored_end());
1742 assert!(e("(a$)").is_anchored_end());
1743 assert!(e("a$|b$").is_anchored_end());
1744 assert!(e("(a$)|(b$)").is_anchored_end());
1745 assert!(e("((a|b)$)").is_anchored_end());
1746
1747 assert!(!e("a$|b").is_anchored_end());
1748 assert!(!e("a|b$").is_anchored_end());
1749 }
1750
1751 #[test]
1752 fn class_canon_no_change() {
1753 let cls = class(&[('a', 'c'), ('x', 'z')]);
1754 assert_eq!(cls.clone().canonicalize(), cls);
1755 }
1756
1757 #[test]
1758 fn class_canon_unordered() {
1759 let cls = class(&[('x', 'z'), ('a', 'c')]);
1760 assert_eq!(cls.canonicalize(), class(&[
1761 ('a', 'c'), ('x', 'z'),
1762 ]));
1763 }
1764
1765 #[test]
1766 fn class_canon_overlap() {
1767 let cls = class(&[('x', 'z'), ('w', 'y')]);
1768 assert_eq!(cls.canonicalize(), class(&[
1769 ('w', 'z'),
1770 ]));
1771 }
1772
1773 #[test]
1774 fn class_canon_overlap_many() {
1775 let cls = class(&[
1776 ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1777 ('m', 'p'), ('l', 's'),
1778 ]);
1779 assert_eq!(cls.clone().canonicalize(), class(&[
1780 ('a', 'j'), ('l', 's'),
1781 ]));
1782 }
1783
1784 #[test]
1785 fn class_canon_overlap_boundary() {
1786 let cls = class(&[('x', 'z'), ('u', 'w')]);
1787 assert_eq!(cls.canonicalize(), class(&[
1788 ('u', 'z'),
1789 ]));
1790 }
1791
1792 #[test]
1793 fn class_canon_extreme_edge_case() {
1794 let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1795 assert_eq!(cls.canonicalize(), class(&[
1796 ('\x00', '\u{10FFFF}'),
1797 ]));
1798 }
1799
1800 #[test]
1801 fn class_canon_singles() {
1802 let cls = class(&[('a', 'a'), ('b', 'b')]);
1803 assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
1804 }
1805
1806 #[test]
1807 fn class_negate_single() {
1808 let cls = class(&[('a', 'a')]);
1809 assert_eq!(cls.negate(), class(&[
1810 ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
1811 ]));
1812 }
1813
1814 #[test]
1815 fn class_negate_singles() {
1816 let cls = class(&[('a', 'a'), ('b', 'b')]);
1817 assert_eq!(cls.negate(), class(&[
1818 ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
1819 ]));
1820 }
1821
1822 #[test]
1823 fn class_negate_multiples() {
1824 let cls = class(&[('a', 'c'), ('x', 'z')]);
1825 assert_eq!(cls.negate(), class(&[
1826 ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
1827 ]));
1828 }
1829
1830 #[test]
1831 fn class_negate_min_scalar() {
1832 let cls = class(&[('\x00', 'a')]);
1833 assert_eq!(cls.negate(), class(&[
1834 ('\x62', '\u{10FFFF}'),
1835 ]));
1836 }
1837
1838 #[test]
1839 fn class_negate_max_scalar() {
1840 let cls = class(&[('a', '\u{10FFFF}')]);
1841 assert_eq!(cls.negate(), class(&[
1842 ('\x00', '\x60'),
1843 ]));
1844 }
1845
1846 #[test]
1847 fn class_negate_everything() {
1848 let cls = class(&[('\x00', '\u{10FFFF}')]);
1849 assert_eq!(cls.negate(), class(&[]));
1850 }
1851
1852 #[test]
1853 fn class_negate_everything_sans_one() {
1854 let cls = class(&[
1855 ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
1856 ]);
1857 assert_eq!(cls.negate(), class(&[
1858 ('\u{10FFFE}', '\u{10FFFE}'),
1859 ]));
1860 }
1861
1862 #[test]
1863 fn class_negate_surrogates_min() {
1864 let cls = class(&[('\x00', '\u{D7FF}')]);
1865 assert_eq!(cls.negate(), class(&[
1866 ('\u{E000}', '\u{10FFFF}'),
1867 ]));
1868 }
1869
1870 #[test]
1871 fn class_negate_surrogates_min_edge() {
1872 let cls = class(&[('\x00', '\u{D7FE}')]);
1873 assert_eq!(cls.negate(), class(&[
1874 ('\u{D7FF}', '\u{10FFFF}'),
1875 ]));
1876 }
1877
1878 #[test]
1879 fn class_negate_surrogates_max() {
1880 let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
1881 assert_eq!(cls.negate(), class(&[
1882 ('\x00', '\u{D7FF}'),
1883 ]));
1884 }
1885
1886 #[test]
1887 fn class_negate_surrogates_max_edge() {
1888 let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
1889 assert_eq!(cls.negate(), class(&[
1890 ('\x00', '\u{E000}'),
1891 ]));
1892 }
1893
1894 #[test]
1895 fn class_canon_overlap_many_case_fold() {
1896 let cls = class(&[
1897 ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
1898 ('M', 'P'), ('L', 'S'), ('c', 'f'),
1899 ]);
1900 assert_eq!(cls.case_fold(), class(&[
1901 ('A', 'J'), ('L', 'S'),
1902 ('a', 'j'), ('l', 's'),
1903 ('\u{17F}', '\u{17F}'),
1904 ]));
1905
1906 let cls = bclass(&[
1907 (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
1908 (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
1909 ]);
1910 assert_eq!(cls.case_fold(), bclass(&[
1911 (b'A', b'J'), (b'L', b'S'),
1912 (b'a', b'j'), (b'l', b's'),
1913 ]));
1914 }
1915
1916 #[test]
1917 fn class_fold_az() {
1918 let cls = class(&[('A', 'Z')]);
1919 assert_eq!(cls.case_fold(), class(&[
1920 ('A', 'Z'), ('a', 'z'),
1921 ('\u{17F}', '\u{17F}'),
1922 ('\u{212A}', '\u{212A}'),
1923 ]));
1924 let cls = class(&[('a', 'z')]);
1925 assert_eq!(cls.case_fold(), class(&[
1926 ('A', 'Z'), ('a', 'z'),
1927 ('\u{17F}', '\u{17F}'),
1928 ('\u{212A}', '\u{212A}'),
1929 ]));
1930
1931 let cls = bclass(&[(b'A', b'Z')]);
1932 assert_eq!(cls.case_fold(), bclass(&[
1933 (b'A', b'Z'), (b'a', b'z'),
1934 ]));
1935 let cls = bclass(&[(b'a', b'z')]);
1936 assert_eq!(cls.case_fold(), bclass(&[
1937 (b'A', b'Z'), (b'a', b'z'),
1938 ]));
1939 }
1940
1941 #[test]
1942 fn class_fold_a_underscore() {
1943 let cls = class(&[('A', 'A'), ('_', '_')]);
1944 assert_eq!(cls.clone().canonicalize(), class(&[
1945 ('A', 'A'), ('_', '_'),
1946 ]));
1947 assert_eq!(cls.case_fold(), class(&[
1948 ('A', 'A'), ('_', '_'), ('a', 'a'),
1949 ]));
1950
1951 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1952 assert_eq!(cls.clone().canonicalize(), bclass(&[
1953 (b'A', b'A'), (b'_', b'_'),
1954 ]));
1955 assert_eq!(cls.case_fold(), bclass(&[
1956 (b'A', b'A'), (b'_', b'_'), (b'a', b'a'),
1957 ]));
1958 }
1959
1960 #[test]
1961 fn class_fold_a_equals() {
1962 let cls = class(&[('A', 'A'), ('=', '=')]);
1963 assert_eq!(cls.clone().canonicalize(), class(&[
1964 ('=', '='), ('A', 'A'),
1965 ]));
1966 assert_eq!(cls.case_fold(), class(&[
1967 ('=', '='), ('A', 'A'), ('a', 'a'),
1968 ]));
1969
1970 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1971 assert_eq!(cls.clone().canonicalize(), bclass(&[
1972 (b'=', b'='), (b'A', b'A'),
1973 ]));
1974 assert_eq!(cls.case_fold(), bclass(&[
1975 (b'=', b'='), (b'A', b'A'), (b'a', b'a'),
1976 ]));
1977 }
1978
1979 #[test]
1980 fn class_fold_no_folding_needed() {
1981 let cls = class(&[('\x00', '\x10')]);
1982 assert_eq!(cls.case_fold(), class(&[
1983 ('\x00', '\x10'),
1984 ]));
1985
1986 let cls = bclass(&[(b'\x00', b'\x10')]);
1987 assert_eq!(cls.case_fold(), bclass(&[
1988 (b'\x00', b'\x10'),
1989 ]));
1990 }
1991
1992 #[test]
1993 fn class_fold_negated() {
1994 let cls = class(&[('x', 'x')]);
1995 assert_eq!(cls.clone().case_fold(), class(&[
1996 ('X', 'X'), ('x', 'x'),
1997 ]));
1998 assert_eq!(cls.case_fold().negate(), class(&[
1999 ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'),
2000 ]));
2001
2002 let cls = bclass(&[(b'x', b'x')]);
2003 assert_eq!(cls.clone().case_fold(), bclass(&[
2004 (b'X', b'X'), (b'x', b'x'),
2005 ]));
2006 assert_eq!(cls.case_fold().negate(), bclass(&[
2007 (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'),
2008 ]));
2009 }
2010
2011 #[test]
2012 fn class_fold_single_to_multiple() {
2013 let cls = class(&[('k', 'k')]);
2014 assert_eq!(cls.case_fold(), class(&[
2015 ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2016 ]));
2017
2018 let cls = bclass(&[(b'k', b'k')]);
2019 assert_eq!(cls.case_fold(), bclass(&[
2020 (b'K', b'K'), (b'k', b'k'),
2021 ]));
2022 }
2023
2024 #[test]
2025 fn class_fold_at() {
2026 let cls = class(&[('@', '@')]);
2027 assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')]));
2028 assert_eq!(cls.case_fold(), class(&[('@', '@')]));
2029
2030 let cls = bclass(&[(b'@', b'@')]);
2031 assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')]));
2032 assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')]));
2033 }
2034
2035 #[test]
2036 fn roundtrip_class_hypen() {
2037 let expr = e("[-./]");
2038 assert_eq!("(?u:[-\\.-/])", expr.to_string());
2039
2040 let expr = e("(?-u)[-./]");
2041 assert_eq!("(?-u:[-\\.-/])", expr.to_string());
2042 }
2043 }