1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
12 This crate provides a regular expression parser and an abstract syntax for
13 regular expressions. The abstract syntax is defined by the `Expr` type. The
14 concrete syntax is enumerated in the
15 [`regex`](../regex/index.html#syntax)
18 Note that since this crate is first and foremost an implementation detail for
19 the `regex` crate, it may experience more frequent breaking changes. It is
20 exposed as a separate crate so that others may use it to do analysis on regular
21 expressions or even build their own matching engine.
23 # Example: parsing an expression
25 Parsing a regular expression can be done with the `Expr::parse` function.
28 use regex_syntax::Expr;
30 assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
31 Expr::Literal { chars: vec!['a', 'b'], casei: false },
32 Expr::Literal { chars: vec!['y', 'z'], casei: false },
36 # Example: inspecting an error
38 The parser in this crate provides very detailed error values. For example,
39 if an invalid character class range is given:
42 use regex_syntax::{Expr, ErrorKind};
44 let err = Expr::parse(r"[z-a]").unwrap_err();
45 assert_eq!(err.position(), 4);
46 assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
52 Or unbalanced parentheses:
55 use regex_syntax::{Expr, ErrorKind};
57 let err = Expr::parse(r"ab(cd").unwrap_err();
58 assert_eq!(err.position(), 2);
59 assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
63 #![deny(missing_docs)]
64 #![cfg_attr(test, deny(warnings))]
66 #[cfg(test)] extern crate quickcheck;
67 #[cfg(test)] extern crate rand;
75 use std
::cmp
::{Ordering, max, min}
;
77 use std
::iter
::IntoIterator
;
84 use unicode
::case_folding
;
87 use self::Repeater
::*;
89 use parser
::{Flags, Parser}
;
91 pub use literals
::{Literals, Lit}
;
93 /// A regular expression abstract syntax tree.
95 /// An `Expr` represents the abstract syntax of a regular expression.
96 #[derive(Clone, Debug, PartialEq, Eq)]
98 /// An empty regex (which never matches any text).
100 /// A sequence of one or more literal characters to be matched.
104 /// Whether to match case insensitively.
107 /// A sequence of one or more literal bytes to be matched.
111 /// Whether to match case insensitively.
113 /// The interpretation of "case insensitive" in this context is
114 /// ambiguous since `bytes` can be arbitrary. However, a good heuristic
115 /// is to assume that the bytes are ASCII-compatible and do simple
116 /// ASCII case folding.
119 /// Match any character.
121 /// Match any character, excluding new line (`0xA`).
125 /// Match any byte, excluding new line (`0xA`).
127 /// A character class.
129 /// A character class with byte ranges only.
130 ClassBytes(ByteClass
),
131 /// Match the start of a line or beginning of input.
133 /// Match the end of a line or end of input.
135 /// Match the beginning of input.
137 /// Match the end of input.
139 /// Match a word boundary (word character on one side and a non-word
140 /// character on the other).
142 /// Match a position that is not a word boundary (word or non-word
143 /// characters on both sides).
145 /// Match an ASCII word boundary.
147 /// Match a position that is not an ASCII word boundary.
148 NotWordBoundaryAscii
,
149 /// A group, possibly non-capturing.
151 /// The expression inside the group.
153 /// The capture index (starting at `1`) only for capturing groups.
155 /// The capture name, only for capturing named groups.
156 name
: Option
<String
>,
158 /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
160 /// The expression to be repeated. Limited to literals, `.`, classes
161 /// or grouped expressions.
163 /// The type of repeat operator used.
165 /// Whether the repeat is greedy (match the most) or not (match the
169 /// A concatenation of expressions. Must be matched one after the other.
171 /// N.B. A concat expression can only appear at the top-level or
172 /// immediately inside a group expression.
174 /// An alternation of expressions. Only one must match.
176 /// N.B. An alternate expression can only appear at the top-level or
177 /// immediately inside a group expression.
178 Alternate(Vec
<Expr
>),
181 type CaptureIndex
= Option
<usize>;
183 type CaptureName
= Option
<String
>;
185 /// The type of a repeat operator expression.
186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
188 /// Match zero or one (`?`).
190 /// Match zero or more (`*`).
192 /// Match one or more (`+`).
194 /// Match for at least `min` and at most `max` (`{m,n}`).
196 /// When `max` is `None`, there is no upper bound on the number of matches.
198 /// Lower bound on the number of matches.
200 /// Optional upper bound on the number of matches.
206 /// Returns true if and only if this repetition can match the empty string.
207 fn matches_empty(&self) -> bool
{
208 use self::Repeater
::*;
213 Range { min, .. }
=> min
== 0,
218 /// A character class.
220 /// A character class has a canonical format that the parser guarantees. Its
221 /// canonical format is defined by the following invariants:
223 /// 1. Given any Unicode scalar value, it is matched by *at most* one character
224 /// range in a canonical character class.
225 /// 2. Every adjacent character range is separated by at least one Unicode
227 /// 3. Given any pair of character ranges `r1` and `r2`, if
228 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
231 /// In sum, any `CharClass` produced by this crate's parser is a sorted
232 /// sequence of non-overlapping ranges. This makes it possible to test whether
233 /// a character is matched by a class with a binary search.
235 /// If the case insensitive flag was set when parsing a character class, then
236 /// simple case folding is done automatically. For example, `(?i)[a-c]` is
237 /// automatically translated to `[a-cA-C]`.
238 #[derive(Clone, Debug, PartialEq, Eq)]
239 pub struct CharClass
{
240 ranges
: Vec
<ClassRange
>,
243 /// A single inclusive range in a character class.
245 /// Since range boundaries are defined by Unicode scalar values, the boundaries
246 /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
247 /// *cover* codepoints that are not scalar values.
249 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
250 /// for testing whether a character is contained inside a given range.
251 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
252 pub struct ClassRange
{
253 /// The start character of the range.
255 /// This must be less than or equal to `end`.
258 /// The end character of the range.
260 /// This must be greater than or equal to `start`.
264 /// A byte class for byte ranges only.
266 /// A byte class has a canonical format that the parser guarantees. Its
267 /// canonical format is defined by the following invariants:
269 /// 1. Given any byte, it is matched by *at most* one byte range in a canonical
271 /// 2. Every adjacent byte range is separated by at least one byte.
272 /// 3. Given any pair of byte ranges `r1` and `r2`, if
273 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
276 /// In sum, any `ByteClass` produced by this crate's parser is a sorted
277 /// sequence of non-overlapping ranges. This makes it possible to test whether
278 /// a byte is matched by a class with a binary search.
280 /// If the case insensitive flag was set when parsing a character class,
281 /// then simple ASCII-only case folding is done automatically. For example,
282 /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`.
283 #[derive(Clone, Debug, PartialEq, Eq)]
284 pub struct ByteClass
{
285 ranges
: Vec
<ByteRange
>,
288 /// A single inclusive range in a byte class.
290 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
291 /// for testing whether a byte is contained inside a given range.
292 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
293 pub struct ByteRange
{
294 /// The start byte of the range.
296 /// This must be less than or equal to `end`.
299 /// The end byte of the range.
301 /// This must be greater than or equal to `end`.
305 /// A builder for configuring regular expression parsing.
307 /// This allows setting the default values of flags and other options, such
308 /// as the maximum nesting depth.
309 #[derive(Clone, Debug)]
310 pub struct ExprBuilder
{
316 /// Create a new builder for configuring expression parsing.
318 /// Note that all flags are disabled by default.
319 pub fn new() -> ExprBuilder
{
321 flags
: Flags
::default(),
326 /// Set the default value for the case insensitive (`i`) flag.
327 pub fn case_insensitive(mut self, yes
: bool
) -> ExprBuilder
{
328 self.flags
.casei
= yes
;
332 /// Set the default value for the multi-line matching (`m`) flag.
333 pub fn multi_line(mut self, yes
: bool
) -> ExprBuilder
{
334 self.flags
.multi
= yes
;
338 /// Set the default value for the any character (`s`) flag.
339 pub fn dot_matches_new_line(mut self, yes
: bool
) -> ExprBuilder
{
340 self.flags
.dotnl
= yes
;
344 /// Set the default value for the greedy swap (`U`) flag.
345 pub fn swap_greed(mut self, yes
: bool
) -> ExprBuilder
{
346 self.flags
.swap_greed
= yes
;
350 /// Set the default value for the ignore whitespace (`x`) flag.
351 pub fn ignore_whitespace(mut self, yes
: bool
) -> ExprBuilder
{
352 self.flags
.ignore_space
= yes
;
356 /// Set the default value for the Unicode (`u`) flag.
358 /// If `yes` is false, then `allow_bytes` is set to true.
359 pub fn unicode(mut self, yes
: bool
) -> ExprBuilder
{
360 self.flags
.unicode
= yes
;
362 self.allow_bytes(true)
368 /// Whether the parser allows matching arbitrary bytes or not.
370 /// When the `u` flag is disabled (either with this builder or in the
371 /// expression itself), the parser switches to interpreting the expression
372 /// as matching arbitrary bytes instead of Unicode codepoints. For example,
373 /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which
374 /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely,
375 /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8.
377 /// When `allow_bytes` is disabled (the default), an expression like
378 /// `(?-u:\xFF)` will cause the parser to return an error, since it would
379 /// otherwise match invalid UTF-8. When enabled, it will be allowed.
380 pub fn allow_bytes(mut self, yes
: bool
) -> ExprBuilder
{
381 self.flags
.allow_bytes
= yes
;
385 /// Set the nesting limit for regular expression parsing.
387 /// Regular expressions that nest more than this limit will result in a
388 /// `StackExhausted` error.
389 pub fn nest_limit(mut self, limit
: usize) -> ExprBuilder
{
390 self.nest_limit
= limit
;
394 /// Parse a string as a regular expression using the current configuraiton.
395 pub fn parse(self, s
: &str) -> Result
<Expr
> {
396 Parser
::parse(s
, self.flags
).and_then(|e
| e
.simplify(self.nest_limit
))
401 /// Parses a string in a regular expression syntax tree.
403 /// This is a convenience method for parsing an expression using the
404 /// default configuration. To tweak parsing options (such as which flags
405 /// are enabled by default), use the `ExprBuilder` type.
406 pub fn parse(s
: &str) -> Result
<Expr
> {
407 ExprBuilder
::new().parse(s
)
410 /// Returns true iff the expression can be repeated by a quantifier.
411 fn can_repeat(&self) -> bool
{
413 Literal{..}
| LiteralBytes{..}
414 | AnyChar
| AnyCharNoNL
| AnyByte
| AnyByteNoNL
415 | Class(_
) | ClassBytes(_
)
416 | StartLine
| EndLine
| StartText
| EndText
417 | WordBoundary
| NotWordBoundary
418 | WordBoundaryAscii
| NotWordBoundaryAscii
425 fn simplify(self, nest_limit
: usize) -> Result
<Expr
> {
426 fn combine_literals(es
: &mut Vec
<Expr
>, e
: Expr
) {
427 match (es
.pop(), e
) {
428 (None
, e
) => es
.push(e
),
429 (Some(Literal { chars: mut chars1, casei: casei1 }
),
430 Literal { chars: chars2, casei: casei2 }
) => {
431 if casei1
== casei2
{
432 chars1
.extend(chars2
);
433 es
.push(Literal { chars: chars1, casei: casei1 }
);
435 es
.push(Literal { chars: chars1, casei: casei1 }
);
436 es
.push(Literal { chars: chars2, casei: casei2 }
);
439 (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }
),
440 LiteralBytes { bytes: bytes2, casei: casei2 }
) => {
441 if casei1
== casei2
{
442 bytes1
.extend(bytes2
);
443 es
.push(LiteralBytes { bytes: bytes1, casei: casei1 }
);
445 es
.push(LiteralBytes { bytes: bytes1, casei: casei1 }
);
446 es
.push(LiteralBytes { bytes: bytes2, casei: casei2 }
);
455 fn simp(expr
: Expr
, recurse
: usize, limit
: usize) -> Result
<Expr
> {
459 surround
: "".to_owned(),
460 kind
: ErrorKind
::StackExhausted
,
463 let simplify
= |e
| simp(e
, recurse
+ 1, limit
);
465 Repeat { e, r, greedy }
=> Repeat
{
466 e
: Box
::new(try
!(simplify(*e
))),
470 Group { e, i, name }
=> {
471 let e
= try
!(simplify(*e
));
472 if i
.is_none() && name
.is_none() && e
.can_repeat() {
475 Group { e: Box::new(e), i: i, name: name }
479 let mut new_es
= Vec
::with_capacity(es
.len());
481 combine_literals(&mut new_es
, try
!(simplify(e
)));
483 if new_es
.len() == 1 {
484 new_es
.pop().unwrap()
490 let mut new_es
= Vec
::with_capacity(es
.len());
492 new_es
.push(try
!(simplify(e
)));
499 simp(self, 0, nest_limit
)
502 /// Returns a set of literal prefixes extracted from this expression.
503 pub fn prefixes(&self) -> Literals
{
504 let mut lits
= Literals
::empty();
505 lits
.union_prefixes(self);
509 /// Returns a set of literal suffixes extracted from this expression.
510 pub fn suffixes(&self) -> Literals
{
511 let mut lits
= Literals
::empty();
512 lits
.union_suffixes(self);
516 /// Returns true if and only if the expression is required to match from
517 /// the beginning of text.
518 pub fn is_anchored_start(&self) -> bool
{
520 Repeat { ref e, r, .. }
=> {
521 !r
.matches_empty() && e
.is_anchored_start()
523 Group { ref e, .. }
=> e
.is_anchored_start(),
524 Concat(ref es
) => es
[0].is_anchored_start(),
525 Alternate(ref es
) => es
.iter().all(|e
| e
.is_anchored_start()),
531 /// Returns true if and only if the expression has at least one matchable
532 /// sub-expression that must match the beginning of text.
533 pub fn has_anchored_start(&self) -> bool
{
535 Repeat { ref e, r, .. }
=> {
536 !r
.matches_empty() && e
.has_anchored_start()
538 Group { ref e, .. }
=> e
.has_anchored_start(),
539 Concat(ref es
) => es
[0].has_anchored_start(),
540 Alternate(ref es
) => es
.iter().any(|e
| e
.has_anchored_start()),
546 /// Returns true if and only if the expression is required to match at the
548 pub fn is_anchored_end(&self) -> bool
{
550 Repeat { ref e, r, .. }
=> {
551 !r
.matches_empty() && e
.is_anchored_end()
553 Group { ref e, .. }
=> e
.is_anchored_end(),
554 Concat(ref es
) => es
[es
.len() - 1].is_anchored_end(),
555 Alternate(ref es
) => es
.iter().all(|e
| e
.is_anchored_end()),
561 /// Returns true if and only if the expression has at least one matchable
562 /// sub-expression that must match the beginning of text.
563 pub fn has_anchored_end(&self) -> bool
{
565 Repeat { ref e, r, .. }
=> {
566 !r
.matches_empty() && e
.has_anchored_end()
568 Group { ref e, .. }
=> e
.has_anchored_end(),
569 Concat(ref es
) => es
[es
.len() - 1].has_anchored_end(),
570 Alternate(ref es
) => es
.iter().any(|e
| e
.has_anchored_end()),
576 /// Returns true if and only if the expression contains sub-expressions
577 /// that can match arbitrary bytes.
578 pub fn has_bytes(&self) -> bool
{
580 Repeat { ref e, .. }
=> e
.has_bytes(),
581 Group { ref e, .. }
=> e
.has_bytes(),
582 Concat(ref es
) => es
.iter().any(|e
| e
.has_bytes()),
583 Alternate(ref es
) => es
.iter().any(|e
| e
.has_bytes()),
584 LiteralBytes{..}
=> true,
585 AnyByte
| AnyByteNoNL
=> true,
586 ClassBytes(_
) => true,
587 WordBoundaryAscii
| NotWordBoundaryAscii
=> true,
593 impl Deref
for CharClass
{
594 type Target
= Vec
<ClassRange
>;
595 fn deref(&self) -> &Vec
<ClassRange
> { &self.ranges }
598 impl IntoIterator
for CharClass
{
599 type Item
= ClassRange
;
600 type IntoIter
= vec
::IntoIter
<ClassRange
>;
601 fn into_iter(self) -> vec
::IntoIter
<ClassRange
> { self.ranges.into_iter() }
604 impl<'a
> IntoIterator
for &'a CharClass
{
605 type Item
= &'a ClassRange
;
606 type IntoIter
= slice
::Iter
<'a
, ClassRange
>;
607 fn into_iter(self) -> slice
::Iter
<'a
, ClassRange
> { self.iter() }
611 /// Create a new class from an existing set of ranges.
612 pub fn new(ranges
: Vec
<ClassRange
>) -> CharClass
{
613 CharClass { ranges: ranges }
616 /// Create an empty class.
617 fn empty() -> CharClass
{
618 CharClass
::new(Vec
::new())
621 /// Returns true if `c` is matched by this character class.
622 pub fn matches(&self, c
: char) -> bool
{
623 self.binary_search_by(|range
| c
.partial_cmp(range
).unwrap()).is_ok()
626 /// Removes the given character from the class if it exists.
628 /// Note that this takes `O(n)` time in the number of ranges.
629 pub fn remove(&mut self, c
: char) {
630 let mut i
= match self.binary_search_by(|r
| c
.partial_cmp(r
).unwrap()) {
634 let mut r
= self.ranges
.remove(i
);
636 r
.start
= inc_char(c
);
637 if r
.start
> r
.end
|| c
== char::MAX
{
640 self.ranges
.insert(i
, r
);
641 } else if r
.end
== c
{
643 if r
.end
< r
.start
|| c
== '
\x00'
{
646 self.ranges
.insert(0, r
);
648 let (mut r1
, mut r2
) = (r
.clone(), r
.clone());
649 r1
.end
= dec_char(c
);
650 if r1
.start
<= r1
.end
{
651 self.ranges
.insert(i
, r1
);
654 r2
.start
= inc_char(c
);
655 if r2
.start
<= r2
.end
{
656 self.ranges
.insert(i
, r2
);
661 /// Create a new empty class from this one.
662 fn to_empty(&self) -> CharClass
{
663 CharClass { ranges: Vec::with_capacity(self.len()) }
666 /// Create a byte class from this character class.
668 /// Codepoints above 0xFF are removed.
669 fn to_byte_class(self) -> ByteClass
{
671 self.ranges
.into_iter()
672 .filter_map(|r
| r
.to_byte_range())
673 .collect()).canonicalize()
676 /// Merge two classes and canonicalize them.
678 fn merge(mut self, other
: CharClass
) -> CharClass
{
679 self.ranges
.extend(other
);
683 /// Canonicalze any sequence of ranges.
685 /// This is responsible for enforcing the canonical format invariants
686 /// as described on the docs for the `CharClass` type.
687 fn canonicalize(mut self) -> CharClass
{
688 // TODO: Save some cycles here by checking if already canonicalized.
690 let mut ordered
= self.to_empty(); // TODO: Do this in place?
691 for candidate
in self {
692 // If the candidate overlaps with an existing range, then it must
693 // be the most recent range added because we process the candidates
695 if let Some(or
) = ordered
.ranges
.last_mut() {
696 if or
.overlapping(candidate
) {
697 *or
= or
.merge(candidate
);
701 ordered
.ranges
.push(candidate
);
706 /// Negates the character class.
708 /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
709 /// if and only if `c` does not match `self.negate()`.
710 pub fn negate(mut self) -> CharClass
{
711 fn range(s
: char, e
: char) -> ClassRange { ClassRange::new(s, e) }
714 // Inverting an empty range yields all of Unicode.
716 ranges
: vec
![ClassRange { start: '\x00', end: '\u{10ffff}'
}],
719 self = self.canonicalize();
720 let mut inv
= self.to_empty();
721 if self[0].start
> '
\x00'
{
722 inv
.ranges
.push(range('
\x00'
, dec_char(self[0].start
)));
724 for win
in self.windows(2) {
725 inv
.ranges
.push(range(inc_char(win
[0].end
),
726 dec_char(win
[1].start
)));
728 if self[self.len() - 1].end
< char::MAX
{
729 inv
.ranges
.push(range(inc_char(self[self.len() - 1].end
),
735 /// Apply case folding to this character class.
737 /// N.B. Applying case folding to a negated character class probably
738 /// won't produce the expected result. e.g., `(?i)[^x]` really should
739 /// match any character sans `x` and `X`, but if `[^x]` is negated
740 /// before being case folded, you'll end up matching any character.
741 pub fn case_fold(self) -> CharClass
{
742 let mut folded
= self.to_empty();
744 // Applying case folding to a range is expensive because *every*
745 // character needs to be examined. Thus, we avoid that drudgery
746 // if no character in the current range is in our case folding
748 if r
.needs_case_folding() {
749 folded
.ranges
.extend(r
.case_fold());
751 folded
.ranges
.push(r
);
753 folded
.canonicalize()
756 /// Returns the number of characters that match this class.
757 fn num_chars(&self) -> usize {
759 .map(|&r
| 1 + (r
.end
as u32) - (r
.start
as u32))
760 .fold(0, |acc
, len
| acc
+ len
)
766 /// Create a new class range.
768 /// If `end < start`, then the two values are swapped so that
769 /// the invariant `start <= end` is preserved.
770 fn new(start
: char, end
: char) -> ClassRange
{
772 ClassRange { start: start, end: end }
774 ClassRange { start: end, end: start }
778 /// Translate this to a byte class.
780 /// If the start codepoint exceeds 0xFF, then this returns `None`.
782 /// If the end codepoint exceeds 0xFF, then it is set to 0xFF.
783 fn to_byte_range(self) -> Option
<ByteRange
> {
784 if self.start
> '
\u{FF}'
{
787 let s
= self.start
as u8;
788 let e
= min('
\u{FF}'
, self.end
) as u8;
789 Some(ByteRange
::new(s
, e
))
793 /// Create a range of one character.
794 fn one(c
: char) -> ClassRange
{
795 ClassRange { start: c, end: c }
798 /// Returns true if and only if the two ranges are overlapping. Note that
799 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
800 fn overlapping(self, other
: ClassRange
) -> bool
{
801 max(self.start
, other
.start
) <= inc_char(min(self.end
, other
.end
))
804 /// Creates a new range representing the union of `self` and `other.
805 fn merge(self, other
: ClassRange
) -> ClassRange
{
807 start
: min(self.start
, other
.start
),
808 end
: max(self.end
, other
.end
),
812 /// Returns true if and only if this range contains a character that is
813 /// in the case folding table.
814 fn needs_case_folding(self) -> bool
{
815 case_folding
::C_plus_S_both_table
816 .binary_search_by(|&(c
, _
)| self.partial_cmp(&c
).unwrap()).is_ok()
819 /// Apply case folding to this range.
821 /// Since case folding might add characters such that the range is no
822 /// longer contiguous, this returns multiple class ranges. They are in
824 fn case_fold(self) -> Vec
<ClassRange
> {
825 let table
= &case_folding
::C_plus_S_both_table
;
826 let (s
, e
) = (self.start
as u32, self.end
as u32 + 1);
827 let mut start
= self.start
;
829 let mut next_case_fold
= '
\x00'
;
830 let mut ranges
= Vec
::with_capacity(10);
831 for mut c
in (s
..e
).filter_map(char::from_u32
) {
832 if c
>= next_case_fold
{
833 c
= match simple_case_fold_both_result(c
) {
835 for &(c1
, c2
) in &table
[i
..] {
839 if c2
!= inc_char(end
) {
840 ranges
.push(ClassRange
::new(start
, end
));
849 next_case_fold
= table
[i
].0;
851 next_case_fold
= '
\u{10FFFF}'
;
857 // The fast path. We know this character doesn't have an entry
858 // in the case folding table.
859 if c
!= inc_char(end
) {
860 ranges
.push(ClassRange
::new(start
, end
));
865 ranges
.push(ClassRange
::new(start
, end
));
870 impl PartialEq
<char> for ClassRange
{
872 fn eq(&self, other
: &char) -> bool
{
873 self.start
<= *other
&& *other
<= self.end
877 impl PartialEq
<ClassRange
> for char {
879 fn eq(&self, other
: &ClassRange
) -> bool
{
884 impl PartialOrd
<char> for ClassRange
{
886 fn partial_cmp(&self, other
: &char) -> Option
<Ordering
> {
887 Some(if self == other
{
889 } else if *other
> self.end
{
897 impl PartialOrd
<ClassRange
> for char {
899 fn partial_cmp(&self, other
: &ClassRange
) -> Option
<Ordering
> {
900 other
.partial_cmp(self).map(|o
| o
.reverse())
905 /// Create a new class from an existing set of ranges.
906 pub fn new(ranges
: Vec
<ByteRange
>) -> ByteClass
{
907 ByteClass { ranges: ranges }
910 /// Returns true if `b` is matched by this byte class.
911 pub fn matches(&self, b
: u8) -> bool
{
912 self.binary_search_by(|range
| b
.partial_cmp(range
).unwrap()).is_ok()
915 /// Removes the given byte from the class if it exists.
917 /// Note that this takes `O(n)` time in the number of ranges.
918 pub fn remove(&mut self, b
: u8) {
919 let mut i
= match self.binary_search_by(|r
| b
.partial_cmp(r
).unwrap()) {
923 let mut r
= self.ranges
.remove(i
);
925 r
.start
= b
.saturating_add(1);
926 if r
.start
> r
.end
|| b
== u8::MAX
{
929 self.ranges
.insert(i
, r
);
930 } else if r
.end
== b
{
931 r
.end
= b
.saturating_sub(1);
932 if r
.end
< r
.start
|| b
== b'
\x00'
{
935 self.ranges
.insert(0, r
);
937 let (mut r1
, mut r2
) = (r
.clone(), r
.clone());
938 r1
.end
= b
.saturating_sub(1);
939 if r1
.start
<= r1
.end
{
940 self.ranges
.insert(i
, r1
);
943 r2
.start
= b
.saturating_add(1);
944 if r2
.start
<= r2
.end
{
945 self.ranges
.insert(i
, r2
);
950 /// Create a new empty class from this one.
951 fn to_empty(&self) -> ByteClass
{
952 ByteClass { ranges: Vec::with_capacity(self.len()) }
955 /// Canonicalze any sequence of ranges.
957 /// This is responsible for enforcing the canonical format invariants
958 /// as described on the docs for the `ByteClass` type.
959 fn canonicalize(mut self) -> ByteClass
{
960 // TODO: Save some cycles here by checking if already canonicalized.
962 let mut ordered
= self.to_empty(); // TODO: Do this in place?
963 for candidate
in self {
964 // If the candidate overlaps with an existing range, then it must
965 // be the most recent range added because we process the candidates
967 if let Some(or
) = ordered
.ranges
.last_mut() {
968 if or
.overlapping(candidate
) {
969 *or
= or
.merge(candidate
);
973 ordered
.ranges
.push(candidate
);
978 /// Negates the byte class.
980 /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b`
981 /// does not match `self.negate()`.
982 pub fn negate(mut self) -> ByteClass
{
983 fn range(s
: u8, e
: u8) -> ByteRange { ByteRange::new(s, e) }
986 // Inverting an empty range yields all bytes.
988 ranges
: vec
![ByteRange { start: b'\x00', end: b'\xff' }
],
991 self = self.canonicalize();
992 let mut inv
= self.to_empty();
993 if self[0].start
> b'
\x00'
{
994 inv
.ranges
.push(range(b'
\x00'
, self[0].start
.saturating_sub(1)));
996 for win
in self.windows(2) {
997 inv
.ranges
.push(range(win
[0].end
.saturating_add(1),
998 win
[1].start
.saturating_sub(1)));
1000 if self[self.len() - 1].end
< u8::MAX
{
1001 inv
.ranges
.push(range(self[self.len() - 1].end
.saturating_add(1),
1007 /// Apply case folding to this byte class.
1009 /// This assumes that the bytes in the ranges are ASCII compatible.
1011 /// N.B. Applying case folding to a negated character class probably
1012 /// won't produce the expected result. e.g., `(?i)[^x]` really should
1013 /// match any character sans `x` and `X`, but if `[^x]` is negated
1014 /// before being case folded, you'll end up matching any character.
1015 pub fn case_fold(self) -> ByteClass
{
1016 let mut folded
= self.to_empty();
1018 folded
.ranges
.extend(r
.case_fold());
1020 folded
.canonicalize()
1023 /// Returns the number of bytes that match this class.
1024 fn num_bytes(&self) -> usize {
1026 .map(|&r
| 1 + (r
.end
as u32) - (r
.start
as u32))
1027 .fold(0, |acc
, len
| acc
+ len
)
1033 /// Create a new class range.
1035 /// If `end < start`, then the two values are swapped so that
1036 /// the invariant `start <= end` is preserved.
1037 fn new(start
: u8, end
: u8) -> ByteRange
{
1039 ByteRange { start: start, end: end }
1041 ByteRange { start: end, end: start }
1045 /// Returns true if and only if the two ranges are overlapping. Note that
1046 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
1047 fn overlapping(self, other
: ByteRange
) -> bool
{
1048 max(self.start
, other
.start
)
1049 <= min(self.end
, other
.end
).saturating_add(1)
1052 /// Returns true if and only if the intersection of self and other is non
1054 fn is_intersect_empty(self, other
: ByteRange
) -> bool
{
1055 max(self.start
, other
.start
) > min(self.end
, other
.end
)
1058 /// Creates a new range representing the union of `self` and `other.
1059 fn merge(self, other
: ByteRange
) -> ByteRange
{
1061 start
: min(self.start
, other
.start
),
1062 end
: max(self.end
, other
.end
),
1066 /// Apply case folding to this range.
1068 /// Since case folding might add bytes such that the range is no
1069 /// longer contiguous, this returns multiple byte ranges.
1071 /// This assumes that the bytes in this range are ASCII compatible.
1072 fn case_fold(self) -> Vec
<ByteRange
> {
1073 // So much easier than Unicode case folding!
1074 let mut ranges
= vec
![self];
1075 if !ByteRange
::new(b'a'
, b'z'
).is_intersect_empty(self) {
1076 let lower
= max(self.start
, b'a'
);
1077 let upper
= min(self.end
, b'z'
);
1078 ranges
.push(ByteRange
::new(lower
- 32, upper
- 32));
1080 if !ByteRange
::new(b'A'
, b'Z'
).is_intersect_empty(self) {
1081 let lower
= max(self.start
, b'A'
);
1082 let upper
= min(self.end
, b'Z'
);
1083 ranges
.push(ByteRange
::new(lower
+ 32, upper
+ 32));
1089 impl Deref
for ByteClass
{
1090 type Target
= Vec
<ByteRange
>;
1091 fn deref(&self) -> &Vec
<ByteRange
> { &self.ranges }
1094 impl IntoIterator
for ByteClass
{
1095 type Item
= ByteRange
;
1096 type IntoIter
= vec
::IntoIter
<ByteRange
>;
1097 fn into_iter(self) -> vec
::IntoIter
<ByteRange
> { self.ranges.into_iter() }
1100 impl<'a
> IntoIterator
for &'a ByteClass
{
1101 type Item
= &'a ByteRange
;
1102 type IntoIter
= slice
::Iter
<'a
, ByteRange
>;
1103 fn into_iter(self) -> slice
::Iter
<'a
, ByteRange
> { self.iter() }
1106 impl PartialEq
<u8> for ByteRange
{
1108 fn eq(&self, other
: &u8) -> bool
{
1109 self.start
<= *other
&& *other
<= self.end
1113 impl PartialEq
<ByteRange
> for u8 {
1115 fn eq(&self, other
: &ByteRange
) -> bool
{
1120 impl PartialOrd
<u8> for ByteRange
{
1122 fn partial_cmp(&self, other
: &u8) -> Option
<Ordering
> {
1123 Some(if self == other
{
1125 } else if *other
> self.end
{
1133 impl PartialOrd
<ByteRange
> for u8 {
1135 fn partial_cmp(&self, other
: &ByteRange
) -> Option
<Ordering
> {
1136 other
.partial_cmp(self).map(|o
| o
.reverse())
1140 /// This implementation of `Display` will write a regular expression from the
1141 /// syntax tree. It does not write the original string parsed.
1142 impl fmt
::Display
for Expr
{
1143 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1145 Empty
=> write
!(f
, ""),
1146 Literal { ref chars, casei }
=> {
1148 try
!(write
!(f
, "(?iu:"));
1150 try
!(write
!(f
, "(?u:"));
1153 try
!(write
!(f
, "{}", quote_char(c
)));
1155 try
!(write
!(f
, ")"));
1158 LiteralBytes { ref bytes, casei }
=> {
1160 try
!(write
!(f
, "(?i-u:"));
1162 try
!(write
!(f
, "(?-u:"));
1165 try
!(write
!(f
, "{}", quote_byte(b
)));
1167 try
!(write
!(f
, ")"));
1170 AnyChar
=> write
!(f
, "(?su:.)"),
1171 AnyCharNoNL
=> write
!(f
, "(?u:.)"),
1172 AnyByte
=> write
!(f
, "(?s-u:.)"),
1173 AnyByteNoNL
=> write
!(f
, "(?-u:.)"),
1174 Class(ref cls
) => write
!(f
, "{}", cls
),
1175 ClassBytes(ref cls
) => write
!(f
, "{}", cls
),
1176 StartLine
=> write
!(f
, "(?m:^)"),
1177 EndLine
=> write
!(f
, "(?m:$)"),
1178 StartText
=> write
!(f
, r
"^"),
1179 EndText
=> write
!(f
, r
"$"),
1180 WordBoundary
=> write
!(f
, r
"(?u:\b)"),
1181 NotWordBoundary
=> write
!(f
, r
"(?u:\B)"),
1182 WordBoundaryAscii
=> write
!(f
, r
"(?-u:\b)"),
1183 NotWordBoundaryAscii
=> write
!(f
, r
"(?-u:\B)"),
1184 Group { ref e, i: None, name: None }
=> write
!(f
, "(?:{})", e
),
1185 Group { ref e, name: None, .. }
=> write
!(f
, "({})", e
),
1186 Group { ref e, name: Some(ref n), .. }
=> {
1187 write
!(f
, "(?P<{}>{})", n
, e
)
1189 Repeat { ref e, r, greedy }
=> {
1191 &Literal { ref chars, .. }
if chars
.len() > 1 => {
1192 try
!(write
!(f
, "(?:{}){}", e
, r
))
1194 _
=> try
!(write
!(f
, "{}{}", e
, r
)),
1196 if !greedy { try!(write!(f, "?")); }
1201 try
!(write
!(f
, "{}", e
));
1205 Alternate(ref es
) => {
1206 for (i
, e
) in es
.iter().enumerate() {
1207 if i
> 0 { try!(write!(f, "|")); }
1208 try
!(write
!(f
, "{}", e
));
1216 impl fmt
::Display
for Repeater
{
1217 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1219 ZeroOrOne
=> write
!(f
, "?"),
1220 ZeroOrMore
=> write
!(f
, "*"),
1221 OneOrMore
=> write
!(f
, "+"),
1222 Range { min: s, max: None }
=> write
!(f
, "{{{},}}", s
),
1223 Range { min: s, max: Some(e) }
if s
== e
=> write
!(f
, "{{{}}}", s
),
1224 Range { min: s, max: Some(e) }
=> write
!(f
, "{{{}, {}}}", s
, e
),
1229 impl fmt
::Display
for CharClass
{
1230 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1231 try
!(write
!(f
, "(?u:["));
1232 for range
in self.iter() {
1233 if range
.start
== '
-'
|| range
.end
== '
-'
{
1234 try
!(write
!(f
, "-"));
1238 for range
in self.iter() {
1239 let mut range
= *range
;
1240 if range
.start
== '
-'
{
1241 range
.start
= ((range
.start
as u8) + 1) as char;
1243 if range
.end
== '
-'
{
1244 range
.end
= ((range
.end
as u8) - 1) as char;
1246 if range
.start
> range
.end
{
1249 try
!(write
!(f
, "{}", range
));
1251 try
!(write
!(f
, "])"));
1256 impl fmt
::Display
for ClassRange
{
1257 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1258 write
!(f
, "{}-{}", quote_char(self.start
), quote_char(self.end
))
1262 impl fmt
::Display
for ByteClass
{
1263 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1264 try
!(write
!(f
, "(?-u:["));
1265 for range
in self.iter() {
1266 if range
.start
== b'
-'
|| range
.end
== b'
-'
{
1267 try
!(write
!(f
, "-"));
1271 for range
in self.iter() {
1272 let mut range
= *range
;
1273 if range
.start
== b'
-'
{
1276 if range
.end
== b'
-'
{
1279 if range
.start
> range
.end
{
1282 try
!(write
!(f
, "{}", range
));
1284 try
!(write
!(f
, "])"));
1289 impl fmt
::Display
for ByteRange
{
1290 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1291 write
!(f
, "{}-{}", quote_byte(self.start
), quote_byte(self.end
))
1295 /// An alias for computations that can return a `Error`.
1296 pub type Result
<T
> = ::std
::result
::Result
<T
, Error
>;
1300 /// This includes details about the specific type of error and a rough
1301 /// approximation of where it occurred.
1302 #[derive(Clone, Debug, PartialEq)]
1309 /// The specific type of parse error that can occur.
1310 #[derive(Clone, Debug, PartialEq)]
1311 pub enum ErrorKind
{
1312 /// A negation symbol is used twice in flag settings.
1313 /// e.g., `(?-i-s)`.
1315 /// The same capture name was used more than once.
1316 /// e.g., `(?P<a>.)(?P<a>.)`.
1317 DuplicateCaptureName(String
),
1318 /// An alternate is empty. e.g., `(|a)`.
1320 /// A capture group name is empty. e.g., `(?P<>a)`.
1322 /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
1324 /// A group is empty. e.g., `()`.
1326 /// An invalid number was used in a counted repetition. e.g., `a{b}`.
1327 InvalidBase10(String
),
1328 /// An invalid hexadecimal number was used in an escape sequence.
1330 InvalidBase16(String
),
1331 /// An invalid capture name was used. e.g., `(?P<0a>b)`.
1332 InvalidCaptureName(String
),
1333 /// An invalid class range was givien. Specifically, when the start of the
1334 /// range is greater than the end. e.g., `[z-a]`.
1336 /// The first character specified in the range.
1338 /// The second character specified in the range.
1341 /// An escape sequence was used in a character class where it is not
1342 /// allowed. e.g., `[a-\pN]` or `[\A]`.
1343 InvalidClassEscape(Expr
),
1344 /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
1345 InvalidRepeatRange
{
1346 /// The first number specified in the repetition.
1348 /// The second number specified in the repetition.
1351 /// An invalid Unicode scalar value was used in a long hexadecimal
1352 /// sequence. e.g., `\x{D800}`.
1353 InvalidScalarValue(u32),
1354 /// An empty counted repetition operator. e.g., `a{}`.
1356 /// A repetition operator was not applied to an expression. e.g., `*`.
1357 RepeaterExpectsExpr
,
1358 /// A repetition operator was applied to an expression that cannot be
1359 /// repeated. e.g., `a+*` or `a|*`.
1360 RepeaterUnexpectedExpr(Expr
),
1361 /// A capture group name that is never closed. e.g., `(?P<a`.
1362 UnclosedCaptureName(String
),
1363 /// An unclosed hexadecimal literal. e.g., `\x{a`.
1365 /// An unclosed parenthesis. e.g., `(a`.
1367 /// An unclosed counted repetition operator. e.g., `a{2`.
1369 /// An unclosed named Unicode class. e.g., `\p{Yi`.
1370 UnclosedUnicodeName
,
1371 /// Saw end of regex before class was closed. e.g., `[a`.
1373 /// Saw end of regex before escape sequence was closed. e.g., `\`.
1374 UnexpectedEscapeEof
,
1375 /// Saw end of regex before flags were closed. e.g., `(?i`.
1377 /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
1378 UnexpectedTwoDigitHexEof
,
1379 /// Unopened parenthesis. e.g., `)`.
1381 /// Unrecognized escape sequence. e.g., `\q`.
1382 UnrecognizedEscape(char),
1383 /// Unrecognized flag. e.g., `(?a)`.
1384 UnrecognizedFlag(char),
1385 /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
1386 UnrecognizedUnicodeClass(String
),
1387 /// Indicates that the regex uses too much nesting.
1389 /// (N.B. This error exists because traversing the Expr is recursive and
1390 /// an explicit heap allocated stack is not (yet?) used. Regardless, some
1391 /// sort of limit must be applied to avoid unbounded memory growth.
1393 /// A disallowed flag was found (e.g., `u`).
1394 FlagNotAllowed(char),
1395 /// A Unicode class was used when the Unicode (`u`) flag was disabled.
1397 /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes.
1398 /// This never returned if the parser is permitted to allow expressions
1399 /// that match arbitrary bytes.
1401 /// A character class was constructed such that it is empty.
1402 /// e.g., `[^\d\D]`.
1404 /// Hints that destructuring should not be exhaustive.
1406 /// This enum may grow additional variants, so this makes sure clients
1407 /// don't count on exhaustive matching. (Otherwise, adding a new variant
1408 /// could break existing code.)
1414 /// Returns an approximate *character* offset at which the error occurred.
1416 /// The character offset may be equal to the number of characters in the
1417 /// string, in which case it should be interpreted as pointing to the end
1419 pub fn position(&self) -> usize {
1423 /// Returns the type of the regex parse error.
1424 pub fn kind(&self) -> &ErrorKind
{
1430 fn description(&self) -> &str {
1433 DoubleFlagNegation
=> "double flag negation",
1434 DuplicateCaptureName(_
) => "duplicate capture name",
1435 EmptyAlternate
=> "empty alternate",
1436 EmptyCaptureName
=> "empty capture name",
1437 EmptyFlagNegation
=> "flag negation without any flags",
1438 EmptyGroup
=> "empty group (e.g., '()')",
1439 InvalidBase10(_
) => "invalid base 10 number",
1440 InvalidBase16(_
) => "invalid base 16 number",
1441 InvalidCaptureName(_
) => "invalid capture name",
1442 InvalidClassRange{..}
=> "invalid character class range",
1443 InvalidClassEscape(_
) => "invalid escape sequence in class",
1444 InvalidRepeatRange{..}
=> "invalid counted repetition range",
1445 InvalidScalarValue(_
) => "invalid Unicode scalar value",
1446 MissingBase10
=> "missing count in repetition operator",
1447 RepeaterExpectsExpr
=> "repetition operator missing expression",
1448 RepeaterUnexpectedExpr(_
) => "expression cannot be repeated",
1449 UnclosedCaptureName(_
) => "unclosed capture group name",
1450 UnclosedHex
=> "unclosed hexadecimal literal",
1451 UnclosedParen
=> "unclosed parenthesis",
1452 UnclosedRepeat
=> "unclosed counted repetition operator",
1453 UnclosedUnicodeName
=> "unclosed Unicode class literal",
1454 UnexpectedClassEof
=> "unexpected EOF in character class",
1455 UnexpectedEscapeEof
=> "unexpected EOF in escape sequence",
1456 UnexpectedFlagEof
=> "unexpected EOF in flags",
1457 UnexpectedTwoDigitHexEof
=> "unexpected EOF in hex literal",
1458 UnopenedParen
=> "unopened parenthesis",
1459 UnrecognizedEscape(_
) => "unrecognized escape sequence",
1460 UnrecognizedFlag(_
) => "unrecognized flag",
1461 UnrecognizedUnicodeClass(_
) => "unrecognized Unicode class name",
1462 StackExhausted
=> "stack exhausted, too much nesting",
1463 FlagNotAllowed(_
) => "flag not allowed",
1464 UnicodeNotAllowed
=> "Unicode features not allowed",
1465 InvalidUtf8
=> "matching arbitrary bytes is not allowed",
1466 EmptyClass
=> "empty character class",
1467 __Nonexhaustive
=> unreachable
!(),
1472 impl ::std
::error
::Error
for Error
{
1473 fn description(&self) -> &str {
1474 self.kind
.description()
1478 impl fmt
::Display
for Error
{
1479 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1480 if let ErrorKind
::StackExhausted
= self.kind
{
1481 write
!(f
, "Error parsing regex: {}", self.kind
)
1484 f
, "Error parsing regex near '{}' at character offset {}: {}",
1485 self.surround
, self.pos
, self.kind
)
1490 impl fmt
::Display
for ErrorKind
{
1491 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
1494 DoubleFlagNegation
=>
1495 write
!(f
, "Only one negation symbol is allowed in flags."),
1496 DuplicateCaptureName(ref s
) =>
1497 write
!(f
, "Capture name '{}' is used more than once.", s
),
1499 write
!(f
, "Alternations cannot be empty."),
1501 write
!(f
, "Capture names cannot be empty."),
1502 EmptyFlagNegation
=>
1503 write
!(f
, "Flag negation requires setting at least one flag."),
1505 write
!(f
, "Empty regex groups (e.g., '()') are not allowed."),
1506 InvalidBase10(ref s
) =>
1507 write
!(f
, "Not a valid base 10 number: '{}'", s
),
1508 InvalidBase16(ref s
) =>
1509 write
!(f
, "Not a valid base 16 number: '{}'", s
),
1510 InvalidCaptureName(ref s
) =>
1511 write
!(f
, "Invalid capture name: '{}'. Capture names must \
1512 consist of [_a-zA-Z0-9] and are not allowed to \
1513 start with with a number.", s
),
1514 InvalidClassRange { start, end }
=>
1515 write
!(f
, "Invalid character class range '{}-{}'. \
1516 Character class ranges must start with the smaller \
1517 character, but {} > {}", start
, end
, start
, end
),
1518 InvalidClassEscape(ref e
) =>
1519 write
!(f
, "Invalid escape sequence in character \
1521 InvalidRepeatRange { min, max }
=>
1522 write
!(f
, "Invalid counted repetition range: {{{}, {}}}. \
1523 Counted repetition ranges must start with the \
1524 minimum, but {} > {}", min
, max
, min
, max
),
1525 InvalidScalarValue(c
) =>
1526 write
!(f
, "Number does not correspond to a Unicode scalar \
1529 write
!(f
, "Missing maximum in counted reptition operator."),
1530 RepeaterExpectsExpr
=>
1531 write
!(f
, "Missing expression for reptition operator."),
1532 RepeaterUnexpectedExpr(ref e
) =>
1533 write
!(f
, "Invalid application of reptition operator to: \
1535 UnclosedCaptureName(ref s
) =>
1536 write
!(f
, "Capture name group for '{}' is not closed. \
1537 (Missing a '>'.)", s
),
1539 write
!(f
, "Unclosed hexadecimal literal (missing a '}}')."),
1541 write
!(f
, "Unclosed parenthesis."),
1543 write
!(f
, "Unclosed counted repetition (missing a '}}')."),
1544 UnclosedUnicodeName
=>
1545 write
!(f
, "Unclosed Unicode literal (missing a '}}')."),
1546 UnexpectedClassEof
=>
1547 write
!(f
, "Character class was not closed before the end of \
1548 the regex (missing a ']')."),
1549 UnexpectedEscapeEof
=>
1550 write
!(f
, "Started an escape sequence that didn't finish \
1551 before the end of the regex."),
1552 UnexpectedFlagEof
=>
1553 write
!(f
, "Inline flag settings was not closed before the end \
1554 of the regex (missing a ')' or ':')."),
1555 UnexpectedTwoDigitHexEof
=>
1556 write
!(f
, "Unexpected end of two digit hexadecimal literal."),
1558 write
!(f
, "Unopened parenthesis."),
1559 UnrecognizedEscape(c
) =>
1560 write
!(f
, "Unrecognized escape sequence: '\\{}'.", c
),
1561 UnrecognizedFlag(c
) =>
1562 write
!(f
, "Unrecognized flag: '{}'. \
1563 (Allowed flags: i, m, s, U, u, x.)", c
),
1564 UnrecognizedUnicodeClass(ref s
) =>
1565 write
!(f
, "Unrecognized Unicode class name: '{}'.", s
),
1567 write
!(f
, "Exhausted space required to parse regex with too \
1569 FlagNotAllowed(flag
) =>
1570 write
!(f
, "Use of the flag '{}' is not allowed.", flag
),
1571 UnicodeNotAllowed
=>
1572 write
!(f
, "Unicode features are not allowed when the Unicode \
1573 (u) flag is not set."),
1575 write
!(f
, "Matching arbitrary bytes is not allowed."),
1577 write
!(f
, "Empty character classes are not allowed."),
1578 __Nonexhaustive
=> unreachable
!(),
1583 /// The result of binary search on the simple case folding table.
1585 /// Note that this binary search is done on the "both" table, such that
1586 /// the index returned corresponds to the *first* location of `c1` in the
1587 /// table. The table can then be scanned linearly starting from the position
1588 /// returned to find other case mappings for `c1`.
1589 fn simple_case_fold_both_result(c1
: char) -> result
::Result
<usize, usize> {
1590 let table
= &case_folding
::C_plus_S_both_table
;
1591 let i
= binary_search(table
, |&(c2
, _
)| c1
<= c2
);
1592 if i
>= table
.len() || table
[i
].0 != c1
{
1599 /// Binary search to find first element such that `pred(T) == true`.
1601 /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`.
1603 /// If all elements yield `pred(T) == false`, then `xs.len()` is returned.
1604 fn binary_search
<T
, F
>(xs
: &[T
], mut pred
: F
) -> usize
1605 where F
: FnMut(&T
) -> bool
{
1606 let (mut left
, mut right
) = (0, xs
.len());
1607 while left
< right
{
1608 let mid
= (left
+ right
) / 2;
1618 /// Escapes all regular expression meta characters in `text`.
1620 /// The string returned may be safely used as a literal in a regular
1622 pub fn quote(text
: &str) -> String
{
1623 let mut quoted
= String
::with_capacity(text
.len());
1624 for c
in text
.chars() {
1625 if parser
::is_punct(c
) {
1633 fn quote_char(c
: char) -> String
{
1634 let mut s
= String
::new();
1635 if parser
::is_punct(c
) {
1642 fn quote_byte(b
: u8) -> String
{
1643 if parser
::is_punct(b
as char) || b
== b'
\''
|| b
== b'
"' {
1644 quote_char(b as char)
1646 let escaped: Vec<u8> = ascii::escape_default(b).collect();
1647 String::from_utf8(escaped).unwrap()
1651 fn inc_char(c: char) -> char {
1653 char::MAX => char::MAX,
1654 '\u{D7FF}' => '\u{E000}',
1655 c => char::from_u32(c as u32 + 1).unwrap(),
1659 fn dec_char(c: char) -> char {
1662 '\u{E000}' => '\u{D7FF}',
1663 c => char::from_u32(c as u32 - 1).unwrap(),
1667 /// Returns true if and only if `c` is a word character.
1669 pub fn is_word_char(c: char) -> bool {
1671 '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
1672 _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
1673 if c >= start && c <= end {
1675 } else if start > c {
1684 /// Returns true if and only if `c` is an ASCII word byte.
1686 pub fn is_word_byte(b: u8) -> bool {
1688 b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' => true,
1698 use {CharClass, ClassRange, ByteClass, ByteRange, Expr};
1700 fn class(ranges: &[(char, char)]) -> CharClass {
1701 let ranges = ranges.iter().cloned()
1702 .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1703 CharClass::new(ranges)
1706 fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1707 let ranges = ranges.iter().cloned()
1708 .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1709 ByteClass::new(ranges)
1712 fn e(re: &str) -> Expr { Expr::parse(re).unwrap() }
1715 fn stack_exhaustion() {
1716 use std::iter::repeat;
1718 let open: String = repeat('(').take(200).collect();
1719 let close: String = repeat(')').take(200).collect();
1720 assert!(Expr::parse(&format!("{}a{}
", open, close)).is_ok());
1722 let open: String = repeat('(').take(200 + 1).collect();
1723 let close: String = repeat(')').take(200 + 1).collect();
1724 assert!(Expr::parse(&format!("{}a{}
", open, close)).is_err());
1728 fn anchored_start() {
1729 assert!(e("^a
").is_anchored_start());
1730 assert!(e("(^a
)").is_anchored_start());
1731 assert!(e("^a
|^b
").is_anchored_start());
1732 assert!(e("(^a
)|(^b
)").is_anchored_start());
1733 assert!(e("(^
(a
|b
))").is_anchored_start());
1735 assert!(!e("^a
|b
").is_anchored_start());
1736 assert!(!e("a
|^b
").is_anchored_start());
1741 assert!(e("a$
").is_anchored_end());
1742 assert!(e("(a$
)").is_anchored_end());
1743 assert!(e("a$
|b$
").is_anchored_end());
1744 assert!(e("(a$
)|(b$
)").is_anchored_end());
1745 assert!(e("((a
|b
)$
)").is_anchored_end());
1747 assert!(!e("a$
|b
").is_anchored_end());
1748 assert!(!e("a
|b$
").is_anchored_end());
1752 fn class_canon_no_change() {
1753 let cls = class(&[('a', 'c'), ('x', 'z')]);
1754 assert_eq!(cls.clone().canonicalize(), cls);
1758 fn class_canon_unordered() {
1759 let cls = class(&[('x', 'z'), ('a', 'c')]);
1760 assert_eq!(cls.canonicalize(), class(&[
1761 ('a', 'c'), ('x', 'z'),
1766 fn class_canon_overlap() {
1767 let cls = class(&[('x', 'z'), ('w', 'y')]);
1768 assert_eq!(cls.canonicalize(), class(&[
1774 fn class_canon_overlap_many() {
1776 ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1777 ('m', 'p'), ('l', 's'),
1779 assert_eq!(cls.clone().canonicalize(), class(&[
1780 ('a', 'j'), ('l', 's'),
1785 fn class_canon_overlap_boundary() {
1786 let cls = class(&[('x', 'z'), ('u', 'w')]);
1787 assert_eq!(cls.canonicalize(), class(&[
1793 fn class_canon_extreme_edge_case() {
1794 let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1795 assert_eq!(cls.canonicalize(), class(&[
1796 ('\x00', '\u{10FFFF}'),
1801 fn class_canon_singles() {
1802 let cls = class(&[('a', 'a'), ('b', 'b')]);
1803 assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
1807 fn class_negate_single() {
1808 let cls = class(&[('a', 'a')]);
1809 assert_eq!(cls.negate(), class(&[
1810 ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
1815 fn class_negate_singles() {
1816 let cls = class(&[('a', 'a'), ('b', 'b')]);
1817 assert_eq!(cls.negate(), class(&[
1818 ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
1823 fn class_negate_multiples() {
1824 let cls = class(&[('a', 'c'), ('x', 'z')]);
1825 assert_eq!(cls.negate(), class(&[
1826 ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
1831 fn class_negate_min_scalar() {
1832 let cls = class(&[('\x00', 'a')]);
1833 assert_eq!(cls.negate(), class(&[
1834 ('\x62', '\u{10FFFF}'),
1839 fn class_negate_max_scalar() {
1840 let cls = class(&[('a', '\u{10FFFF}')]);
1841 assert_eq!(cls.negate(), class(&[
1847 fn class_negate_everything() {
1848 let cls = class(&[('\x00', '\u{10FFFF}')]);
1849 assert_eq!(cls.negate(), class(&[]));
1853 fn class_negate_everything_sans_one() {
1855 ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
1857 assert_eq!(cls.negate(), class(&[
1858 ('\u{10FFFE}', '\u{10FFFE}'),
1863 fn class_negate_surrogates_min() {
1864 let cls = class(&[('\x00', '\u{D7FF}')]);
1865 assert_eq!(cls.negate(), class(&[
1866 ('\u{E000}', '\u{10FFFF}'),
1871 fn class_negate_surrogates_min_edge() {
1872 let cls = class(&[('\x00', '\u{D7FE}')]);
1873 assert_eq!(cls.negate(), class(&[
1874 ('\u{D7FF}', '\u{10FFFF}'),
1879 fn class_negate_surrogates_max() {
1880 let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
1881 assert_eq!(cls.negate(), class(&[
1882 ('\x00', '\u{D7FF}'),
1887 fn class_negate_surrogates_max_edge() {
1888 let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
1889 assert_eq!(cls.negate(), class(&[
1890 ('\x00', '\u{E000}'),
1895 fn class_canon_overlap_many_case_fold() {
1897 ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
1898 ('M', 'P'), ('L', 'S'), ('c', 'f'),
1900 assert_eq!(cls.case_fold(), class(&[
1901 ('A', 'J'), ('L', 'S'),
1902 ('a', 'j'), ('l', 's'),
1903 ('\u{17F}', '\u{17F}'),
1907 (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
1908 (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
1910 assert_eq!(cls.case_fold(), bclass(&[
1911 (b'A', b'J'), (b'L', b'S'),
1912 (b'a', b'j'), (b'l', b's'),
1917 fn class_fold_az() {
1918 let cls = class(&[('A', 'Z')]);
1919 assert_eq!(cls.case_fold(), class(&[
1920 ('A', 'Z'), ('a', 'z'),
1921 ('\u{17F}', '\u{17F}'),
1922 ('\u{212A}', '\u{212A}'),
1924 let cls = class(&[('a', 'z')]);
1925 assert_eq!(cls.case_fold(), class(&[
1926 ('A', 'Z'), ('a', 'z'),
1927 ('\u{17F}', '\u{17F}'),
1928 ('\u{212A}', '\u{212A}'),
1931 let cls = bclass(&[(b'A', b'Z')]);
1932 assert_eq!(cls.case_fold(), bclass(&[
1933 (b'A', b'Z'), (b'a', b'z'),
1935 let cls = bclass(&[(b'a', b'z')]);
1936 assert_eq!(cls.case_fold(), bclass(&[
1937 (b'A', b'Z'), (b'a', b'z'),
1942 fn class_fold_a_underscore() {
1943 let cls = class(&[('A', 'A'), ('_', '_')]);
1944 assert_eq!(cls.clone().canonicalize(), class(&[
1945 ('A', 'A'), ('_', '_'),
1947 assert_eq!(cls.case_fold(), class(&[
1948 ('A', 'A'), ('_', '_'), ('a', 'a'),
1951 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1952 assert_eq!(cls.clone().canonicalize(), bclass(&[
1953 (b'A', b'A'), (b'_', b'_'),
1955 assert_eq!(cls.case_fold(), bclass(&[
1956 (b'A', b'A'), (b'_', b'_'), (b'a', b'a'),
1961 fn class_fold_a_equals() {
1962 let cls = class(&[('A', 'A'), ('=', '=')]);
1963 assert_eq!(cls.clone().canonicalize(), class(&[
1964 ('=', '='), ('A', 'A'),
1966 assert_eq!(cls.case_fold(), class(&[
1967 ('=', '='), ('A', 'A'), ('a', 'a'),
1970 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1971 assert_eq!(cls.clone().canonicalize(), bclass(&[
1972 (b'=', b'='), (b'A', b'A'),
1974 assert_eq!(cls.case_fold(), bclass(&[
1975 (b'=', b'='), (b'A', b'A'), (b'a', b'a'),
1980 fn class_fold_no_folding_needed() {
1981 let cls = class(&[('\x00', '\x10')]);
1982 assert_eq!(cls.case_fold(), class(&[
1986 let cls = bclass(&[(b'\x00', b'\x10')]);
1987 assert_eq!(cls.case_fold(), bclass(&[
1993 fn class_fold_negated() {
1994 let cls = class(&[('x', 'x')]);
1995 assert_eq!(cls.clone().case_fold(), class(&[
1996 ('X', 'X'), ('x', 'x'),
1998 assert_eq!(cls.case_fold().negate(), class(&[
1999 ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'),
2002 let cls = bclass(&[(b'x', b'x')]);
2003 assert_eq!(cls.clone().case_fold(), bclass(&[
2004 (b'X', b'X'), (b'x', b'x'),
2006 assert_eq!(cls.case_fold().negate(), bclass(&[
2007 (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'),
2012 fn class_fold_single_to_multiple() {
2013 let cls = class(&[('k', 'k')]);
2014 assert_eq!(cls.case_fold(), class(&[
2015 ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2018 let cls = bclass(&[(b'k', b'k')]);
2019 assert_eq!(cls.case_fold(), bclass(&[
2020 (b'K', b'K'), (b'k', b'k'),
2025 fn class_fold_at() {
2026 let cls = class(&[('@', '@')]);
2027 assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')]));
2028 assert_eq!(cls.case_fold(), class(&[('@', '@')]));
2030 let cls = bclass(&[(b'@', b'@')]);
2031 assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')]));
2032 assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')]));
2036 fn roundtrip_class_hypen() {
2037 let expr = e("[-./]");
2038 assert_eq!("(?u
:[-\\.-/])", expr.to_string());
2040 let expr = e("(?
-u
)[-./]");
2041 assert_eq!("(?
-u
:[-\\.-/])", expr.to_string());