1 // Copyright 2017 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
12 Defines an abstract syntax for regular expressions.
15 use std
::cmp
::Ordering
;
19 pub use ast
::visitor
::{Visitor, visit}
;
25 /// An error that occurred while parsing a regular expression into an abstract
28 /// Note that note all ASTs represents a valid regular expression. For example,
29 /// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
30 /// valid Unicode property name. That particular error is reported when
31 /// translating an AST to the high-level intermediate representation (`HIR`).
32 #[derive(Clone, Debug, Eq, PartialEq)]
34 /// The kind of error.
36 /// The original pattern that the parser generated the error from. Every
37 /// span in an error is a valid range into this string.
39 /// The span of this error.
44 /// Return the type of this error.
45 pub fn kind(&self) -> &ErrorKind
{
49 /// The original pattern string in which this error occurred.
51 /// Every span reported by this error is reported in terms of this string.
52 pub fn pattern(&self) -> &str {
56 /// Return the span at which this error occurred.
57 pub fn span(&self) -> &Span
{
61 /// Return an auxiliary span. This span exists only for some errors that
62 /// benefit from being able to point to two locations in the original
63 /// regular expression. For example, "duplicate" errors will have the
64 /// main error position set to the duplicate occurrence while its
65 /// auxiliary span will be set to the initial occurrence.
66 pub fn auxiliary_span(&self) -> Option
<&Span
> {
67 use self::ErrorKind
::*;
69 FlagDuplicate { ref original }
=> Some(original
),
70 FlagRepeatedNegation { ref original, .. }
=> Some(original
),
71 GroupNameDuplicate { ref original, .. }
=> Some(original
),
77 /// The type of an error that occurred while building an AST.
78 #[derive(Clone, Debug, Eq, PartialEq)]
80 /// The capturing group limit was exceeded.
82 /// Note that this represents a limit on the total number of capturing
83 /// groups in a regex and not necessarily the number of nested capturing
84 /// groups. That is, the nest limit can be low and it is still possible for
85 /// this error to occur.
87 /// An invalid escape sequence was found in a character class set.
89 /// An invalid character class range was found. An invalid range is any
90 /// range where the start is greater than the end.
92 /// An opening `[` was found with no corresponding closing `]`.
94 /// An empty decimal number was given where one was expected.
96 /// An invalid decimal number was given where one was expected.
98 /// A bracketed hex literal was empty.
100 /// A bracketed hex literal did not correspond to a Unicode scalar value.
102 /// An invalid hexadecimal digit was found.
103 EscapeHexInvalidDigit
,
104 /// EOF was found before an escape sequence was completed.
106 /// An unrecognized escape sequence.
108 /// A dangling negation was used when setting flags, e.g., `i-`.
109 FlagDanglingNegation
,
110 /// A flag was used twice, e.g., `i-i`.
112 /// The position of the original flag. The error position
113 /// points to the duplicate flag.
116 /// The negation operator was used twice, e.g., `-i-s`.
117 FlagRepeatedNegation
{
118 /// The position of the original negation operator. The error position
119 /// points to the duplicate negation operator.
122 /// Expected a flag but got EOF, e.g., `(?`.
124 /// Unrecognized flag, e.g., `a`.
126 /// A duplicate capture name was found.
128 /// The position of the initial occurrence of the capture name. The
129 /// error position itself points to the duplicate occurrence.
132 /// A capture group name is empty, e.g., `(?P<>abc)`.
134 /// An invalid character was seen for a capture group name. This includes
135 /// errors where the first character is a digit (even though subsequent
136 /// characters are allowed to be digits).
138 /// A closing `>` could not be found for a capture group name.
139 GroupNameUnexpectedEof
,
140 /// An unclosed group, e.g., `(ab`.
142 /// The span of this error corresponds to the unclosed parenthesis.
144 /// An unopened group, e.g., `ab)`.
146 /// The nest limit was exceeded. The limit stored here is the limit
147 /// configured in the parser.
148 NestLimitExceeded(u32),
149 /// The range provided in a counted repetition operator is invalid. The
150 /// range is invalid if the start is greater than the end.
151 RepetitionCountInvalid
,
152 /// An opening `{` was found with no corresponding closing `}`.
153 RepetitionCountUnclosed
,
154 /// A repetition operator was applied to a missing sub-expression. This
155 /// occurs, for example, in the regex consisting of just a `*`. It is,
156 /// however, possible to create a repetition operating on an empty
157 /// sub-expression. For example, `()*` is still considered valid.
159 /// When octal support is disabled, this error is produced when an octal
160 /// escape is used. The octal escape is assumed to be an invocation of
161 /// a backreference, which is the common case.
162 UnsupportedBackreference
,
163 /// When syntax similar to PCRE's look-around is used, this error is
164 /// returned. Some example syntaxes that are rejected include, but are
165 /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
166 /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
167 /// error is used to improve the user experience.
168 UnsupportedLookAround
,
169 /// Hints that destructuring should not be exhaustive.
171 /// This enum may grow additional variants, so this makes sure clients
172 /// don't count on exhaustive matching. (Otherwise, adding a new variant
173 /// could break existing code.)
178 impl error
::Error
for Error
{
179 fn description(&self) -> &str {
180 use self::ErrorKind
::*;
182 CaptureLimitExceeded
=> "capture group limit exceeded",
183 ClassEscapeInvalid
=> "invalid escape sequence in character class",
184 ClassRangeInvalid
=> "invalid character class range",
185 ClassUnclosed
=> "unclosed character class",
186 DecimalEmpty
=> "empty decimal literal",
187 DecimalInvalid
=> "invalid decimal literal",
188 EscapeHexEmpty
=> "empty hexadecimal literal",
189 EscapeHexInvalid
=> "invalid hexadecimal literal",
190 EscapeHexInvalidDigit
=> "invalid hexadecimal digit",
191 EscapeUnexpectedEof
=> "unexpected eof (escape sequence)",
192 EscapeUnrecognized
=> "unrecognized escape sequence",
193 FlagDanglingNegation
=> "dangling flag negation operator",
194 FlagDuplicate{..}
=> "duplicate flag",
195 FlagRepeatedNegation{..}
=> "repeated negation",
196 FlagUnexpectedEof
=> "unexpected eof (flag)",
197 FlagUnrecognized
=> "unrecognized flag",
198 GroupNameDuplicate{..}
=> "duplicate capture group name",
199 GroupNameEmpty
=> "empty capture group name",
200 GroupNameInvalid
=> "invalid capture group name",
201 GroupNameUnexpectedEof
=> "unclosed capture group name",
202 GroupUnclosed
=> "unclosed group",
203 GroupUnopened
=> "unopened group",
204 NestLimitExceeded(_
) => "nest limit exceeded",
205 RepetitionCountInvalid
=> "invalid repetition count range",
206 RepetitionCountUnclosed
=> "unclosed counted repetition",
207 RepetitionMissing
=> "repetition operator missing expression",
208 UnsupportedBackreference
=> "backreferences are not supported",
209 UnsupportedLookAround
=> "look-around is not supported",
215 impl fmt
::Display
for Error
{
216 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
217 ::error
::Formatter
::from(self).fmt(f
)
221 impl fmt
::Display
for ErrorKind
{
222 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
223 use self::ErrorKind
::*;
225 CaptureLimitExceeded
=> {
226 write
!(f
, "exceeded the maximum number of \
227 capturing groups ({})", ::std
::u32::MAX
)
229 ClassEscapeInvalid
=> {
230 write
!(f
, "invalid escape sequence found in character class")
232 ClassRangeInvalid
=> {
233 write
!(f
, "invalid character class range, \
234 the start must be <= the end")
237 write
!(f
, "unclosed character class")
240 write
!(f
, "decimal literal empty")
243 write
!(f
, "decimal literal invalid")
246 write
!(f
, "hexadecimal literal empty")
248 EscapeHexInvalid
=> {
249 write
!(f
, "hexadecimal literal is not a Unicode scalar value")
251 EscapeHexInvalidDigit
=> {
252 write
!(f
, "invalid hexadecimal digit")
254 EscapeUnexpectedEof
=> {
255 write
!(f
, "incomplete escape sequence, \
256 reached end of pattern prematurely")
258 EscapeUnrecognized
=> {
259 write
!(f
, "unrecognized escape sequence")
261 FlagDanglingNegation
=> {
262 write
!(f
, "dangling flag negation operator")
264 FlagDuplicate{..}
=> {
265 write
!(f
, "duplicate flag")
267 FlagRepeatedNegation{..}
=> {
268 write
!(f
, "flag negation operator repeated")
270 FlagUnexpectedEof
=> {
271 write
!(f
, "expected flag but got end of regex")
273 FlagUnrecognized
=> {
274 write
!(f
, "unrecognized flag")
276 GroupNameDuplicate{..}
=> {
277 write
!(f
, "duplicate capture group name")
280 write
!(f
, "empty capture group name")
282 GroupNameInvalid
=> {
283 write
!(f
, "invalid capture group character")
285 GroupNameUnexpectedEof
=> {
286 write
!(f
, "unclosed capture group name")
289 write
!(f
, "unclosed group")
292 write
!(f
, "unopened group")
294 NestLimitExceeded(limit
) => {
295 write
!(f
, "exceed the maximum number of \
296 nested parentheses/brackets ({})", limit
)
298 RepetitionCountInvalid
=> {
299 write
!(f
, "invalid repetition count range, \
300 the start must be <= the end")
302 RepetitionCountUnclosed
=> {
303 write
!(f
, "unclosed counted repetition")
305 RepetitionMissing
=> {
306 write
!(f
, "repetition operator missing expression")
308 UnsupportedBackreference
=> {
309 write
!(f
, "backreferences are not supported")
311 UnsupportedLookAround
=> {
312 write
!(f
, "look-around, including look-ahead and look-behind, \
320 /// Span represents the position information of a single AST item.
322 /// All span positions are absolute byte offsets that can be used on the
323 /// original regular expression that was parsed.
324 #[derive(Clone, Copy, Eq, PartialEq)]
326 /// The start byte offset.
328 /// The end byte offset.
332 impl fmt
::Debug
for Span
{
333 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
334 write
!(f
, "Span({:?}, {:?})", self.start
, self.end
)
339 fn cmp(&self, other
: &Span
) -> Ordering
{
340 (&self.start
, &self.end
).cmp(&(&other
.start
, &other
.end
))
344 impl PartialOrd
for Span
{
345 fn partial_cmp(&self, other
: &Span
) -> Option
<Ordering
> {
346 Some(self.cmp(other
))
350 /// A single position in a regular expression.
352 /// A position encodes one half of a span, and include the byte offset, line
353 /// number and column number.
354 #[derive(Clone, Copy, Eq, PartialEq)]
355 pub struct Position
{
356 /// The absolute offset of this position, starting at `0` from the
357 /// beginning of the regular expression pattern string.
359 /// The line number, starting at `1`.
361 /// The approximate column number, starting at `1`.
365 impl fmt
::Debug
for Position
{
366 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
369 "Position(o: {:?}, l: {:?}, c: {:?})",
370 self.offset
, self.line
, self.column
)
374 impl Ord
for Position
{
375 fn cmp(&self, other
: &Position
) -> Ordering
{
376 self.offset
.cmp(&other
.offset
)
380 impl PartialOrd
for Position
{
381 fn partial_cmp(&self, other
: &Position
) -> Option
<Ordering
> {
382 Some(self.cmp(other
))
387 /// Create a new span with the given positions.
388 pub fn new(start
: Position
, end
: Position
) -> Span
{
389 Span { start: start, end: end }
392 /// Create a new span using the given position as the start and end.
393 pub fn splat(pos
: Position
) -> Span
{
397 /// Create a new span by replacing the starting the position with the one
399 pub fn with_start(self, pos
: Position
) -> Span
{
400 Span { start: pos, ..self }
403 /// Create a new span by replacing the ending the position with the one
405 pub fn with_end(self, pos
: Position
) -> Span
{
406 Span { end: pos, ..self }
409 /// Returns true if and only if this span occurs on a single line.
410 pub fn is_one_line(&self) -> bool
{
411 self.start
.line
== self.end
.line
414 /// Returns true if and only if this span is empty. That is, it points to
415 /// a single position in the concrete syntax of a regular expression.
416 pub fn is_empty(&self) -> bool
{
417 self.start
.offset
== self.end
.offset
422 /// Create a new position with the given information.
424 /// `offset` is the absolute offset of the position, starting at `0` from
425 /// the beginning of the regular expression pattern string.
427 /// `line` is the line number, starting at `1`.
429 /// `column` is the approximate column number, starting at `1`.
430 pub fn new(offset
: usize, line
: usize, column
: usize) -> Position
{
431 Position { offset: offset, line: line, column: column }
435 /// An abstract syntax tree for a singular expression along with comments
438 /// Comments are not stored in the tree itself to avoid complexity. Each
439 /// comment contains a span of precisely where it occurred in the original
440 /// regular expression.
441 #[derive(Clone, Debug, Eq, PartialEq)]
442 pub struct WithComments
{
445 /// All comments found in the original regular expression.
446 pub comments
: Vec
<Comment
>,
449 /// A comment from a regular expression with an associated span.
451 /// A regular expression can only contain comments when the `x` flag is
453 #[derive(Clone, Debug, Eq, PartialEq)]
455 /// The span of this comment, including the beginning `#` and ending `\n`.
457 /// The comment text, starting with the first character following the `#`
458 /// and ending with the last character preceding the `\n`.
462 /// An abstract syntax tree for a single regular expression.
464 /// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
465 /// space proportional to the size of the `Ast`.
467 /// This type defines its own destructor that uses constant stack space and
468 /// heap space proportional to the size of the `Ast`.
469 #[derive(Clone, Debug, Eq, PartialEq)]
471 /// An empty regex that matches everything.
473 /// A set of flags, e.g., `(?is)`.
475 /// A single character literal, which includes escape sequences.
477 /// The "any character" class.
479 /// A single zero-width assertion.
480 Assertion(Assertion
),
481 /// A single character class. This includes all forms of character classes
482 /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
484 /// A repetition operator applied to an arbitrary regular expression.
485 Repetition(Repetition
),
486 /// A grouped regular expression.
488 /// An alternation of regular expressions.
489 Alternation(Alternation
),
490 /// A concatenation of regular expressions.
495 /// Return the span of this abstract syntax tree.
496 pub fn span(&self) -> &Span
{
498 Ast
::Empty(ref span
) => span
,
499 Ast
::Flags(ref x
) => &x
.span
,
500 Ast
::Literal(ref x
) => &x
.span
,
501 Ast
::Dot(ref span
) => span
,
502 Ast
::Assertion(ref x
) => &x
.span
,
503 Ast
::Class(ref x
) => x
.span(),
504 Ast
::Repetition(ref x
) => &x
.span
,
505 Ast
::Group(ref x
) => &x
.span
,
506 Ast
::Alternation(ref x
) => &x
.span
,
507 Ast
::Concat(ref x
) => &x
.span
,
511 /// Return true if and only if this Ast is empty.
512 pub fn is_empty(&self) -> bool
{
514 Ast
::Empty(_
) => true,
519 /// Returns true if and only if this AST has any (including possibly empty)
521 fn has_subexprs(&self) -> bool
{
527 | Ast
::Assertion(_
) => false,
531 | Ast
::Alternation(_
)
532 | Ast
::Concat(_
) => true,
537 /// Print a display representation of this Ast.
539 /// This does not preserve any of the original whitespace formatting that may
540 /// have originally been present in the concrete syntax from which this Ast
543 /// This implementation uses constant stack space and heap space proportional
544 /// to the size of the `Ast`.
545 impl fmt
::Display
for Ast
{
546 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
547 use ast
::print
::Printer
;
548 Printer
::new().print(self, f
)
552 /// An alternation of regular expressions.
553 #[derive(Clone, Debug, Eq, PartialEq)]
554 pub struct Alternation
{
555 /// The span of this alternation.
557 /// The alternate regular expressions.
562 /// Return this alternation as an AST.
564 /// If this alternation contains zero ASTs, then Ast::Empty is
565 /// returned. If this alternation contains exactly 1 AST, then the
566 /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
567 pub fn into_ast(mut self) -> Ast
{
568 match self.asts
.len() {
569 0 => Ast
::Empty(self.span
),
570 1 => self.asts
.pop().unwrap(),
571 _
=> Ast
::Alternation(self),
576 /// A concatenation of regular expressions.
577 #[derive(Clone, Debug, Eq, PartialEq)]
579 /// The span of this concatenation.
581 /// The concatenation regular expressions.
586 /// Return this concatenation as an AST.
588 /// If this concatenation contains zero ASTs, then Ast::Empty is
589 /// returned. If this concatenation contains exactly 1 AST, then the
590 /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
591 pub fn into_ast(mut self) -> Ast
{
592 match self.asts
.len() {
593 0 => Ast
::Empty(self.span
),
594 1 => self.asts
.pop().unwrap(),
595 _
=> Ast
::Concat(self),
600 /// A single literal expression.
602 /// A literal corresponds to a single Unicode scalar value. Literals may be
603 /// represented in their literal form, e.g., `a` or in their escaped form,
605 #[derive(Clone, Debug, Eq, PartialEq)]
607 /// The span of this literal.
609 /// The kind of this literal.
610 pub kind
: LiteralKind
,
611 /// The Unicode scalar value corresponding to this literal.
616 /// If this literal was written as a `\x` hex escape, then this returns
617 /// the corresponding byte value. Otherwise, this returns `None`.
618 pub fn byte(&self) -> Option
<u8> {
619 let short_hex
= LiteralKind
::HexFixed(HexLiteralKind
::X
);
620 if self.c
as u32 <= 255 && self.kind
== short_hex
{
628 /// The kind of a single literal expression.
629 #[derive(Clone, Debug, Eq, PartialEq)]
630 pub enum LiteralKind
{
631 /// The literal is written verbatim, e.g., `a` or `☃`.
633 /// The literal is written as an escape because it is punctuation, e.g.,
636 /// The literal is written as an octal escape, e.g., `\141`.
638 /// The literal is written as a hex code with a fixed number of digits
639 /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
641 HexFixed(HexLiteralKind
),
642 /// The literal is written as a hex code with a bracketed number of
643 /// digits. The only restriction is that the bracketed hex code must refer
644 /// to a valid Unicode scalar value.
645 HexBrace(HexLiteralKind
),
646 /// The literal is written as a specially recognized escape, e.g., `\f`
648 Special(SpecialLiteralKind
),
651 /// The type of a special literal.
653 /// A special literal is a special escape sequence recognized by the regex
654 /// parser, e.g., `\f` or `\n`.
655 #[derive(Clone, Debug, Eq, PartialEq)]
656 pub enum SpecialLiteralKind
{
657 /// Bell, spelled `\a` (`\x07`).
659 /// Form feed, spelled `\f` (`\x0C`).
661 /// Tab, spelled `\t` (`\x09`).
663 /// Line feed, spelled `\n` (`\x0A`).
665 /// Carriage return, spelled `\r` (`\x0D`).
667 /// Vertical tab, spelled `\v` (`\x0B`).
669 /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
670 /// parsing in verbose mode.
674 /// The type of a Unicode hex literal.
676 /// Note that all variants behave the same when used with brackets. They only
677 /// differ when used without brackets in the number of hex digits that must
679 #[derive(Clone, Debug, Eq, PartialEq)]
680 pub enum HexLiteralKind
{
681 /// A `\x` prefix. When used without brackets, this form is limited to
684 /// A `\u` prefix. When used without brackets, this form is limited to
687 /// A `\U` prefix. When used without brackets, this form is limited to
692 impl HexLiteralKind
{
693 /// The number of digits that must be used with this literal form when
694 /// used without brackets. When used with brackets, there is no
695 /// restriction on the number of digits.
696 pub fn digits(&self) -> u32 {
698 HexLiteralKind
::X
=> 2,
699 HexLiteralKind
::UnicodeShort
=> 4,
700 HexLiteralKind
::UnicodeLong
=> 8,
705 /// A single character class expression.
706 #[derive(Clone, Debug, Eq, PartialEq)]
708 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
709 Unicode(ClassUnicode
),
710 /// A perl character class, e.g., `\d` or `\W`.
712 /// A bracketed character class set, which may contain zero or more
713 /// character ranges and/or zero or more nested classes. e.g.,
715 Bracketed(ClassBracketed
),
719 /// Return the span of this character class.
720 pub fn span(&self) -> &Span
{
722 Class
::Perl(ref x
) => &x
.span
,
723 Class
::Unicode(ref x
) => &x
.span
,
724 Class
::Bracketed(ref x
) => &x
.span
,
729 /// A Perl character class.
730 #[derive(Clone, Debug, Eq, PartialEq)]
731 pub struct ClassPerl
{
732 /// The span of this class.
734 /// The kind of Perl class.
735 pub kind
: ClassPerlKind
,
736 /// Whether the class is negated or not. e.g., `\d` is not negated but
741 /// The available Perl character classes.
742 #[derive(Clone, Debug, Eq, PartialEq)]
743 pub enum ClassPerlKind
{
752 /// An ASCII character class.
753 #[derive(Clone, Debug, Eq, PartialEq)]
754 pub struct ClassAscii
{
755 /// The span of this class.
757 /// The kind of ASCII class.
758 pub kind
: ClassAsciiKind
,
759 /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
760 /// but `[[:^alpha:]]` is.
764 /// The available ASCII character classes.
765 #[derive(Clone, Debug, Eq, PartialEq)]
766 pub enum ClassAsciiKind
{
775 /// `[\x00-\x1F\x7F]`
785 /// `[!-/:-@\[-`{-~]`
797 impl ClassAsciiKind
{
798 /// Return the corresponding ClassAsciiKind variant for the given name.
800 /// The name given should correspond to the lowercase version of the
801 /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
803 /// If no variant with the corresponding name exists, then `None` is
805 pub fn from_name(name
: &str) -> Option
<ClassAsciiKind
> {
806 use self::ClassAsciiKind
::*;
808 "alnum" => Some(Alnum
),
809 "alpha" => Some(Alpha
),
810 "ascii" => Some(Ascii
),
811 "blank" => Some(Blank
),
812 "cntrl" => Some(Cntrl
),
813 "digit" => Some(Digit
),
814 "graph" => Some(Graph
),
815 "lower" => Some(Lower
),
816 "print" => Some(Print
),
817 "punct" => Some(Punct
),
818 "space" => Some(Space
),
819 "upper" => Some(Upper
),
820 "word" => Some(Word
),
821 "xdigit" => Some(Xdigit
),
827 /// A Unicode character class.
828 #[derive(Clone, Debug, Eq, PartialEq)]
829 pub struct ClassUnicode
{
830 /// The span of this class.
832 /// Whether this class is negated or not.
834 /// Note: be careful when using this attribute. This specifically refers
835 /// to whether the class is written as `\p` or `\P`, where the latter
836 /// is `negated = true`. However, it also possible to write something like
837 /// `\P{scx!=Katakana}` which is actually equivalent to
838 /// `\p{scx=Katakana}` and is therefore not actually negated even though
839 /// `negated = true` here. To test whether this class is truly negated
840 /// or not, use the `is_negated` method.
842 /// The kind of Unicode class.
843 pub kind
: ClassUnicodeKind
,
847 /// Returns true if this class has been negated.
849 /// Note that this takes the Unicode op into account, if it's present.
850 /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
851 pub fn is_negated(&self) -> bool
{
853 ClassUnicodeKind
::NamedValue
{
854 op
: ClassUnicodeOpKind
::NotEqual
, ..
861 /// The available forms of Unicode character classes.
862 #[derive(Clone, Debug, Eq, PartialEq)]
863 pub enum ClassUnicodeKind
{
864 /// A one letter abbreviated class, e.g., `\pN`.
866 /// A binary property, general category or script. The string may be
869 /// A property name and an associated value.
871 /// The type of Unicode op used to associate `name` with `value`.
872 op
: ClassUnicodeOpKind
,
873 /// The property name (which may be empty).
875 /// The property value (which may be empty).
880 /// The type of op used in a Unicode character class.
881 #[derive(Clone, Debug, Eq, PartialEq)]
882 pub enum ClassUnicodeOpKind
{
883 /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
885 /// A property set to a specific value using a colon, e.g.,
886 /// `\p{scx:Katakana}`.
888 /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
892 impl ClassUnicodeOpKind
{
893 /// Whether the op is an equality op or not.
894 pub fn is_equal(&self) -> bool
{
896 ClassUnicodeOpKind
::Equal
|ClassUnicodeOpKind
::Colon
=> true,
902 /// A bracketed character class, e.g., `[a-z0-9]`.
903 #[derive(Clone, Debug, Eq, PartialEq)]
904 pub struct ClassBracketed
{
905 /// The span of this class.
907 /// Whether this class is negated or not. e.g., `[a]` is not negated but
910 /// The type of this set. A set is either a normal union of things, e.g.,
911 /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
915 /// A character class set.
917 /// This type corresponds to the internal structure of a bracketed character
918 /// class. That is, every bracketed character is one of two types: a union of
919 /// items (literals, ranges, other bracketed classes) or a tree of binary set
921 #[derive(Clone, Debug, Eq, PartialEq)]
923 /// An item, which can be a single literal, range, nested character class
924 /// or a union of items.
926 /// A single binary operation (i.e., &&, -- or ~~).
927 BinaryOp(ClassSetBinaryOp
),
931 /// Build a set from a union.
932 pub fn union(ast
: ClassSetUnion
) -> ClassSet
{
933 ClassSet
::Item(ClassSetItem
::Union(ast
))
936 /// Return the span of this character class set.
937 pub fn span(&self) -> &Span
{
939 ClassSet
::Item(ref x
) => x
.span(),
940 ClassSet
::BinaryOp(ref x
) => &x
.span
,
944 /// Return true if and only if this class set is empty.
945 fn is_empty(&self) -> bool
{
947 ClassSet
::Item(ClassSetItem
::Empty(_
)) => true,
953 /// A single component of a character class set.
954 #[derive(Clone, Debug, Eq, PartialEq)]
955 pub enum ClassSetItem
{
958 /// Note that a bracketed character class cannot contain a single empty
959 /// item. Empty items can appear when using one of the binary operators.
960 /// For example, `[&&]` is the intersection of two empty classes.
962 /// A single literal.
964 /// A range between two literals.
965 Range(ClassSetRange
),
966 /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
968 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
969 Unicode(ClassUnicode
),
970 /// A perl character class, e.g., `\d` or `\W`.
972 /// A bracketed character class set, which may contain zero or more
973 /// character ranges and/or zero or more nested classes. e.g.,
975 Bracketed(Box
<ClassBracketed
>),
976 /// A union of items.
977 Union(ClassSetUnion
),
981 /// Return the span of this character class set item.
982 pub fn span(&self) -> &Span
{
984 ClassSetItem
::Empty(ref span
) => span
,
985 ClassSetItem
::Literal(ref x
) => &x
.span
,
986 ClassSetItem
::Range(ref x
) => &x
.span
,
987 ClassSetItem
::Ascii(ref x
) => &x
.span
,
988 ClassSetItem
::Perl(ref x
) => &x
.span
,
989 ClassSetItem
::Unicode(ref x
) => &x
.span
,
990 ClassSetItem
::Bracketed(ref x
) => &x
.span
,
991 ClassSetItem
::Union(ref x
) => &x
.span
,
996 /// A single character class range in a set.
997 #[derive(Clone, Debug, Eq, PartialEq)]
998 pub struct ClassSetRange
{
999 /// The span of this range.
1001 /// The start of this range.
1003 /// The end of this range.
1007 impl ClassSetRange
{
1008 /// Returns true if and only if this character class range is valid.
1010 /// The only case where a range is invalid is if its start is greater than
1012 pub fn is_valid(&self) -> bool
{
1013 self.start
.c
<= self.end
.c
1017 /// A union of items inside a character class set.
1018 #[derive(Clone, Debug, Eq, PartialEq)]
1019 pub struct ClassSetUnion
{
1020 /// The span of the items in this operation. e.g., the `a-z0-9` in
1023 /// The sequence of items that make up this union.
1024 pub items
: Vec
<ClassSetItem
>,
1027 impl ClassSetUnion
{
1028 /// Push a new item in this union.
1030 /// The ending position of this union's span is updated to the ending
1031 /// position of the span of the item given. If the union is empty, then
1032 /// the starting position of this union is set to the starting position
1035 /// In other words, if you only use this method to add items to a union
1036 /// and you set the spans on each item correctly, then you should never
1037 /// need to adjust the span of the union directly.
1038 pub fn push(&mut self, item
: ClassSetItem
) {
1039 if self.items
.is_empty() {
1040 self.span
.start
= item
.span().start
;
1042 self.span
.end
= item
.span().end
;
1043 self.items
.push(item
);
1046 /// Return this union as a character class set item.
1048 /// If this union contains zero items, then an empty union is
1049 /// returned. If this concatenation contains exactly 1 item, then the
1050 /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1052 pub fn into_item(mut self) -> ClassSetItem
{
1053 match self.items
.len() {
1054 0 => ClassSetItem
::Empty(self.span
),
1055 1 => self.items
.pop().unwrap(),
1056 _
=> ClassSetItem
::Union(self),
1061 /// A Unicode character class set operation.
1062 #[derive(Clone, Debug, Eq, PartialEq)]
1063 pub struct ClassSetBinaryOp
{
1064 /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1066 /// The type of this set operation.
1067 pub kind
: ClassSetBinaryOpKind
,
1068 /// The left hand side of the operation.
1069 pub lhs
: Box
<ClassSet
>,
1070 /// The right hand side of the operation.
1071 pub rhs
: Box
<ClassSet
>,
1074 /// The type of a Unicode character class set operation.
1076 /// Note that this doesn't explicitly represent union since there is no
1077 /// explicit union operator. Concatenation inside a character class corresponds
1078 /// to the union operation.
1079 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1080 pub enum ClassSetBinaryOpKind
{
1081 /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1083 /// The difference of two sets, e.g., `\pN--[0-9]`.
1085 /// The symmetric difference of two sets. The symmetric difference is the
1086 /// set of elements belonging to one but not both sets.
1087 /// e.g., `[\pL~~[:ascii:]]`.
1088 SymmetricDifference
,
1091 /// A single zero-width assertion.
1092 #[derive(Clone, Debug, Eq, PartialEq)]
1093 pub struct Assertion
{
1094 /// The span of this assertion.
1096 /// The assertion kind, e.g., `\b` or `^`.
1097 pub kind
: AssertionKind
,
1100 /// An assertion kind.
1101 #[derive(Clone, Debug, Eq, PartialEq)]
1102 pub enum AssertionKind
{
1117 /// A repetition operation applied to a regular expression.
1118 #[derive(Clone, Debug, Eq, PartialEq)]
1119 pub struct Repetition
{
1120 /// The span of this operation.
1122 /// The actual operation.
1123 pub op
: RepetitionOp
,
1124 /// Whether this operation was applied greedily or not.
1126 /// The regular expression under repetition.
1130 /// The repetition operator itself.
1131 #[derive(Clone, Debug, Eq, PartialEq)]
1132 pub struct RepetitionOp
{
1133 /// The span of this operator. This includes things like `+`, `*?` and
1136 /// The type of operation.
1137 pub kind
: RepetitionKind
,
1140 /// The kind of a repetition operator.
1141 #[derive(Clone, Debug, Eq, PartialEq)]
1142 pub enum RepetitionKind
{
1150 Range(RepetitionRange
),
1153 /// A range repetition operator.
1154 #[derive(Clone, Debug, Eq, PartialEq)]
1155 pub enum RepetitionRange
{
1164 impl RepetitionRange
{
1165 /// Returns true if and only if this repetition range is valid.
1167 /// The only case where a repetition range is invalid is if it is bounded
1168 /// and its start is greater than its end.
1169 pub fn is_valid(&self) -> bool
{
1171 RepetitionRange
::Bounded(s
, e
) if s
> e
=> false,
1177 /// A grouped regular expression.
1179 /// This includes both capturing and non-capturing groups. This does **not**
1180 /// include flag-only groups like `(?is)`, but does contain any group that
1181 /// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1183 #[derive(Clone, Debug, Eq, PartialEq)]
1185 /// The span of this group.
1187 /// The kind of this group.
1188 pub kind
: GroupKind
,
1189 /// The regular expression in this group.
1194 /// If this group is non-capturing, then this returns the (possibly empty)
1195 /// set of flags. Otherwise, `None` is returned.
1196 pub fn flags(&self) -> Option
<&Flags
> {
1198 GroupKind
::NonCapturing(ref flags
) => Some(flags
),
1203 /// Returns true if and only if this group is capturing.
1204 pub fn is_capturing(&self) -> bool
{
1206 GroupKind
::CaptureIndex(_
) | GroupKind
::CaptureName(_
) => true,
1207 GroupKind
::NonCapturing(_
) => false,
1211 /// Returns the capture index of this group, if this is a capturing group.
1213 /// This returns a capture index precisely when `is_capturing` is `true`.
1214 pub fn capture_index(&self) -> Option
<u32> {
1216 GroupKind
::CaptureIndex(i
) => Some(i
),
1217 GroupKind
::CaptureName(ref x
) => Some(x
.index
),
1218 GroupKind
::NonCapturing(_
) => None
,
1223 /// The kind of a group.
1224 #[derive(Clone, Debug, Eq, PartialEq)]
1225 pub enum GroupKind
{
1229 CaptureName(CaptureName
),
1230 /// `(?:a)` and `(?i:a)`
1231 NonCapturing(Flags
),
1236 /// This corresponds to the name itself between the angle brackets in, e.g.,
1237 /// `(?P<foo>expr)`.
1238 #[derive(Clone, Debug, Eq, PartialEq)]
1239 pub struct CaptureName
{
1240 /// The span of this capture name.
1242 /// The capture name.
1244 /// The capture index.
1248 /// A group of flags that is not applied to a particular regular expression.
1249 #[derive(Clone, Debug, Eq, PartialEq)]
1250 pub struct SetFlags
{
1251 /// The span of these flags, including the grouping parentheses.
1253 /// The actual sequence of flags.
1257 /// A group of flags.
1259 /// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1260 #[derive(Clone, Debug, Eq, PartialEq)]
1262 /// The span of this group of flags.
1264 /// A sequence of flag items. Each item is either a flag or a negation
1266 pub items
: Vec
<FlagsItem
>,
1270 /// Add the given item to this sequence of flags.
1272 /// If the item was added successfully, then `None` is returned. If the
1273 /// given item is a duplicate, then `Some(i)` is returned, where
1274 /// `items[i].kind == item.kind`.
1275 pub fn add_item(&mut self, item
: FlagsItem
) -> Option
<usize> {
1276 for (i
, x
) in self.items
.iter().enumerate() {
1277 if x
.kind
== item
.kind
{
1281 self.items
.push(item
);
1285 /// Returns the state of the given flag in this set.
1287 /// If the given flag is in the set but is negated, then `Some(false)` is
1290 /// If the given flag is in the set and is not negated, then `Some(true)`
1293 /// Otherwise, `None` is returned.
1294 pub fn flag_state(&self, flag
: Flag
) -> Option
<bool
> {
1295 let mut negated
= false;
1296 for x
in &self.items
{
1298 FlagsItemKind
::Negation
=> {
1301 FlagsItemKind
::Flag(ref xflag
) if xflag
== &flag
=> {
1302 return Some(!negated
);
1311 /// A single item in a group of flags.
1312 #[derive(Clone, Debug, Eq, PartialEq)]
1313 pub struct FlagsItem
{
1314 /// The span of this item.
1316 /// The kind of this item.
1317 pub kind
: FlagsItemKind
,
1320 /// The kind of an item in a group of flags.
1321 #[derive(Clone, Debug, Eq, PartialEq)]
1322 pub enum FlagsItemKind
{
1323 /// A negation operator applied to all subsequent flags in the enclosing
1326 /// A single flag in a group.
1330 impl FlagsItemKind
{
1331 /// Returns true if and only if this item is a negation operator.
1332 pub fn is_negation(&self) -> bool
{
1334 FlagsItemKind
::Negation
=> true,
1341 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1357 /// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1358 /// space but heap space proportional to the depth of the `Ast`.
1360 fn drop(&mut self) {
1369 // Classes are recursive, so they get their own Drop impl.
1370 | Ast
::Class(_
) => return,
1371 Ast
::Repetition(ref x
) if !x
.ast
.has_subexprs() => return,
1372 Ast
::Group(ref x
) if !x
.ast
.has_subexprs() => return,
1373 Ast
::Alternation(ref x
) if x
.asts
.is_empty() => return,
1374 Ast
::Concat(ref x
) if x
.asts
.is_empty() => return,
1378 let empty_span
= || Span
::splat(Position
::new(0, 0, 0));
1379 let empty_ast
= || Ast
::Empty(empty_span());
1380 let mut stack
= vec
![mem
::replace(self, empty_ast())];
1381 while let Some(mut ast
) = stack
.pop() {
1388 // Classes are recursive, so they get their own Drop impl.
1389 | Ast
::Class(_
) => {}
1390 Ast
::Repetition(ref mut x
) => {
1391 stack
.push(mem
::replace(&mut x
.ast
, empty_ast()));
1393 Ast
::Group(ref mut x
) => {
1394 stack
.push(mem
::replace(&mut x
.ast
, empty_ast()));
1396 Ast
::Alternation(ref mut x
) => {
1397 stack
.extend(x
.asts
.drain(..));
1399 Ast
::Concat(ref mut x
) => {
1400 stack
.extend(x
.asts
.drain(..));
1407 /// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1408 /// stack space but heap space proportional to the depth of the `ClassSet`.
1409 impl Drop
for ClassSet
{
1410 fn drop(&mut self) {
1414 ClassSet
::Item(ref item
) => {
1416 ClassSetItem
::Empty(_
)
1417 | ClassSetItem
::Literal(_
)
1418 | ClassSetItem
::Range(_
)
1419 | ClassSetItem
::Ascii(_
)
1420 | ClassSetItem
::Unicode(_
)
1421 | ClassSetItem
::Perl(_
) => return,
1422 ClassSetItem
::Bracketed(ref x
) => {
1423 if x
.kind
.is_empty() {
1427 ClassSetItem
::Union(ref x
) => {
1428 if x
.items
.is_empty() {
1434 ClassSet
::BinaryOp(ref op
) => {
1435 if op
.lhs
.is_empty() && op
.rhs
.is_empty() {
1441 let empty_span
= || Span
::splat(Position
::new(0, 0, 0));
1442 let empty_set
= || ClassSet
::Item(ClassSetItem
::Empty(empty_span()));
1443 let mut stack
= vec
![mem
::replace(self, empty_set())];
1444 while let Some(mut set
) = stack
.pop() {
1446 ClassSet
::Item(ref mut item
) => {
1448 ClassSetItem
::Empty(_
)
1449 | ClassSetItem
::Literal(_
)
1450 | ClassSetItem
::Range(_
)
1451 | ClassSetItem
::Ascii(_
)
1452 | ClassSetItem
::Unicode(_
)
1453 | ClassSetItem
::Perl(_
) => {}
1454 ClassSetItem
::Bracketed(ref mut x
) => {
1455 stack
.push(mem
::replace(&mut x
.kind
, empty_set()));
1457 ClassSetItem
::Union(ref mut x
) => {
1459 x
.items
.drain(..).map(ClassSet
::Item
));
1463 ClassSet
::BinaryOp(ref mut op
) => {
1464 stack
.push(mem
::replace(&mut op
.lhs
, empty_set()));
1465 stack
.push(mem
::replace(&mut op
.rhs
, empty_set()));
1476 // We use a thread with an explicit stack size to test that our destructor
1477 // for Ast can handle arbitrarily sized expressions in constant stack
1478 // space. In case we run on a platform without threads (WASM?), we limit
1479 // this test to Windows/Unix.
1481 #[cfg(any(unix, windows))]
1482 fn no_stack_overflow_on_drop() {
1486 let span
= || Span
::splat(Position
::new(0, 0, 0));
1487 let mut ast
= Ast
::Empty(span());
1489 ast
= Ast
::Group(Group
{
1491 kind
: GroupKind
::CaptureIndex(i
),
1495 assert
!(!ast
.is_empty());
1498 // We run our test on a thread with a small stack size so we can
1499 // force the issue more easily.
1500 thread
::Builder
::new()