]> git.proxmox.com Git - rustc.git/blame - src/vendor/regex-syntax/src/ast/parse.rs
New upstream version 1.28.0~beta.14+dfsg1
[rustc.git] / src / vendor / regex-syntax / src / ast / parse.rs
CommitLineData
0531ce1d
XL
1// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11/*!
12This module provides a regular expression parser.
13*/
14
15use std::borrow::Borrow;
16use std::cell::{Cell, RefCell};
17use std::mem;
18use std::result;
19
20use ast::{self, Ast, Position, Span};
21use either::Either;
22
23use is_meta_character;
24
25type Result<T> = result::Result<T, ast::Error>;
26
27/// A primitive is an expression with no sub-expressions. This includes
28/// literals, assertions and non-set character classes. This representation
29/// is used as intermediate state in the parser.
30///
31/// This does not include ASCII character classes, since they can only appear
32/// within a set character class.
33#[derive(Clone, Debug, Eq, PartialEq)]
34enum Primitive {
35 Literal(ast::Literal),
36 Assertion(ast::Assertion),
37 Dot(Span),
38 Perl(ast::ClassPerl),
39 Unicode(ast::ClassUnicode),
40}
41
42impl Primitive {
43 /// Return the span of this primitive.
44 fn span(&self) -> &Span {
45 match *self {
46 Primitive::Literal(ref x) => &x.span,
47 Primitive::Assertion(ref x) => &x.span,
48 Primitive::Dot(ref span) => span,
49 Primitive::Perl(ref x) => &x.span,
50 Primitive::Unicode(ref x) => &x.span,
51 }
52 }
53
54 /// Convert this primitive into a proper AST.
55 fn into_ast(self) -> Ast {
56 match self {
57 Primitive::Literal(lit) => Ast::Literal(lit),
58 Primitive::Assertion(assert) => Ast::Assertion(assert),
59 Primitive::Dot(span) => Ast::Dot(span),
60 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
61 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
62 }
63 }
64
65 /// Convert this primitive into an item in a character class.
66 ///
67 /// If this primitive is not a legal item (i.e., an assertion or a dot),
68 /// then return an error.
69 fn into_class_set_item<P: Borrow<Parser>>(
70 self,
71 p: &ParserI<P>,
72 ) -> Result<ast::ClassSetItem> {
73 use ast::ClassSetItem;
74 use self::Primitive::*;
75
76 match self {
77 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
78 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
79 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
80 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
81 }
82 }
83
84 /// Convert this primitive into a literal in a character class. In
85 /// particular, literals are the only valid items that can appear in
86 /// ranges.
87 ///
88 /// If this primitive is not a legal item (i.e., a class, assertion or a
89 /// dot), then return an error.
90 fn into_class_literal<P: Borrow<Parser>>(
91 self,
92 p: &ParserI<P>,
93 ) -> Result<ast::Literal> {
94 use self::Primitive::*;
95
96 match self {
97 Literal(lit) => Ok(lit),
94b46f34 98 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
0531ce1d
XL
99 }
100 }
101}
102
103/// Returns true if the given character is a hexadecimal digit.
104fn is_hex(c: char) -> bool {
105 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
106}
107
108/// Returns true if the given character is a valid in a capture group name.
109///
110/// If `first` is true, then `c` is treated as the first character in the
111/// group name (which is not allowed to be a digit).
112fn is_capture_char(c: char, first: bool) -> bool {
113 c == '_' || (!first && c >= '0' && c <= '9')
114 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
115}
116
117/// A builder for a regular expression parser.
118///
119/// This builder permits modifying configuration options for the parser.
120#[derive(Clone, Debug)]
121pub struct ParserBuilder {
122 ignore_whitespace: bool,
123 nest_limit: u32,
124 octal: bool,
125}
126
127impl Default for ParserBuilder {
128 fn default() -> ParserBuilder {
129 ParserBuilder::new()
130 }
131}
132
133impl ParserBuilder {
134 /// Create a new parser builder with a default configuration.
135 pub fn new() -> ParserBuilder {
136 ParserBuilder {
137 ignore_whitespace: false,
138 nest_limit: 250,
139 octal: false,
140 }
141 }
142
143 /// Build a parser from this configuration with the given pattern.
144 pub fn build(&self) -> Parser {
145 Parser {
146 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
147 capture_index: Cell::new(0),
148 nest_limit: self.nest_limit,
149 octal: self.octal,
150 initial_ignore_whitespace: self.ignore_whitespace,
151 ignore_whitespace: Cell::new(self.ignore_whitespace),
152 comments: RefCell::new(vec![]),
153 stack_group: RefCell::new(vec![]),
154 stack_class: RefCell::new(vec![]),
155 capture_names: RefCell::new(vec![]),
156 scratch: RefCell::new(String::new()),
157 }
158 }
159
160 /// Set the nesting limit for this parser.
161 ///
162 /// The nesting limit controls how deep the abstract syntax tree is allowed
163 /// to be. If the AST exceeds the given limit (e.g., with too many nested
164 /// groups), then an error is returned by the parser.
165 ///
166 /// The purpose of this limit is to act as a heuristic to prevent stack
167 /// overflow for consumers that do structural induction on an `Ast` using
168 /// explicit recursion. While this crate never does this (instead using
169 /// constant stack space and moving the call stack to the heap), other
170 /// crates may.
171 ///
172 /// This limit is not checked until the entire Ast is parsed. Therefore,
173 /// if callers want to put a limit on the amount of heap space used, then
174 /// they should impose a limit on the length, in bytes, of the concrete
175 /// pattern string. In particular, this is viable since this parser
176 /// implementation will limit itself to heap space proportional to the
177 /// lenth of the pattern string.
178 ///
179 /// Note that a nest limit of `0` will return a nest limit error for most
180 /// patterns but not all. For example, a nest limit of `0` permits `a` but
181 /// not `ab`, since `ab` requires a concatenation, which results in a nest
182 /// depth of `1`. In general, a nest limit is not something that manifests
183 /// in an obvious way in the concrete syntax, therefore, it should not be
184 /// used in a granular way.
185 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
186 self.nest_limit = limit;
187 self
188 }
189
190 /// Whether to support octal syntax or not.
191 ///
192 /// Octal syntax is a little-known way of uttering Unicode codepoints in
193 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
194 /// `\141` are all equivalent regular expressions, where the last example
195 /// shows octal syntax.
196 ///
197 /// While supporting octal syntax isn't in and of itself a problem, it does
198 /// make good error messages harder. That is, in PCRE based regex engines,
199 /// syntax like `\0` invokes a backreference, which is explicitly
200 /// unsupported in Rust's regex engine. However, many users expect it to
201 /// be supported. Therefore, when octal support is disabled, the error
202 /// message will explicitly mention that backreferences aren't supported.
203 ///
204 /// Octal syntax is disabled by default.
205 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
206 self.octal = yes;
207 self
208 }
209
210 /// Enable verbose mode in the regular expression.
211 ///
212 /// When enabled, verbose mode permits insigificant whitespace in many
213 /// places in the regular expression, as well as comments. Comments are
214 /// started using `#` and continue until the end of the line.
215 ///
216 /// By default, this is disabled. It may be selectively enabled in the
217 /// regular expression by using the `x` flag regardless of this setting.
218 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
219 self.ignore_whitespace = yes;
220 self
221 }
222}
223
224/// A regular expression parser.
225///
226/// This parses a string representation of a regular expression into an
227/// abstract syntax tree. The size of the tree is proportional to the length
228/// of the regular expression pattern.
229///
230/// A `Parser` can be configured in more detail via a
231/// [`ParserBuilder`](struct.ParserBuilder.html).
232#[derive(Clone, Debug)]
233pub struct Parser {
234 /// The current position of the parser.
235 pos: Cell<Position>,
236 /// The current capture index.
237 capture_index: Cell<u32>,
238 /// The maximum number of open parens/brackets allowed. If the parser
239 /// exceeds this number, then an error is returned.
240 nest_limit: u32,
241 /// Whether to support octal syntax or not. When `false`, the parser will
242 /// return an error helpfully pointing out that backreferences are not
243 /// supported.
244 octal: bool,
245 /// The initial setting for `ignore_whitespace` as provided by
246 /// Th`ParserBuilder`. is is used when reseting the parser's state.
247 initial_ignore_whitespace: bool,
248 /// Whether whitespace should be ignored. When enabled, comments are
249 /// also permitted.
250 ignore_whitespace: Cell<bool>,
251 /// A list of comments, in order of appearance.
252 comments: RefCell<Vec<ast::Comment>>,
253 /// A stack of grouped sub-expressions, including alternations.
254 stack_group: RefCell<Vec<GroupState>>,
255 /// A stack of nested character classes. This is only non-empty when
256 /// parsing a class.
257 stack_class: RefCell<Vec<ClassState>>,
258 /// A sorted sequence of capture names. This is used to detect duplicate
259 /// capture names and report an error if one is detected.
260 capture_names: RefCell<Vec<ast::CaptureName>>,
261 /// A scratch buffer used in various places. Mostly this is used to
262 /// accumulate relevant characters from parts of a pattern.
263 scratch: RefCell<String>,
264}
265
266/// ParserI is the internal parser implementation.
267///
268/// We use this separate type so that we can carry the provided pattern string
269/// along with us. In particular, a `Parser` internal state is not tied to any
270/// one pattern, but `ParserI` is.
271///
272/// This type also lets us use `ParserI<&Parser>` in production code while
273/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
274/// work against the internal interface of the parser.
275#[derive(Clone, Debug)]
276struct ParserI<'s, P> {
277 /// The parser state/configuration.
278 parser: P,
279 /// The full regular expression provided by the user.
280 pattern: &'s str,
281}
282
283/// GroupState represents a single stack frame while parsing nested groups
284/// and alternations. Each frame records the state up to an opening parenthesis
285/// or a alternating bracket `|`.
286#[derive(Clone, Debug)]
287enum GroupState {
288 /// This state is pushed whenever an opening group is found.
289 Group {
290 /// The concatenation immediately preceding the opening group.
291 concat: ast::Concat,
292 /// The group that has been opened. Its sub-AST is always empty.
293 group: ast::Group,
294 /// Whether this group has the `x` flag enabled or not.
295 ignore_whitespace: bool,
296 },
297 /// This state is pushed whenever a new alternation branch is found. If
298 /// an alternation branch is found and this state is at the top of the
299 /// stack, then this state should be modified to include the new
300 /// alternation.
301 Alternation(ast::Alternation),
302}
303
304/// ClassState represents a single stack frame while parsing character classes.
305/// Each frame records the state up to an intersection, difference, symmetric
306/// difference or nested class.
307///
308/// Note that a parser's character class stack is only non-empty when parsing
309/// a character class. In all other cases, it is empty.
310#[derive(Clone, Debug)]
311enum ClassState {
312 /// This state is pushed whenever an opening bracket is found.
313 Open {
314 /// The union of class items immediately preceding this class.
315 union: ast::ClassSetUnion,
316 /// The class that has been opened. Typically this just corresponds
317 /// to the `[`, but it can also include `[^` since `^` indicates
318 /// negation of the class.
319 set: ast::ClassBracketed,
320 },
321 /// This state is pushed when a operator is seen. When popped, the stored
322 /// set becomes the left hand side of the operator.
323 Op {
324 /// The type of the operation, i.e., &&, -- or ~~.
325 kind: ast::ClassSetBinaryOpKind,
326 /// The left-hand side of the operator.
327 lhs: ast::ClassSet,
328 },
329}
330
331impl Parser {
332 /// Create a new parser with a default configuration.
333 ///
334 /// The parser can be run with either the `parse` or `parse_with_comments`
335 /// methods. The parse methods return an abstract syntax tree.
336 ///
337 /// To set configuration options on the parser, use
338 /// [`ParserBuilder`](struct.ParserBuilder.html).
339 pub fn new() -> Parser {
340 ParserBuilder::new().build()
341 }
342
343 /// Parse the regular expression into an abstract syntax tree.
344 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
345 ParserI::new(self, pattern).parse()
346 }
347
348 /// Parse the regular expression and return an abstract syntax tree with
349 /// all of the comments found in the pattern.
350 pub fn parse_with_comments(
351 &mut self,
352 pattern: &str,
353 ) -> Result<ast::WithComments> {
354 ParserI::new(self, pattern).parse_with_comments()
355 }
356
357 /// Reset the internal state of a parser.
358 ///
359 /// This is called at the beginning of every parse. This prevents the
360 /// parser from running with inconsistent state (say, if a previous
361 /// invocation returned an error and the parser is reused).
362 fn reset(&self) {
363 // These settings should be in line with the construction
364 // in `ParserBuilder::build`.
365 self.pos.set(Position { offset: 0, line: 1, column: 1});
366 self.ignore_whitespace.set(self.initial_ignore_whitespace);
367 self.comments.borrow_mut().clear();
368 self.stack_group.borrow_mut().clear();
369 self.stack_class.borrow_mut().clear();
370 }
371}
372
373impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
374 /// Build an internal parser from a parser configuration and a pattern.
375 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
376 ParserI { parser: parser, pattern: pattern }
377 }
378
379 /// Return a reference to the parser state.
380 fn parser(&self) -> &Parser {
381 self.parser.borrow()
382 }
383
384 /// Return a reference to the pattern being parsed.
385 fn pattern(&self) -> &str {
386 self.pattern.borrow()
387 }
388
389 /// Create a new error with the given span and error type.
390 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
391 ast::Error {
392 kind: kind,
393 pattern: self.pattern().to_string(),
394 span: span,
395 }
396 }
397
398 /// Return the current offset of the parser.
399 ///
400 /// The offset starts at `0` from the beginning of the regular expression
401 /// pattern string.
402 fn offset(&self) -> usize {
403 self.parser().pos.get().offset
404 }
405
406 /// Return the current line number of the parser.
407 ///
408 /// The line number starts at `1`.
409 fn line(&self) -> usize {
410 self.parser().pos.get().line
411 }
412
413 /// Return the current column of the parser.
414 ///
415 /// The column number starts at `1` and is reset whenever a `\n` is seen.
416 fn column(&self) -> usize {
417 self.parser().pos.get().column
418 }
419
420 /// Return the next capturing index. Each subsequent call increments the
421 /// internal index.
422 ///
423 /// The span given should correspond to the location of the opening
424 /// parenthesis.
425 ///
426 /// If the capture limit is exceeded, then an error is returned.
427 fn next_capture_index(&self, span: Span) -> Result<u32> {
428 let current = self.parser().capture_index.get();
94b46f34 429 let i = current.checked_add(1).ok_or_else(|| {
0531ce1d 430 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
94b46f34 431 })?;
0531ce1d
XL
432 self.parser().capture_index.set(i);
433 Ok(i)
434 }
435
436 /// Adds the given capture name to this parser. If this capture name has
437 /// already been used, then an error is returned.
438 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
439 let mut names = self.parser().capture_names.borrow_mut();
440 match names.binary_search_by_key(
441 &cap.name.as_str(),
442 |c| c.name.as_str(),
443 ) {
444 Err(i) => {
445 names.insert(i, cap.clone());
446 Ok(())
447 }
448 Ok(i) => {
449 Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate {
450 original: names[i].span,
451 }))
452 }
453 }
454 }
455
456 /// Return whether the parser should ignore whitespace or not.
457 fn ignore_whitespace(&self) -> bool {
458 self.parser().ignore_whitespace.get()
459 }
460
461 /// Return the character at the current position of the parser.
462 ///
463 /// This panics if the current position does not point to a valid char.
464 fn char(&self) -> char {
465 self.char_at(self.offset())
466 }
467
468 /// Return the character at the given position.
469 ///
470 /// This panics if the given position does not point to a valid char.
471 fn char_at(&self, i: usize) -> char {
472 self.pattern()[i..].chars().next()
473 .unwrap_or_else(|| {
474 panic!("expected char at offset {}", i)
475 })
476 }
477
478 /// Bump the parser to the next Unicode scalar value.
479 ///
480 /// If the end of the input has been reached, then `false` is returned.
481 fn bump(&self) -> bool {
482 if self.is_eof() {
483 return false;
484 }
485 let Position { mut offset, mut line, mut column } = self.pos();
486 if self.char() == '\n' {
487 line = line.checked_add(1).unwrap();
488 column = 1;
489 } else {
490 column = column.checked_add(1).unwrap();
491 }
492 offset += self.char().len_utf8();
493 self.parser().pos.set(Position {
494 offset: offset,
495 line: line,
496 column: column,
497 });
498 self.pattern()[self.offset()..].chars().next().is_some()
499 }
500
501 /// If the substring starting at the current position of the parser has
502 /// the given prefix, then bump the parser to the character immediately
503 /// following the prefix and return true. Otherwise, don't bump the parser
504 /// and return false.
505 fn bump_if(&self, prefix: &str) -> bool {
506 if self.pattern()[self.offset()..].starts_with(prefix) {
507 for _ in 0..prefix.chars().count() {
508 self.bump();
509 }
510 true
511 } else {
512 false
513 }
514 }
515
516 /// Returns true if and only if the parser is positioned at a look-around
517 /// prefix. The conditions under which this returns true must always
518 /// correspond to a regular expression that would otherwise be consider
519 /// invalid.
520 ///
521 /// This should only be called immediately after parsing the opening of
522 /// a group or a set of flags.
523 fn is_lookaround_prefix(&self) -> bool {
524 self.bump_if("?=")
525 || self.bump_if("?!")
526 || self.bump_if("?<=")
527 || self.bump_if("?<!")
528 }
529
530 /// Bump the parser, and if the `x` flag is enabled, bump through any
531 /// subsequent spaces. Return true if and only if the parser is not at
532 /// EOF.
533 fn bump_and_bump_space(&self) -> bool {
534 if !self.bump() {
535 return false;
536 }
537 self.bump_space();
538 !self.is_eof()
539 }
540
541 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
542 /// comments), then this will advance the parser through all whitespace
543 /// and comments to the next non-whitespace non-comment byte.
544 ///
545 /// If the `x` flag is disabled, then this is a no-op.
546 ///
547 /// This should be used selectively throughout the parser where
548 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
549 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
550 fn bump_space(&self) {
551 if !self.ignore_whitespace() {
552 return;
553 }
554 while !self.is_eof() {
555 if self.char().is_whitespace() {
556 self.bump();
557 } else if self.char() == '#' {
558 let start = self.pos();
559 let mut comment_text = String::new();
560 self.bump();
561 while !self.is_eof() {
562 let c = self.char();
563 self.bump();
564 if c == '\n' {
565 break;
566 }
567 comment_text.push(c);
568 }
569 let comment = ast::Comment {
570 span: Span::new(start, self.pos()),
571 comment: comment_text,
572 };
573 self.parser().comments.borrow_mut().push(comment);
574 } else {
575 break;
576 }
577 }
578 }
579
580 /// Peek at the next character in the input without advancing the parser.
581 ///
582 /// If the input has been exhausted, then this returns `None`.
583 fn peek(&self) -> Option<char> {
584 if self.is_eof() {
585 return None;
586 }
587 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
588 }
589
590 /// Like peek, but will ignore spaces when the parser is in whitespace
591 /// insensitive mode.
592 fn peek_space(&self) -> Option<char> {
593 if !self.ignore_whitespace() {
594 return self.peek();
595 }
596 if self.is_eof() {
597 return None;
598 }
599 let mut start = self.offset() + self.char().len_utf8();
600 let mut in_comment = false;
601 for (i, c) in self.pattern()[start..].char_indices() {
602 if c.is_whitespace() {
603 continue;
604 } else if !in_comment && c == '#' {
605 in_comment = true;
606 } else if in_comment && c == '\n' {
607 in_comment = false;
608 } else {
609 start += i;
610 break;
611 }
612 }
613 self.pattern()[start..].chars().next()
614 }
615
616 /// Returns true if the next call to `bump` would return false.
617 fn is_eof(&self) -> bool {
618 self.offset() == self.pattern().len()
619 }
620
621 /// Return the current position of the parser, which includes the offset,
622 /// line and column.
623 fn pos(&self) -> Position {
624 self.parser().pos.get()
625 }
626
627 /// Create a span at the current position of the parser. Both the start
628 /// and end of the span are set.
629 fn span(&self) -> Span {
630 Span::splat(self.pos())
631 }
632
633 /// Create a span that covers the current character.
634 fn span_char(&self) -> Span {
635 let mut next = Position {
636 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
637 line: self.line(),
638 column: self.column().checked_add(1).unwrap(),
639 };
640 if self.char() == '\n' {
641 next.line += 1;
642 next.column = 1;
643 }
644 Span::new(self.pos(), next)
645 }
646
647 /// Parse and push a single alternation on to the parser's internal stack.
648 /// If the top of the stack already has an alternation, then add to that
649 /// instead of pushing a new one.
650 ///
651 /// The concatenation given corresponds to a single alternation branch.
652 /// The concatenation returned starts the next branch and is empty.
653 ///
654 /// This assumes the parser is currently positioned at `|` and will advance
655 /// the parser to the character following `|`.
656 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
657 assert_eq!(self.char(), '|');
658 concat.span.end = self.pos();
659 self.push_or_add_alternation(concat);
660 self.bump();
661 Ok(ast::Concat {
662 span: self.span(),
663 asts: vec![],
664 })
665 }
666
667 /// Pushes or adds the given branch of an alternation to the parser's
668 /// internal stack of state.
669 fn push_or_add_alternation(&self, concat: ast::Concat) {
670 use self::GroupState::*;
671
672 let mut stack = self.parser().stack_group.borrow_mut();
673 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
674 alts.asts.push(concat.into_ast());
675 return;
676 }
677 stack.push(Alternation(ast::Alternation {
678 span: Span::new(concat.span.start, self.pos()),
679 asts: vec![concat.into_ast()],
680 }));
681 }
682
683 /// Parse and push a group AST (and its parent concatenation) on to the
684 /// parser's internal stack. Return a fresh concatenation corresponding
685 /// to the group's sub-AST.
686 ///
687 /// If a set of flags was found (with no group), then the concatenation
688 /// is returned with that set of flags added.
689 ///
690 /// This assumes that the parser is currently positioned on the opening
691 /// parenthesis. It advances the parser to the character at the start
692 /// of the sub-expression (or adjoining expression).
693 ///
694 /// If there was a problem parsing the start of the group, then an error
695 /// is returned.
696 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
697 assert_eq!(self.char(), '(');
94b46f34 698 match self.parse_group()? {
0531ce1d
XL
699 Either::Left(set) => {
700 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
701 if let Some(v) = ignore {
702 self.parser().ignore_whitespace.set(v);
703 }
704
705 concat.asts.push(Ast::Flags(set));
706 Ok(concat)
707 }
708 Either::Right(group) => {
709 let old_ignore_whitespace = self.ignore_whitespace();
710 let new_ignore_whitespace = group
711 .flags()
712 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
713 .unwrap_or(old_ignore_whitespace);
714 self.parser().stack_group.borrow_mut().push(GroupState::Group {
715 concat: concat,
716 group: group,
717 ignore_whitespace: old_ignore_whitespace,
718 });
719 self.parser().ignore_whitespace.set(new_ignore_whitespace);
720 Ok(ast::Concat {
721 span: self.span(),
722 asts: vec![],
723 })
724 }
725 }
726 }
727
728 /// Pop a group AST from the parser's internal stack and set the group's
729 /// AST to the given concatenation. Return the concatenation containing
730 /// the group.
731 ///
732 /// This assumes that the parser is currently positioned on the closing
733 /// parenthesis and advances the parser to the character following the `)`.
734 ///
735 /// If no such group could be popped, then an unopened group error is
736 /// returned.
737 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
738 use self::GroupState::*;
739
740 assert_eq!(self.char(), ')');
741 let mut stack = self.parser().stack_group.borrow_mut();
742 let (mut prior_concat, mut group, ignore_whitespace, alt) =
743 match stack.pop() {
744 Some(Group { concat, group, ignore_whitespace }) => {
745 (concat, group, ignore_whitespace, None)
746 }
747 Some(Alternation(alt)) => {
748 match stack.pop() {
749 Some(Group { concat, group, ignore_whitespace }) => {
750 (concat, group, ignore_whitespace, Some(alt))
751 }
752 None | Some(Alternation(_)) => {
753 return Err(self.error(
754 self.span_char(),
755 ast::ErrorKind::GroupUnopened,
756 ));
757 }
758 }
759 }
760 None => {
761 return Err(self.error(
762 self.span_char(),
763 ast::ErrorKind::GroupUnopened,
764 ));
765 }
766 };
767 self.parser().ignore_whitespace.set(ignore_whitespace);
768 group_concat.span.end = self.pos();
769 self.bump();
770 group.span.end = self.pos();
771 match alt {
772 Some(mut alt) => {
773 alt.span.end = group_concat.span.end;
774 alt.asts.push(group_concat.into_ast());
775 group.ast = Box::new(alt.into_ast());
776 }
777 None => {
778 group.ast = Box::new(group_concat.into_ast());
779 }
780 }
781 prior_concat.asts.push(Ast::Group(group));
782 Ok(prior_concat)
783 }
784
785 /// Pop the last state from the parser's internal stack, if it exists, and
786 /// add the given concatenation to it. There either must be no state or a
787 /// single alternation item on the stack. Any other scenario produces an
788 /// error.
789 ///
790 /// This assumes that the parser has advanced to the end.
791 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
792 concat.span.end = self.pos();
793 let mut stack = self.parser().stack_group.borrow_mut();
794 let ast = match stack.pop() {
795 None => Ok(concat.into_ast()),
796 Some(GroupState::Alternation(mut alt)) => {
797 alt.span.end = self.pos();
798 alt.asts.push(concat.into_ast());
799 Ok(Ast::Alternation(alt))
800 }
801 Some(GroupState::Group { group, .. }) => {
802 return Err(self.error(
803 group.span,
804 ast::ErrorKind::GroupUnclosed,
805 ));
806 }
807 };
808 // If we try to pop again, there should be nothing.
809 match stack.pop() {
810 None => ast,
811 Some(GroupState::Alternation(_)) => {
812 // This unreachable is unfortunate. This case can't happen
813 // because the only way we can be here is if there were two
814 // `GroupState::Alternation`s adjacent in the parser's stack,
815 // which we guarantee to never happen because we never push a
816 // `GroupState::Alternation` if one is already at the top of
817 // the stack.
818 unreachable!()
819 }
820 Some(GroupState::Group { group, .. }) => {
821 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
822 }
823 }
824 }
825
826 /// Parse the opening of a character class and push the current class
827 /// parsing context onto the parser's stack. This assumes that the parser
828 /// is positioned at an opening `[`. The given union should correspond to
829 /// the union of set items built up before seeing the `[`.
830 ///
831 /// If there was a problem parsing the opening of the class, then an error
832 /// is returned. Otherwise, a new union of set items for the class is
833 /// returned (which may be populated with either a `]` or a `-`).
834 fn push_class_open(
835 &self,
836 parent_union: ast::ClassSetUnion,
837 ) -> Result<ast::ClassSetUnion> {
838 assert_eq!(self.char(), '[');
839
94b46f34 840 let (nested_set, nested_union) = self.parse_set_class_open()?;
0531ce1d
XL
841 self.parser().stack_class.borrow_mut().push(ClassState::Open {
842 union: parent_union,
843 set: nested_set,
844 });
845 Ok(nested_union)
846 }
847
848 /// Parse the end of a character class set and pop the character class
849 /// parser stack. The union given corresponds to the last union built
850 /// before seeing the closing `]`. The union returned corresponds to the
851 /// parent character class set with the nested class added to it.
852 ///
853 /// This assumes that the parser is positioned at a `]` and will advance
854 /// the parser to the byte immediately following the `]`.
855 ///
856 /// If the stack is empty after popping, then this returns the final
857 /// "top-level" character class AST (where a "top-level" character class
858 /// is one that is not nested inside any other character class).
859 ///
860 /// If there is no corresponding opening bracket on the parser's stack,
861 /// then an error is returned.
862 fn pop_class(
863 &self,
864 nested_union: ast::ClassSetUnion,
865 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
866 assert_eq!(self.char(), ']');
867
868 let item = ast::ClassSet::Item(nested_union.into_item());
869 let prevset = self.pop_class_op(item);
870 let mut stack = self.parser().stack_class.borrow_mut();
871 match stack.pop() {
872 None => {
873 // We can never observe an empty stack:
874 //
875 // 1) We are guaranteed to start with a non-empty stack since
876 // the character class parser is only initiated when it sees
877 // a `[`.
878 // 2) If we ever observe an empty stack while popping after
879 // seeing a `]`, then we signal the character class parser
880 // to terminate.
881 panic!("unexpected empty character class stack")
882 },
883 Some(ClassState::Op { .. }) => {
884 // This panic is unfortunate, but this case is impossible
885 // since we already popped the Op state if one exists above.
886 // Namely, every push to the class parser stack is guarded by
887 // whether an existing Op is already on the top of the stack.
888 // If it is, the existing Op is modified. That is, the stack
889 // can never have consecutive Op states.
890 panic!("unexpected ClassState::Op")
891 }
892 Some(ClassState::Open { mut union, mut set }) => {
893 self.bump();
894 set.span.end = self.pos();
895 set.kind = prevset;
896 if stack.is_empty() {
897 Ok(Either::Right(ast::Class::Bracketed(set)))
898 } else {
899 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
900 Ok(Either::Left(union))
901 }
902 }
903 }
904 }
905
906 /// Return an "unclosed class" error whose span points to the most
907 /// recently opened class.
908 ///
909 /// This should only be called while parsing a character class.
910 fn unclosed_class_error(&self) -> ast::Error {
911 for state in self.parser().stack_class.borrow().iter().rev() {
912 match *state {
913 ClassState::Open { ref set, .. } => {
914 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
915 }
916 _ => {}
917 }
918 }
919 // We are guaranteed to have a non-empty stack with at least
920 // one open bracket, so we should never get here.
921 panic!("no open character class found")
922 }
923
924 /// Push the current set of class items on to the class parser's stack as
925 /// the left hand side of the given operator.
926 ///
927 /// A fresh set union is returned, which should be used to build the right
928 /// hand side of this operator.
929 fn push_class_op(
930 &self,
931 next_kind: ast::ClassSetBinaryOpKind,
932 next_union: ast::ClassSetUnion,
933 ) -> ast::ClassSetUnion {
934
935 let item = ast::ClassSet::Item(next_union.into_item());
936 let new_lhs = self.pop_class_op(item);
937 self.parser().stack_class.borrow_mut().push(ClassState::Op {
938 kind: next_kind,
939 lhs: new_lhs,
940 });
941 ast::ClassSetUnion { span: self.span(), items: vec![] }
942 }
943
944 /// Pop a character class set from the character class parser stack. If the
945 /// top of the stack is just an item (not an operation), then return the
946 /// given set unchanged. If the top of the stack is an operation, then the
947 /// given set will be used as the rhs of the operation on the top of the
948 /// stack. In that case, the binary operation is returned as a set.
949 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
950 let mut stack = self.parser().stack_class.borrow_mut();
951 let (kind, lhs) = match stack.pop() {
952 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
953 Some(state @ ClassState::Open { .. }) => {
954 stack.push(state);
955 return rhs;
956 }
957 None => unreachable!(),
958 };
959 let span = Span::new(lhs.span().start, rhs.span().end);
960 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
961 span: span,
962 kind: kind,
963 lhs: Box::new(lhs),
964 rhs: Box::new(rhs),
965 })
966 }
967}
968
969impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
970 /// Parse the regular expression into an abstract syntax tree.
971 fn parse(&self) -> Result<Ast> {
972 self.parse_with_comments().map(|astc| astc.ast)
973 }
974
975 /// Parse the regular expression and return an abstract syntax tree with
976 /// all of the comments found in the pattern.
977 fn parse_with_comments(&self) -> Result<ast::WithComments> {
978 assert_eq!(self.offset(), 0, "parser can only be used once");
979 self.parser().reset();
980 let mut concat = ast::Concat {
981 span: self.span(),
982 asts: vec![],
983 };
984 loop {
985 self.bump_space();
986 if self.is_eof() {
987 break;
988 }
989 match self.char() {
94b46f34
XL
990 '(' => concat = self.push_group(concat)?,
991 ')' => concat = self.pop_group(concat)?,
992 '|' => concat = self.push_alternate(concat)?,
0531ce1d 993 '[' => {
94b46f34 994 let class = self.parse_set_class()?;
0531ce1d
XL
995 concat.asts.push(Ast::Class(class));
996 }
997 '?' => {
94b46f34
XL
998 concat = self.parse_uncounted_repetition(
999 concat, ast::RepetitionKind::ZeroOrOne)?;
0531ce1d
XL
1000 }
1001 '*' => {
94b46f34
XL
1002 concat = self.parse_uncounted_repetition(
1003 concat, ast::RepetitionKind::ZeroOrMore)?;
0531ce1d
XL
1004 }
1005 '+' => {
94b46f34
XL
1006 concat = self.parse_uncounted_repetition(
1007 concat, ast::RepetitionKind::OneOrMore)?;
0531ce1d
XL
1008 }
1009 '{' => {
94b46f34 1010 concat = self.parse_counted_repetition(concat)?;
0531ce1d 1011 }
94b46f34 1012 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
0531ce1d
XL
1013 }
1014 }
94b46f34
XL
1015 let ast = self.pop_group_end(concat)?;
1016 NestLimiter::new(self).check(&ast)?;
0531ce1d
XL
1017 Ok(ast::WithComments {
1018 ast: ast,
1019 comments: mem::replace(
1020 &mut *self.parser().comments.borrow_mut(),
1021 vec![],
1022 ),
1023 })
1024 }
1025
1026 /// Parses an uncounted repetition operation. An uncounted repetition
1027 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1028 /// The given `kind` should correspond to the operator observed by the
1029 /// caller.
1030 ///
1031 /// This assumes that the paser is currently positioned at the repetition
1032 /// operator and advances the parser to the first character after the
1033 /// operator. (Note that the operator may include a single additional `?`,
1034 /// which makes the operator ungreedy.)
1035 ///
1036 /// The caller should include the concatenation that is being built. The
1037 /// concatenation returned includes the repetition operator applied to the
1038 /// last expression in the given concatenation.
1039 fn parse_uncounted_repetition(
1040 &self,
1041 mut concat: ast::Concat,
1042 kind: ast::RepetitionKind,
1043 ) -> Result<ast::Concat> {
1044 assert!(
1045 self.char() == '?' || self.char() == '*' || self.char() == '+');
1046 let op_start = self.pos();
1047 let ast = match concat.asts.pop() {
1048 Some(ast) => ast,
1049 None => return Err(self.error(
1050 self.span(),
1051 ast::ErrorKind::RepetitionMissing,
1052 )),
1053 };
94b46f34
XL
1054 match ast {
1055 Ast::Empty(_) | Ast::Flags(_) => return Err(self.error(
1056 self.span(),
1057 ast::ErrorKind::RepetitionMissing,
1058 )),
1059 _ => {}
1060 }
0531ce1d
XL
1061 let mut greedy = true;
1062 if self.bump() && self.char() == '?' {
1063 greedy = false;
1064 self.bump();
1065 }
1066 concat.asts.push(Ast::Repetition(ast::Repetition {
1067 span: ast.span().with_end(self.pos()),
1068 op: ast::RepetitionOp {
1069 span: Span::new(op_start, self.pos()),
1070 kind: kind,
1071 },
1072 greedy: greedy,
1073 ast: Box::new(ast),
1074 }));
1075 Ok(concat)
1076 }
1077
1078 /// Parses a counted repetition operation. A counted repetition operator
1079 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1080 /// operators.
1081 ///
1082 /// This assumes that the paser is currently positioned at the opening `{`
1083 /// and advances the parser to the first character after the operator.
1084 /// (Note that the operator may include a single additional `?`, which
1085 /// makes the operator ungreedy.)
1086 ///
1087 /// The caller should include the concatenation that is being built. The
1088 /// concatenation returned includes the repetition operator applied to the
1089 /// last expression in the given concatenation.
1090 fn parse_counted_repetition(
1091 &self,
1092 mut concat: ast::Concat,
1093 ) -> Result<ast::Concat> {
1094 assert!(self.char() == '{');
1095 let start = self.pos();
1096 let ast = match concat.asts.pop() {
1097 Some(ast) => ast,
1098 None => return Err(self.error(
1099 self.span(),
1100 ast::ErrorKind::RepetitionMissing,
1101 )),
1102 };
1103 if !self.bump_and_bump_space() {
1104 return Err(self.error(
1105 Span::new(start, self.pos()),
1106 ast::ErrorKind::RepetitionCountUnclosed,
1107 ));
1108 }
94b46f34 1109 let count_start = self.parse_decimal()?;
0531ce1d
XL
1110 let mut range = ast::RepetitionRange::Exactly(count_start);
1111 if self.is_eof() {
1112 return Err(self.error(
1113 Span::new(start, self.pos()),
1114 ast::ErrorKind::RepetitionCountUnclosed,
1115 ));
1116 }
1117 if self.char() == ',' {
1118 if !self.bump_and_bump_space() {
1119 return Err(self.error(
1120 Span::new(start, self.pos()),
1121 ast::ErrorKind::RepetitionCountUnclosed,
1122 ));
1123 }
1124 if self.char() != '}' {
94b46f34 1125 let count_end = self.parse_decimal()?;
0531ce1d
XL
1126 range = ast::RepetitionRange::Bounded(count_start, count_end);
1127 } else {
1128 range = ast::RepetitionRange::AtLeast(count_start);
1129 }
1130 }
1131 if self.is_eof() || self.char() != '}' {
1132 return Err(self.error(
1133 Span::new(start, self.pos()),
1134 ast::ErrorKind::RepetitionCountUnclosed,
1135 ));
1136 }
1137
1138 let mut greedy = true;
1139 if self.bump_and_bump_space() && self.char() == '?' {
1140 greedy = false;
1141 self.bump();
1142 }
1143
1144 let op_span = Span::new(start, self.pos());
1145 if !range.is_valid() {
1146 return Err(self.error(
1147 op_span,
1148 ast::ErrorKind::RepetitionCountInvalid,
1149 ));
1150 }
1151 concat.asts.push(Ast::Repetition(ast::Repetition {
1152 span: ast.span().with_end(self.pos()),
1153 op: ast::RepetitionOp {
1154 span: op_span,
1155 kind: ast::RepetitionKind::Range(range),
1156 },
1157 greedy: greedy,
1158 ast: Box::new(ast),
1159 }));
1160 Ok(concat)
1161 }
1162
1163 /// Parse a group (which contains a sub-expression) or a set of flags.
1164 ///
1165 /// If a group was found, then it is returned with an empty AST. If a set
1166 /// of flags is found, then that set is returned.
1167 ///
1168 /// The parser should be positioned at the opening parenthesis.
1169 ///
1170 /// This advances the parser to the character before the start of the
1171 /// sub-expression (in the case of a group) or to the closing parenthesis
1172 /// immediately following the set of flags.
1173 ///
1174 /// # Errors
1175 ///
1176 /// If flags are given and incorrectly specified, then a corresponding
1177 /// error is returned.
1178 ///
1179 /// If a capture name is given and it is incorrectly specified, then a
1180 /// corresponding error is returned.
1181 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1182 assert_eq!(self.char(), '(');
1183 let open_span = self.span_char();
1184 self.bump();
1185 self.bump_space();
1186 if self.is_lookaround_prefix() {
1187 return Err(self.error(
1188 Span::new(open_span.start, self.span().end),
1189 ast::ErrorKind::UnsupportedLookAround,
1190 ));
1191 }
1192 let inner_span = self.span();
1193 if self.bump_if("?P<") {
94b46f34
XL
1194 let capture_index = self.next_capture_index(open_span)?;
1195 let cap = self.parse_capture_name(capture_index)?;
0531ce1d
XL
1196 Ok(Either::Right(ast::Group {
1197 span: open_span,
1198 kind: ast::GroupKind::CaptureName(cap),
1199 ast: Box::new(Ast::Empty(self.span())),
1200 }))
1201 } else if self.bump_if("?") {
1202 if self.is_eof() {
1203 return Err(self.error(
1204 open_span,
1205 ast::ErrorKind::GroupUnclosed,
1206 ));
1207 }
94b46f34 1208 let flags = self.parse_flags()?;
0531ce1d
XL
1209 let char_end = self.char();
1210 self.bump();
1211 if char_end == ')' {
1212 // We don't allow empty flags, e.g., `(?)`. We instead
1213 // interpret it as a repetition operator missing its argument.
1214 if flags.items.is_empty() {
1215 return Err(self.error(
1216 inner_span,
1217 ast::ErrorKind::RepetitionMissing,
1218 ));
1219 }
1220 Ok(Either::Left(ast::SetFlags {
1221 span: Span { end: self.pos(), ..open_span },
1222 flags: flags,
1223 }))
1224 } else {
1225 assert_eq!(char_end, ':');
1226 Ok(Either::Right(ast::Group {
1227 span: open_span,
1228 kind: ast::GroupKind::NonCapturing(flags),
1229 ast: Box::new(Ast::Empty(self.span())),
1230 }))
1231 }
1232 } else {
94b46f34 1233 let capture_index = self.next_capture_index(open_span)?;
0531ce1d
XL
1234 Ok(Either::Right(ast::Group {
1235 span: open_span,
1236 kind: ast::GroupKind::CaptureIndex(capture_index),
1237 ast: Box::new(Ast::Empty(self.span())),
1238 }))
1239 }
1240 }
1241
1242 /// Parses a capture group name. Assumes that the parser is positioned at
1243 /// the first character in the name following the opening `<` (and may
1244 /// possibly be EOF). This advances the parser to the first character
1245 /// following the closing `>`.
1246 ///
1247 /// The caller must provide the capture index of the group for this name.
1248 fn parse_capture_name(
1249 &self,
1250 capture_index: u32,
1251 ) -> Result<ast::CaptureName> {
1252 if self.is_eof() {
1253 return Err(self.error(
1254 self.span(),
1255 ast::ErrorKind::GroupNameUnexpectedEof,
1256 ));
1257 }
1258 let start = self.pos();
1259 loop {
1260 if self.char() == '>' {
1261 break;
1262 }
1263 if !is_capture_char(self.char(), self.pos() == start) {
1264 return Err(self.error(
1265 self.span_char(),
1266 ast::ErrorKind::GroupNameInvalid,
1267 ));
1268 }
1269 if !self.bump() {
1270 break;
1271 }
1272 }
1273 let end = self.pos();
1274 if self.is_eof() {
1275 return Err(self.error(
1276 self.span(),
1277 ast::ErrorKind::GroupNameUnexpectedEof,
1278 ));
1279 }
1280 assert_eq!(self.char(), '>');
1281 self.bump();
1282 let name = &self.pattern()[start.offset..end.offset];
1283 if name.is_empty() {
1284 return Err(self.error(
1285 Span::new(start, start),
1286 ast::ErrorKind::GroupNameEmpty,
1287 ));
1288 }
1289 let capname = ast::CaptureName {
1290 span: Span::new(start, end),
1291 name: name.to_string(),
1292 index: capture_index,
1293 };
94b46f34 1294 self.add_capture_name(&capname)?;
0531ce1d
XL
1295 Ok(capname)
1296 }
1297
1298 /// Parse a sequence of flags starting at the current character.
1299 ///
1300 /// This advances the parser to the character immediately following the
1301 /// flags, which is guaranteed to be either `:` or `)`.
1302 ///
1303 /// # Errors
1304 ///
1305 /// If any flags are duplicated, then an error is returned.
1306 ///
1307 /// If the negation operator is used more than once, then an error is
1308 /// returned.
1309 ///
1310 /// If no flags could be found or if the negation operation is not followed
1311 /// by any flags, then an error is returned.
1312 fn parse_flags(&self) -> Result<ast::Flags> {
1313 let mut flags = ast::Flags {
1314 span: self.span(),
1315 items: vec![],
1316 };
1317 let mut last_was_negation = None;
1318 while self.char() != ':' && self.char() != ')' {
1319 if self.char() == '-' {
1320 last_was_negation = Some(self.span_char());
1321 let item = ast::FlagsItem {
1322 span: self.span_char(),
1323 kind: ast::FlagsItemKind::Negation,
1324 };
1325 if let Some(i) = flags.add_item(item) {
1326 return Err(self.error(
1327 self.span_char(),
1328 ast::ErrorKind::FlagRepeatedNegation {
1329 original: flags.items[i].span,
1330 },
1331 ));
1332 }
1333 } else {
1334 last_was_negation = None;
1335 let item = ast::FlagsItem {
1336 span: self.span_char(),
94b46f34 1337 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
0531ce1d
XL
1338 };
1339 if let Some(i) = flags.add_item(item) {
1340 return Err(self.error(
1341 self.span_char(),
1342 ast::ErrorKind::FlagDuplicate {
1343 original: flags.items[i].span,
1344 },
1345 ));
1346 }
1347 }
1348 if !self.bump() {
1349 return Err(self.error(
1350 self.span(),
1351 ast::ErrorKind::FlagUnexpectedEof,
1352 ));
1353 }
1354 }
1355 if let Some(span) = last_was_negation {
1356 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1357 }
1358 flags.span.end = self.pos();
1359 Ok(flags)
1360 }
1361
1362 /// Parse the current character as a flag. Do not advance the parser.
1363 ///
1364 /// # Errors
1365 ///
1366 /// If the flag is not recognized, then an error is returned.
1367 fn parse_flag(&self) -> Result<ast::Flag> {
1368 match self.char() {
1369 'i' => Ok(ast::Flag::CaseInsensitive),
1370 'm' => Ok(ast::Flag::MultiLine),
1371 's' => Ok(ast::Flag::DotMatchesNewLine),
1372 'U' => Ok(ast::Flag::SwapGreed),
1373 'u' => Ok(ast::Flag::Unicode),
1374 'x' => Ok(ast::Flag::IgnoreWhitespace),
1375 _ => Err(self.error(
1376 self.span_char(),
1377 ast::ErrorKind::FlagUnrecognized,
1378 )),
1379 }
1380 }
1381
1382 /// Parse a primitive AST. e.g., A literal, non-set character class or
1383 /// assertion.
1384 ///
1385 /// This assumes that the parser expects a primitive at the current
1386 /// location. i.e., All other non-primitive cases have been handled.
1387 /// For example, if the parser's position is at `|`, then `|` will be
1388 /// treated as a literal (e.g., inside a character class).
1389 ///
1390 /// This advances the parser to the first character immediately following
1391 /// the primitive.
1392 fn parse_primitive(&self) -> Result<Primitive> {
1393 match self.char() {
1394 '\\' => self.parse_escape(),
1395 '.' => {
1396 let ast = Primitive::Dot(self.span_char());
1397 self.bump();
1398 Ok(ast)
1399 }
1400 '^' => {
1401 let ast = Primitive::Assertion(ast::Assertion {
1402 span: self.span_char(),
1403 kind: ast::AssertionKind::StartLine,
1404 });
1405 self.bump();
1406 Ok(ast)
1407 }
1408 '$' => {
1409 let ast = Primitive::Assertion(ast::Assertion {
1410 span: self.span_char(),
1411 kind: ast::AssertionKind::EndLine,
1412 });
1413 self.bump();
1414 Ok(ast)
1415 }
1416 c => {
1417 let ast = Primitive::Literal(ast::Literal {
1418 span: self.span_char(),
1419 kind: ast::LiteralKind::Verbatim,
1420 c: c,
1421 });
1422 self.bump();
1423 Ok(ast)
1424 }
1425 }
1426 }
1427
1428 /// Parse an escape sequence as a primitive AST.
1429 ///
1430 /// This assumes the parser is positioned at the start of the escape
1431 /// sequence, i.e., `\`. It advances the parser to the first position
1432 /// immediately following the escape sequence.
1433 fn parse_escape(&self) -> Result<Primitive> {
1434 assert_eq!(self.char(), '\\');
1435 let start = self.pos();
1436 if !self.bump() {
1437 return Err(self.error(
1438 Span::new(start, self.pos()),
1439 ast::ErrorKind::EscapeUnexpectedEof,
1440 ));
1441 }
1442 let c = self.char();
1443 // Put some of the more complicated routines into helpers.
1444 match c {
1445 '0'...'7' => {
1446 if !self.parser().octal {
1447 return Err(self.error(
1448 Span::new(start, self.span_char().end),
1449 ast::ErrorKind::UnsupportedBackreference,
1450 ));
1451 }
1452 let mut lit = self.parse_octal();
1453 lit.span.start = start;
1454 return Ok(Primitive::Literal(lit));
1455 }
1456 '8'...'9' if !self.parser().octal => {
1457 return Err(self.error(
1458 Span::new(start, self.span_char().end),
1459 ast::ErrorKind::UnsupportedBackreference,
1460 ));
1461 }
1462 'x' | 'u' | 'U' => {
94b46f34 1463 let mut lit = self.parse_hex()?;
0531ce1d
XL
1464 lit.span.start = start;
1465 return Ok(Primitive::Literal(lit));
1466 }
1467 'p' | 'P' => {
94b46f34 1468 let mut cls = self.parse_unicode_class()?;
0531ce1d
XL
1469 cls.span.start = start;
1470 return Ok(Primitive::Unicode(cls));
1471 }
1472 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1473 let mut cls = self.parse_perl_class();
1474 cls.span.start = start;
1475 return Ok(Primitive::Perl(cls));
1476 }
1477 _ => {}
1478 }
1479
1480 // Handle all of the one letter sequences inline.
1481 self.bump();
1482 let span = Span::new(start, self.pos());
1483 if is_meta_character(c) {
1484 return Ok(Primitive::Literal(ast::Literal {
1485 span: span,
1486 kind: ast::LiteralKind::Punctuation,
1487 c: c,
1488 }));
1489 }
1490 let special = |kind, c| Ok(Primitive::Literal(ast::Literal {
1491 span: span,
1492 kind: ast::LiteralKind::Special(kind),
1493 c: c,
1494 }));
1495 match c {
1496 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1497 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1498 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1499 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1500 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1501 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1502 ' ' if self.ignore_whitespace() => {
1503 special(ast::SpecialLiteralKind::Space, ' ')
1504 }
1505 'A' => Ok(Primitive::Assertion(ast::Assertion {
1506 span: span,
1507 kind: ast::AssertionKind::StartText,
1508 })),
1509 'z' => Ok(Primitive::Assertion(ast::Assertion {
1510 span: span,
1511 kind: ast::AssertionKind::EndText,
1512 })),
1513 'b' => Ok(Primitive::Assertion(ast::Assertion {
1514 span: span,
1515 kind: ast::AssertionKind::WordBoundary,
1516 })),
1517 'B' => Ok(Primitive::Assertion(ast::Assertion {
1518 span: span,
1519 kind: ast::AssertionKind::NotWordBoundary,
1520 })),
1521 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1522 }
1523 }
1524
1525 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1526 /// long. This expects the parser to be positioned at the first octal
1527 /// digit and advances the parser to the first character immediately
1528 /// following the octal number. This also assumes that parsing octal
1529 /// escapes is enabled.
1530 ///
1531 /// Assuming the preconditions are met, this routine can never fail.
1532 fn parse_octal(&self) -> ast::Literal {
1533 use std::char;
1534 use std::u32;
1535
1536 assert!(self.parser().octal);
1537 assert!('0' <= self.char() && self.char() <= '7');
1538 let start = self.pos();
1539 // Parse up to two more digits.
1540 while
1541 self.bump() &&
1542 '0' <= self.char() && self.char() <= '7' &&
1543 self.pos().offset - start.offset <= 2
1544 {}
1545 let end = self.pos();
1546 let octal = &self.pattern()[start.offset..end.offset];
1547 // Parsing the octal should never fail since the above guarantees a
1548 // valid number.
1549 let codepoint =
1550 u32::from_str_radix(octal, 8).expect("valid octal number");
1551 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1552 // invalid Unicode scalar values.
1553 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1554 ast::Literal {
1555 span: Span::new(start, end),
1556 kind: ast::LiteralKind::Octal,
1557 c: c,
1558 }
1559 }
1560
1561 /// Parse a hex representation of a Unicode codepoint. This handles both
1562 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1563 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1564 /// the first character immediately following the hexadecimal literal.
1565 fn parse_hex(&self) -> Result<ast::Literal> {
1566 assert!(self.char() == 'x'
1567 || self.char() == 'u'
1568 || self.char() == 'U');
1569
1570 let hex_kind = match self.char() {
1571 'x' => ast::HexLiteralKind::X,
1572 'u' => ast::HexLiteralKind::UnicodeShort,
1573 _ => ast::HexLiteralKind::UnicodeLong,
1574 };
1575 if !self.bump_and_bump_space() {
1576 return Err(self.error(
1577 self.span(),
1578 ast::ErrorKind::EscapeUnexpectedEof,
1579 ));
1580 }
1581 if self.char() == '{' {
1582 self.parse_hex_brace(hex_kind)
1583 } else {
1584 self.parse_hex_digits(hex_kind)
1585 }
1586 }
1587
1588 /// Parse an N-digit hex representation of a Unicode codepoint. This
1589 /// expects the parser to be positioned at the first digit and will advance
1590 /// the parser to the first character immediately following the escape
1591 /// sequence.
1592 ///
1593 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1594 /// or 8 (for `\UNNNNNNNN`).
1595 fn parse_hex_digits(
1596 &self,
1597 kind: ast::HexLiteralKind,
1598 ) -> Result<ast::Literal> {
1599 use std::char;
1600 use std::u32;
1601
1602 let mut scratch = self.parser().scratch.borrow_mut();
1603 scratch.clear();
1604
1605 let start = self.pos();
1606 for i in 0..kind.digits() {
1607 if i > 0 && !self.bump_and_bump_space() {
1608 return Err(self.error(
1609 self.span(),
1610 ast::ErrorKind::EscapeUnexpectedEof,
1611 ));
1612 }
1613 if !is_hex(self.char()) {
1614 return Err(self.error(
1615 self.span_char(),
1616 ast::ErrorKind::EscapeHexInvalidDigit,
1617 ));
1618 }
1619 scratch.push(self.char());
1620 }
1621 // The final bump just moves the parser past the literal, which may
1622 // be EOF.
1623 self.bump_and_bump_space();
1624 let end = self.pos();
1625 let hex = scratch.as_str();
1626 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1627 None => Err(self.error(
1628 Span::new(start, end),
1629 ast::ErrorKind::EscapeHexInvalid,
1630 )),
1631 Some(c) => Ok(ast::Literal {
1632 span: Span::new(start, end),
1633 kind: ast::LiteralKind::HexFixed(kind),
1634 c: c,
1635 }),
1636 }
1637 }
1638
1639 /// Parse a hex representation of any Unicode scalar value. This expects
1640 /// the parser to be positioned at the opening brace `{` and will advance
1641 /// the parser to the first character following the closing brace `}`.
1642 fn parse_hex_brace(
1643 &self,
1644 kind: ast::HexLiteralKind,
1645 ) -> Result<ast::Literal> {
1646 use std::char;
1647 use std::u32;
1648
1649 let mut scratch = self.parser().scratch.borrow_mut();
1650 scratch.clear();
1651
1652 let brace_pos = self.pos();
1653 let start = self.span_char().end;
1654 while self.bump_and_bump_space() && self.char() != '}' {
1655 if !is_hex(self.char()) {
1656 return Err(self.error(
1657 self.span_char(),
1658 ast::ErrorKind::EscapeHexInvalidDigit,
1659 ));
1660 }
1661 scratch.push(self.char());
1662 }
1663 if self.is_eof() {
1664 return Err(self.error(
1665 Span::new(brace_pos, self.pos()),
1666 ast::ErrorKind::EscapeUnexpectedEof,
1667 ));
1668 }
1669 let end = self.pos();
1670 let hex = scratch.as_str();
1671 assert_eq!(self.char(), '}');
1672 self.bump_and_bump_space();
1673
1674 if hex.is_empty() {
1675 return Err(self.error(
1676 Span::new(brace_pos, self.pos()),
1677 ast::ErrorKind::EscapeHexEmpty,
1678 ));
1679 }
1680 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1681 None => Err(self.error(
1682 Span::new(start, end),
1683 ast::ErrorKind::EscapeHexInvalid,
1684 )),
1685 Some(c) => Ok(ast::Literal {
1686 span: Span::new(start, self.pos()),
1687 kind: ast::LiteralKind::HexBrace(kind),
1688 c: c,
1689 }),
1690 }
1691 }
1692
1693 /// Parse a decimal number into a u32 while trimming leading and trailing
1694 /// whitespace.
1695 ///
1696 /// This expects the parser to be positioned at the first position where
1697 /// a decimal digit could occur. This will advance the parser to the byte
1698 /// immediately following the last contiguous decimal digit.
1699 ///
1700 /// If no decimal digit could be found or if there was a problem parsing
1701 /// the complete set of digits into a u32, then an error is returned.
1702 fn parse_decimal(&self) -> Result<u32> {
1703 let mut scratch = self.parser().scratch.borrow_mut();
1704 scratch.clear();
1705
1706 while !self.is_eof() && self.char().is_whitespace() {
1707 self.bump();
1708 }
1709 let start = self.pos();
1710 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1711 scratch.push(self.char());
1712 self.bump_and_bump_space();
1713 }
1714 let span = Span::new(start, self.pos());
1715 while !self.is_eof() && self.char().is_whitespace() {
1716 self.bump_and_bump_space();
1717 }
1718 let digits = scratch.as_str();
1719 if digits.is_empty() {
1720 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1721 }
1722 match u32::from_str_radix(digits, 10).ok() {
1723 Some(n) => Ok(n),
1724 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1725 }
1726 }
1727
1728 /// Parse a standard character class consisting primarily of characters or
1729 /// character ranges, but can also contain nested character classes of
1730 /// any type (sans `.`).
1731 ///
1732 /// This assumes the parser is positioned at the opening `[`. If parsing
1733 /// is successful, then the parser is advanced to the position immediately
1734 /// following the closing `]`.
1735 fn parse_set_class(&self) -> Result<ast::Class> {
1736 assert_eq!(self.char(), '[');
1737
1738 let mut union = ast::ClassSetUnion {
1739 span: self.span(),
1740 items: vec![],
1741 };
1742 loop {
1743 self.bump_space();
1744 if self.is_eof() {
1745 return Err(self.unclosed_class_error());
1746 }
1747 match self.char() {
1748 '[' => {
1749 // If we've already parsed the opening bracket, then
1750 // attempt to treat this as the beginning of an ASCII
1751 // class. If ASCII class parsing fails, then the parser
1752 // backs up to `[`.
1753 if !self.parser().stack_class.borrow().is_empty() {
1754 if let Some(cls) = self.maybe_parse_ascii_class() {
1755 union.push(ast::ClassSetItem::Ascii(cls));
1756 continue;
1757 }
1758 }
94b46f34 1759 union = self.push_class_open(union)?;
0531ce1d
XL
1760 }
1761 ']' => {
94b46f34 1762 match self.pop_class(union)? {
0531ce1d
XL
1763 Either::Left(nested_union) => { union = nested_union; }
1764 Either::Right(class) => return Ok(class),
1765 }
1766 }
1767 '&' if self.peek() == Some('&') => {
1768 assert!(self.bump_if("&&"));
1769 union = self.push_class_op(
1770 ast::ClassSetBinaryOpKind::Intersection, union);
1771 }
1772 '-' if self.peek() == Some('-') => {
1773 assert!(self.bump_if("--"));
1774 union = self.push_class_op(
1775 ast::ClassSetBinaryOpKind::Difference, union);
1776 }
1777 '~' if self.peek() == Some('~') => {
1778 assert!(self.bump_if("~~"));
1779 union = self.push_class_op(
1780 ast::ClassSetBinaryOpKind::SymmetricDifference, union);
1781 }
1782 _ => {
94b46f34 1783 union.push(self.parse_set_class_range()?);
0531ce1d
XL
1784 }
1785 }
1786 }
1787 }
1788
1789 /// Parse a single primitive item in a character class set. The item to
1790 /// be parsed can either be one of a simple literal character, a range
1791 /// between two simple literal characters or a "primitive" character
1792 /// class like \w or \p{Greek}.
1793 ///
1794 /// If an invalid escape is found, or if a character class is found where
1795 /// a simple literal is expected (e.g., in a range), then an error is
1796 /// returned.
1797 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
94b46f34 1798 let prim1 = self.parse_set_class_item()?;
0531ce1d
XL
1799 self.bump_space();
1800 if self.is_eof() {
1801 return Err(self.unclosed_class_error());
1802 }
1803 // If the next char isn't a `-`, then we don't have a range.
1804 // There are two exceptions. If the char after a `-` is a `]`, then
1805 // `-` is interpreted as a literal `-`. Alternatively, if the char
1806 // after a `-` is a `-`, then `--` corresponds to a "difference"
1807 // operation.
1808 if self.char() != '-'
1809 || self.peek_space() == Some(']')
1810 || self.peek_space() == Some('-')
1811 {
1812 return prim1.into_class_set_item(self);
1813 }
1814 // OK, now we're parsing a range, so bump past the `-` and parse the
1815 // second half of the range.
1816 if !self.bump_and_bump_space() {
1817 return Err(self.unclosed_class_error());
1818 }
94b46f34 1819 let prim2 = self.parse_set_class_item()?;
0531ce1d
XL
1820 let range = ast::ClassSetRange {
1821 span: Span::new(prim1.span().start, prim2.span().end),
94b46f34
XL
1822 start: prim1.into_class_literal(self)?,
1823 end: prim2.into_class_literal(self)?,
0531ce1d
XL
1824 };
1825 if !range.is_valid() {
1826 return Err(self.error(
1827 range.span,
1828 ast::ErrorKind::ClassRangeInvalid,
1829 ));
1830 }
1831 Ok(ast::ClassSetItem::Range(range))
1832 }
1833
1834 /// Parse a single item in a character class as a primitive, where the
1835 /// primitive either consists of a verbatim literal or a single escape
1836 /// sequence.
1837 ///
1838 /// This assumes the parser is positioned at the beginning of a primitive,
1839 /// and advances the parser to the first position after the primitive if
1840 /// successful.
1841 ///
1842 /// Note that it is the caller's responsibility to report an error if an
1843 /// illegal primitive was parsed.
1844 fn parse_set_class_item(&self) -> Result<Primitive> {
1845 if self.char() == '\\' {
1846 self.parse_escape()
1847 } else {
1848 let x = Primitive::Literal(ast::Literal {
1849 span: self.span_char(),
1850 kind: ast::LiteralKind::Verbatim,
1851 c: self.char(),
1852 });
1853 self.bump();
1854 Ok(x)
1855 }
1856 }
1857
1858 /// Parses the opening of a character class set. This includes the opening
1859 /// bracket along with `^` if present to indicate negation. This also
1860 /// starts parsing the opening set of unioned items if applicable, since
1861 /// there are special rules applied to certain characters in the opening
1862 /// of a character class. For example, `[^]]` is the class of all
1863 /// characters not equal to `]`. (`]` would need to be escaped in any other
1864 /// position.) Similarly for `-`.
1865 ///
1866 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1867 /// empty union. This empty union should be replaced with the actual item
1868 /// when it is popped from the parser's stack.
1869 ///
1870 /// This assumes the parser is positioned at the opening `[` and advances
1871 /// the parser to the first non-special byte of the character class.
1872 ///
1873 /// An error is returned if EOF is found.
1874 fn parse_set_class_open(
1875 &self,
1876 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1877 assert_eq!(self.char(), '[');
1878 let start = self.pos();
1879 if !self.bump_and_bump_space() {
1880 return Err(self.error(
1881 Span::new(start, self.pos()),
1882 ast::ErrorKind::ClassUnclosed,
1883 ));
1884 }
1885
1886 let negated =
1887 if self.char() != '^' {
1888 false
1889 } else {
1890 if !self.bump_and_bump_space() {
1891 return Err(self.error(
1892 Span::new(start, self.pos()),
1893 ast::ErrorKind::ClassUnclosed,
1894 ));
1895 }
1896 true
1897 };
1898 // Accept any number of `-` as literal `-`.
1899 let mut union = ast::ClassSetUnion {
1900 span: self.span(),
1901 items: vec![],
1902 };
1903 while self.char() == '-' {
1904 union.push(ast::ClassSetItem::Literal(ast::Literal {
1905 span: self.span_char(),
1906 kind: ast::LiteralKind::Verbatim,
1907 c: '-',
1908 }));
1909 if !self.bump_and_bump_space() {
1910 return Err(self.error(
1911 Span::new(start, self.pos()),
1912 ast::ErrorKind::ClassUnclosed,
1913 ));
1914 }
1915 }
1916 // If `]` is the *first* char in a set, then interpret it as a literal
1917 // `]`. That is, an empty class is impossible to write.
1918 if union.items.is_empty() && self.char() == ']' {
1919 union.push(ast::ClassSetItem::Literal(ast::Literal {
1920 span: self.span_char(),
1921 kind: ast::LiteralKind::Verbatim,
1922 c: ']',
1923 }));
1924 if !self.bump_and_bump_space() {
1925 return Err(self.error(
1926 Span::new(start, self.pos()),
1927 ast::ErrorKind::ClassUnclosed,
1928 ));
1929 }
1930 }
1931 let set = ast::ClassBracketed {
1932 span: Span::new(start, self.pos()),
1933 negated: negated,
1934 kind: ast::ClassSet::union(ast::ClassSetUnion {
1935 span: Span::new(union.span.start, union.span.start),
1936 items: vec![],
1937 }),
1938 };
1939 Ok((set, union))
1940 }
1941
1942 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1943 ///
1944 /// This assumes the parser is positioned at the opening `[`.
1945 ///
1946 /// If no valid ASCII character class could be found, then this does not
1947 /// advance the parser and `None` is returned. Otherwise, the parser is
1948 /// advanced to the first byte following the closing `]` and the
1949 /// corresponding ASCII class is returned.
1950 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1951 // ASCII character classes are interesting from a parsing perspective
1952 // because parsing cannot fail with any interesting error. For example,
1953 // in order to use an ASCII character class, it must be enclosed in
1954 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1955 // of it as "ASCII character characters have the syntax `[:NAME:]`
1956 // which can only appear within character brackets." This means that
1957 // things like `[[:lower:]A]` are legal constructs.
1958 //
1959 // However, if one types an incorrect ASCII character class, e.g.,
1960 // `[[:loower:]]`, then we treat that as a normal nested character
1961 // class containing the characters `:elorw`. One might argue that we
1962 // should return an error instead since the repeated colons give away
1963 // the intent to write an ASCII class. But what if the user typed
1964 // `[[:lower]]` instead? How can we tell that was intended to be an
1965 // ASCII class and not just a normal nested class?
1966 //
1967 // Reasonable people can probably disagree over this, but for better
1968 // or worse, we implement semantics that never fails at the expense
1969 // of better failure modes.
1970 assert_eq!(self.char(), '[');
1971 // If parsing fails, then we back up the parser to this starting point.
1972 let start = self.pos();
1973 let mut negated = false;
1974 if !self.bump() || self.char() != ':' {
1975 self.parser().pos.set(start);
1976 return None;
1977 }
1978 if !self.bump() {
1979 self.parser().pos.set(start);
1980 return None;
1981 }
1982 if self.char() == '^' {
1983 negated = true;
1984 if !self.bump() {
1985 self.parser().pos.set(start);
1986 return None;
1987 }
1988 }
1989 let name_start = self.offset();
1990 while self.char() != ':' && self.bump() {}
1991 if self.is_eof() {
1992 self.parser().pos.set(start);
1993 return None;
1994 }
1995 let name = &self.pattern()[name_start..self.offset()];
1996 if !self.bump_if(":]") {
1997 self.parser().pos.set(start);
1998 return None;
1999 }
2000 let kind = match ast::ClassAsciiKind::from_name(name) {
2001 Some(kind) => kind,
2002 None => {
2003 self.parser().pos.set(start);
2004 return None;
2005 }
2006 };
2007 Some(ast::ClassAscii {
2008 span: Span::new(start, self.pos()),
2009 kind: kind,
2010 negated: negated,
2011 })
2012 }
2013
2014 /// Parse a Unicode class in either the single character notation, `\pN`
2015 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2016 /// the parser is positioned at the `p` (or `P` for negation) and will
2017 /// advance the parser to the character immediately following the class.
2018 ///
2019 /// Note that this does not check whether the class name is valid or not.
2020 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2021 assert!(self.char() == 'p' || self.char() == 'P');
2022
2023 let mut scratch = self.parser().scratch.borrow_mut();
2024 scratch.clear();
2025
2026 let negated = self.char() == 'P';
2027 if !self.bump_and_bump_space() {
2028 return Err(self.error(
2029 self.span(),
2030 ast::ErrorKind::EscapeUnexpectedEof,
2031 ));
2032 }
2033 let (start, kind) =
2034 if self.char() == '{' {
2035 let start = self.span_char().end;
2036 while self.bump_and_bump_space() && self.char() != '}' {
2037 scratch.push(self.char());
2038 }
2039 if self.is_eof() {
2040 return Err(self.error(
2041 self.span(),
2042 ast::ErrorKind::EscapeUnexpectedEof,
2043 ));
2044 }
2045 assert_eq!(self.char(), '}');
2046 self.bump();
2047
2048 let name = scratch.as_str();
2049 if let Some(i) = name.find("!=") {
2050 (start, ast::ClassUnicodeKind::NamedValue {
2051 op: ast::ClassUnicodeOpKind::NotEqual,
2052 name: name[..i].to_string(),
2053 value: name[i+2..].to_string(),
2054 })
2055 } else if let Some(i) = name.find(':') {
2056 (start, ast::ClassUnicodeKind::NamedValue {
2057 op: ast::ClassUnicodeOpKind::Colon,
2058 name: name[..i].to_string(),
2059 value: name[i+1..].to_string(),
2060 })
2061 } else if let Some(i) = name.find('=') {
2062 (start, ast::ClassUnicodeKind::NamedValue {
2063 op: ast::ClassUnicodeOpKind::Equal,
2064 name: name[..i].to_string(),
2065 value: name[i+1..].to_string(),
2066 })
2067 } else {
2068 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2069 }
2070 } else {
2071 let start = self.pos();
2072 let c = self.char();
2073 self.bump_and_bump_space();
2074 let kind = ast::ClassUnicodeKind::OneLetter(c);
2075 (start, kind)
2076 };
2077 Ok(ast::ClassUnicode {
2078 span: Span::new(start, self.pos()),
2079 negated: negated,
2080 kind: kind,
2081 })
2082 }
2083
2084 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2085 /// parser is currently at a valid character class name and will be
2086 /// advanced to the character immediately following the class.
2087 fn parse_perl_class(&self) -> ast::ClassPerl {
2088 let c = self.char();
2089 let span = self.span_char();
2090 self.bump();
2091 let (negated, kind) = match c {
2092 'd' => (false, ast::ClassPerlKind::Digit),
2093 'D' => (true, ast::ClassPerlKind::Digit),
2094 's' => (false, ast::ClassPerlKind::Space),
2095 'S' => (true, ast::ClassPerlKind::Space),
2096 'w' => (false, ast::ClassPerlKind::Word),
2097 'W' => (true, ast::ClassPerlKind::Word),
2098 c => panic!("expected valid Perl class but got '{}'", c),
2099 };
2100 ast::ClassPerl { span: span, kind: kind, negated: negated }
2101 }
2102}
2103
2104/// A type that traverses a fully parsed Ast and checks whether its depth
2105/// exceeds the specified nesting limit. If it does, then an error is returned.
2106#[derive(Debug)]
2107struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2108 /// The parser that is checking the nest limit.
2109 p: &'p ParserI<'s, P>,
2110 /// The current depth while walking an Ast.
2111 depth: u32,
2112}
2113
2114impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2115 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2116 NestLimiter { p: p, depth: 0 }
2117 }
2118
2119 fn check(self, ast: &Ast) -> Result<()> {
2120 ast::visit(ast, self)
2121 }
2122
2123 fn increment_depth(&mut self, span: &Span) -> Result<()> {
94b46f34 2124 let new = self.depth.checked_add(1).ok_or_else(|| self.p.error(
0531ce1d
XL
2125 span.clone(),
2126 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
94b46f34 2127 ))?;
0531ce1d
XL
2128 let limit = self.p.parser().nest_limit;
2129 if new > limit {
2130 return Err(self.p.error(
2131 span.clone(),
2132 ast::ErrorKind::NestLimitExceeded(limit),
2133 ));
2134 }
2135 self.depth = new;
2136 Ok(())
2137 }
2138
2139 fn decrement_depth(&mut self) {
2140 // Assuming the correctness of the visitor, this should never drop
2141 // below 0.
2142 self.depth = self.depth.checked_sub(1).unwrap();
2143 }
2144}
2145
2146impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2147 type Output = ();
2148 type Err = ast::Error;
2149
2150 fn finish(self) -> Result<()> {
2151 Ok(())
2152 }
2153
2154 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2155 let span = match *ast {
2156 Ast::Empty(_)
2157 | Ast::Flags(_)
2158 | Ast::Literal(_)
2159 | Ast::Dot(_)
2160 | Ast::Assertion(_)
2161 | Ast::Class(ast::Class::Unicode(_))
2162 | Ast::Class(ast::Class::Perl(_)) => {
2163 // These are all base cases, so we don't increment depth.
2164 return Ok(());
2165 }
2166 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2167 Ast::Repetition(ref x) => &x.span,
2168 Ast::Group(ref x) => &x.span,
2169 Ast::Alternation(ref x) => &x.span,
2170 Ast::Concat(ref x) => &x.span,
2171 };
2172 self.increment_depth(span)
2173 }
2174
2175 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2176 match *ast {
2177 Ast::Empty(_)
2178 | Ast::Flags(_)
2179 | Ast::Literal(_)
2180 | Ast::Dot(_)
2181 | Ast::Assertion(_)
2182 | Ast::Class(ast::Class::Unicode(_))
2183 | Ast::Class(ast::Class::Perl(_)) => {
2184 // These are all base cases, so we don't decrement depth.
2185 Ok(())
2186 }
2187 Ast::Class(ast::Class::Bracketed(_))
2188 | Ast::Repetition(_)
2189 | Ast::Group(_)
2190 | Ast::Alternation(_)
2191 | Ast::Concat(_) => {
2192 self.decrement_depth();
2193 Ok(())
2194 }
2195 }
2196 }
2197
2198 fn visit_class_set_item_pre(
2199 &mut self,
2200 ast: &ast::ClassSetItem,
2201 ) -> Result<()> {
2202 let span = match *ast {
2203 ast::ClassSetItem::Empty(_)
2204 | ast::ClassSetItem::Literal(_)
2205 | ast::ClassSetItem::Range(_)
2206 | ast::ClassSetItem::Ascii(_)
2207 | ast::ClassSetItem::Unicode(_)
2208 | ast::ClassSetItem::Perl(_) => {
2209 // These are all base cases, so we don't increment depth.
2210 return Ok(());
2211 }
2212 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2213 ast::ClassSetItem::Union(ref x) => &x.span,
2214 };
2215 self.increment_depth(span)
2216 }
2217
2218 fn visit_class_set_item_post(
2219 &mut self,
2220 ast: &ast::ClassSetItem,
2221 ) -> Result<()> {
2222 match *ast {
2223 ast::ClassSetItem::Empty(_)
2224 | ast::ClassSetItem::Literal(_)
2225 | ast::ClassSetItem::Range(_)
2226 | ast::ClassSetItem::Ascii(_)
2227 | ast::ClassSetItem::Unicode(_)
2228 | ast::ClassSetItem::Perl(_) => {
2229 // These are all base cases, so we don't decrement depth.
2230 Ok(())
2231 }
2232 ast::ClassSetItem::Bracketed(_)
2233 | ast::ClassSetItem::Union(_) => {
2234 self.decrement_depth();
2235 Ok(())
2236 }
2237 }
2238 }
2239
2240 fn visit_class_set_binary_op_pre(
2241 &mut self,
2242 ast: &ast::ClassSetBinaryOp,
2243 ) -> Result<()> {
2244 self.increment_depth(&ast.span)
2245 }
2246
2247 fn visit_class_set_binary_op_post(
2248 &mut self,
2249 _ast: &ast::ClassSetBinaryOp,
2250 ) -> Result<()> {
2251 self.decrement_depth();
2252 Ok(())
2253 }
2254}
2255
2256#[cfg(test)]
2257mod tests {
2258 use std::ops::Range;
2259
2260 use ast::{self, Ast, Position, Span};
2261 use super::{Parser, ParserI, ParserBuilder, Primitive};
2262
2263 // Our own assert_eq, which has slightly better formatting (but honestly
2264 // still kind of crappy).
2265 macro_rules! assert_eq {
2266 ($left:expr, $right:expr) => ({
2267 match (&$left, &$right) {
2268 (left_val, right_val) => {
2269 if !(*left_val == *right_val) {
2270 panic!("assertion failed: `(left == right)`\n\n\
2271 left: `{:?}`\nright: `{:?}`\n\n",
2272 left_val, right_val)
2273 }
2274 }
2275 }
2276 });
2277 }
2278
2279 // We create these errors to compare with real ast::Errors in the tests.
2280 // We define equality between TestError and ast::Error to disregard the
2281 // pattern string in ast::Error, which is annoying to provide in tests.
2282 #[derive(Clone, Debug)]
2283 struct TestError {
2284 span: Span,
2285 kind: ast::ErrorKind,
2286 }
2287
2288 impl PartialEq<ast::Error> for TestError {
2289 fn eq(&self, other: &ast::Error) -> bool {
2290 self.span == other.span && self.kind == other.kind
2291 }
2292 }
2293
2294 impl PartialEq<TestError> for ast::Error {
2295 fn eq(&self, other: &TestError) -> bool {
2296 self.span == other.span && self.kind == other.kind
2297 }
2298 }
2299
2300 fn s(str: &str) -> String {
2301 str.to_string()
2302 }
2303
2304 fn parser(pattern: &str) -> ParserI<Parser> {
2305 ParserI::new(Parser::new(), pattern)
2306 }
2307
2308 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2309 let parser = ParserBuilder::new().octal(true).build();
2310 ParserI::new(parser, pattern)
2311 }
2312
2313 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2314 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2315 ParserI::new(p, pattern)
2316 }
2317
2318 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2319 let p = ParserBuilder::new().ignore_whitespace(true).build();
2320 ParserI::new(p, pattern)
2321 }
2322
2323 /// Short alias for creating a new span.
2324 fn nspan(start: Position, end: Position) -> Span {
2325 Span::new(start, end)
2326 }
2327
2328 /// Short alias for creating a new position.
2329 fn npos(offset: usize, line: usize, column: usize) -> Position {
2330 Position::new(offset, line, column)
2331 }
2332
2333 /// Create a new span from the given offset range. This assumes a single
2334 /// line and sets the columns based on the offsets. i.e., This only works
2335 /// out of the box for ASCII, which is fine for most tests.
2336 fn span(range: Range<usize>) -> Span {
2337 let start = Position::new(range.start, 1, range.start + 1);
2338 let end = Position::new(range.end, 1, range.end + 1);
2339 Span::new(start, end)
2340 }
2341
2342 /// Create a new span for the corresponding byte range in the given string.
2343 fn span_range(subject: &str, range: Range<usize>) -> Span {
2344 let start = Position {
2345 offset: range.start,
2346 line: 1 + subject[..range.start].matches('\n').count(),
2347 column: 1 + subject[..range.start]
2348 .chars()
2349 .rev()
2350 .position(|c| c == '\n')
2351 .unwrap_or(subject[..range.start].chars().count()),
2352 };
2353 let end = Position {
2354 offset: range.end,
2355 line: 1 + subject[..range.end].matches('\n').count(),
2356 column: 1 + subject[..range.end]
2357 .chars()
2358 .rev()
2359 .position(|c| c == '\n')
2360 .unwrap_or(subject[..range.end].chars().count()),
2361 };
2362 Span::new(start, end)
2363 }
2364
2365 /// Create a verbatim literal starting at the given position.
2366 fn lit(c: char, start: usize) -> Ast {
2367 lit_with(c, span(start..start + c.len_utf8()))
2368 }
2369
2370 /// Create a punctuation literal starting at the given position.
2371 fn punct_lit(c: char, span: Span) -> Ast {
2372 Ast::Literal(ast::Literal {
2373 span: span,
2374 kind: ast::LiteralKind::Punctuation,
2375 c: c,
2376 })
2377 }
2378
2379 /// Create a verbatim literal with the given span.
2380 fn lit_with(c: char, span: Span) -> Ast {
2381 Ast::Literal(ast::Literal {
2382 span: span,
2383 kind: ast::LiteralKind::Verbatim,
2384 c: c,
2385 })
2386 }
2387
2388 /// Create a concatenation with the given range.
2389 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2390 concat_with(span(range), asts)
2391 }
2392
2393 /// Create a concatenation with the given span.
2394 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2395 Ast::Concat(ast::Concat { span: span, asts: asts })
2396 }
2397
2398 /// Create an alternation with the given span.
2399 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2400 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2401 }
2402
2403 /// Create a capturing group with the given span.
2404 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2405 Ast::Group(ast::Group {
2406 span: span(range),
2407 kind: ast::GroupKind::CaptureIndex(index),
2408 ast: Box::new(ast),
2409 })
2410 }
2411
2412 /// Create an ast::SetFlags.
2413 ///
2414 /// The given pattern should be the full pattern string. The range given
2415 /// should correspond to the byte offsets where the flag set occurs.
2416 ///
2417 /// If negated is true, then the set is interpreted as beginning with a
2418 /// negation.
2419 fn flag_set(
2420 pat: &str,
2421 range: Range<usize>,
2422 flag: ast::Flag,
2423 negated: bool,
2424 ) -> Ast {
2425 let mut items = vec![
2426 ast::FlagsItem {
2427 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2428 kind: ast::FlagsItemKind::Flag(flag),
2429 },
2430 ];
2431 if negated {
2432 items.insert(0, ast::FlagsItem {
2433 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2434 kind: ast::FlagsItemKind::Negation,
2435 });
2436 }
2437 Ast::Flags(ast::SetFlags {
2438 span: span_range(pat, range.clone()),
2439 flags: ast::Flags {
2440 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2441 items: items,
2442 },
2443 })
2444 }
2445
2446 #[test]
2447 fn parse_nest_limit() {
2448 // A nest limit of 0 still allows some types of regexes.
2449 assert_eq!(
2450 parser_nest_limit("", 0).parse(),
2451 Ok(Ast::Empty(span(0..0))));
2452 assert_eq!(
2453 parser_nest_limit("a", 0).parse(),
2454 Ok(lit('a', 0)));
2455
2456 // Test repetition operations, which require one level of nesting.
2457 assert_eq!(
2458 parser_nest_limit("a+", 0).parse().unwrap_err(),
2459 TestError {
2460 span: span(0..2),
2461 kind: ast::ErrorKind::NestLimitExceeded(0),
2462 });
2463 assert_eq!(
2464 parser_nest_limit("a+", 1).parse(),
2465 Ok(Ast::Repetition(ast::Repetition {
2466 span: span(0..2),
2467 op: ast::RepetitionOp {
2468 span: span(1..2),
2469 kind: ast::RepetitionKind::OneOrMore,
2470 },
2471 greedy: true,
2472 ast: Box::new(lit('a', 0)),
2473 })));
2474 assert_eq!(
2475 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2476 TestError {
2477 span: span(0..3),
2478 kind: ast::ErrorKind::NestLimitExceeded(1),
2479 });
2480 assert_eq!(
2481 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2482 TestError {
2483 span: span(0..2),
2484 kind: ast::ErrorKind::NestLimitExceeded(1),
2485 });
2486 assert_eq!(
2487 parser_nest_limit("a+*", 2).parse(),
2488 Ok(Ast::Repetition(ast::Repetition {
2489 span: span(0..3),
2490 op: ast::RepetitionOp {
2491 span: span(2..3),
2492 kind: ast::RepetitionKind::ZeroOrMore,
2493 },
2494 greedy: true,
2495 ast: Box::new(Ast::Repetition(ast::Repetition {
2496 span: span(0..2),
2497 op: ast::RepetitionOp {
2498 span: span(1..2),
2499 kind: ast::RepetitionKind::OneOrMore,
2500 },
2501 greedy: true,
2502 ast: Box::new(lit('a', 0)),
2503 })),
2504 })));
2505
2506 // Test concatenations. A concatenation requires one level of nesting.
2507 assert_eq!(
2508 parser_nest_limit("ab", 0).parse().unwrap_err(),
2509 TestError {
2510 span: span(0..2),
2511 kind: ast::ErrorKind::NestLimitExceeded(0),
2512 });
2513 assert_eq!(
2514 parser_nest_limit("ab", 1).parse(),
2515 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])));
2516 assert_eq!(
2517 parser_nest_limit("abc", 1).parse(),
2518 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])));
2519
2520 // Test alternations. An alternation requires one level of nesting.
2521 assert_eq!(
2522 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2523 TestError {
2524 span: span(0..3),
2525 kind: ast::ErrorKind::NestLimitExceeded(0),
2526 });
2527 assert_eq!(
2528 parser_nest_limit("a|b", 1).parse(),
2529 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])));
2530 assert_eq!(
2531 parser_nest_limit("a|b|c", 1).parse(),
2532 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])));
2533
2534 // Test character classes. Classes form their own mini-recursive
2535 // syntax!
2536 assert_eq!(
2537 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2538 TestError {
2539 span: span(0..3),
2540 kind: ast::ErrorKind::NestLimitExceeded(0),
2541 });
2542 assert_eq!(
2543 parser_nest_limit("[a]", 1).parse(),
2544 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2545 span: span(0..3),
2546 negated: false,
2547 kind: ast::ClassSet::Item(
2548 ast::ClassSetItem::Literal(ast::Literal {
2549 span: span(1..2),
2550 kind: ast::LiteralKind::Verbatim,
2551 c: 'a',
2552 })
2553 ),
2554 }))));
2555 assert_eq!(
2556 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2557 TestError {
2558 span: span(1..3),
2559 kind: ast::ErrorKind::NestLimitExceeded(1),
2560 });
2561 assert_eq!(
2562 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2563 TestError {
2564 span: span(3..7),
2565 kind: ast::ErrorKind::NestLimitExceeded(2),
2566 });
2567 assert_eq!(
2568 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2569 TestError {
2570 span: span(4..6),
2571 kind: ast::ErrorKind::NestLimitExceeded(3),
2572 });
2573 assert_eq!(
2574 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2575 TestError {
2576 span: span(1..5),
2577 kind: ast::ErrorKind::NestLimitExceeded(1),
2578 });
2579 assert_eq!(
2580 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2581 TestError {
2582 span: span(4..6),
2583 kind: ast::ErrorKind::NestLimitExceeded(2),
2584 });
2585 }
2586
2587 #[test]
2588 fn parse_comments() {
2589 let pat = "(?x)
2590# This is comment 1.
2591foo # This is comment 2.
2592 # This is comment 3.
2593bar
2594# This is comment 4.";
2595 let astc = parser(pat).parse_with_comments().unwrap();
2596 assert_eq!(
2597 astc.ast,
2598 concat_with(span_range(pat, 0..pat.len()), vec![
2599 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2600 lit_with('f', span_range(pat, 26..27)),
2601 lit_with('o', span_range(pat, 27..28)),
2602 lit_with('o', span_range(pat, 28..29)),
2603 lit_with('b', span_range(pat, 74..75)),
2604 lit_with('a', span_range(pat, 75..76)),
2605 lit_with('r', span_range(pat, 76..77)),
2606 ]));
2607 assert_eq!(astc.comments, vec![
2608 ast::Comment {
2609 span: span_range(pat, 5..26),
2610 comment: s(" This is comment 1."),
2611 },
2612 ast::Comment {
2613 span: span_range(pat, 30..51),
2614 comment: s(" This is comment 2."),
2615 },
2616 ast::Comment {
2617 span: span_range(pat, 53..74),
2618 comment: s(" This is comment 3."),
2619 },
2620 ast::Comment {
2621 span: span_range(pat, 78..98),
2622 comment: s(" This is comment 4."),
2623 },
2624 ]);
2625 }
2626
2627 #[test]
2628 fn parse_holistic() {
2629 assert_eq!(
2630 parser("]").parse(),
2631 Ok(lit(']', 0)));
2632 assert_eq!(
2633 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2634 Ok(concat(0..36, vec![
2635 punct_lit('\\', span(0..2)),
2636 punct_lit('.', span(2..4)),
2637 punct_lit('+', span(4..6)),
2638 punct_lit('*', span(6..8)),
2639 punct_lit('?', span(8..10)),
2640 punct_lit('(', span(10..12)),
2641 punct_lit(')', span(12..14)),
2642 punct_lit('|', span(14..16)),
2643 punct_lit('[', span(16..18)),
2644 punct_lit(']', span(18..20)),
2645 punct_lit('{', span(20..22)),
2646 punct_lit('}', span(22..24)),
2647 punct_lit('^', span(24..26)),
2648 punct_lit('$', span(26..28)),
2649 punct_lit('#', span(28..30)),
2650 punct_lit('&', span(30..32)),
2651 punct_lit('-', span(32..34)),
2652 punct_lit('~', span(34..36)),
2653 ])));
2654 }
2655
2656 #[test]
2657 fn parse_ignore_whitespace() {
2658 // Test that basic whitespace insensitivity works.
2659 let pat = "(?x)a b";
2660 assert_eq!(
2661 parser(pat).parse(),
2662 Ok(concat_with(nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![
2663 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2664 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2665 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2666 ])));
2667
2668 // Test that we can toggle whitespace insensitivity.
2669 let pat = "(?x)a b(?-x)a b";
2670 assert_eq!(
2671 parser(pat).parse(),
2672 Ok(concat_with(nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![
2673 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2674 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2675 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2676 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2677 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2678 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2679 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2680 ])));
2681
2682 // Test that nesting whitespace insensitive flags works.
2683 let pat = "a (?x:a )a ";
2684 assert_eq!(
2685 parser(pat).parse(),
2686 Ok(concat_with(span_range(pat, 0..11), vec![
2687 lit_with('a', span_range(pat, 0..1)),
2688 lit_with(' ', span_range(pat, 1..2)),
2689 Ast::Group(ast::Group {
2690 span: span_range(pat, 2..9),
2691 kind: ast::GroupKind::NonCapturing(ast::Flags {
2692 span: span_range(pat, 4..5),
2693 items: vec![
2694 ast::FlagsItem {
2695 span: span_range(pat, 4..5),
2696 kind: ast::FlagsItemKind::Flag(
2697 ast::Flag::IgnoreWhitespace),
2698 },
2699 ],
2700 }),
2701 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2702 }),
2703 lit_with('a', span_range(pat, 9..10)),
2704 lit_with(' ', span_range(pat, 10..11)),
2705 ])));
2706
2707 // Test that whitespace after an opening paren is insignificant.
2708 let pat = "(?x)( ?P<foo> a )";
2709 assert_eq!(
2710 parser(pat).parse(),
2711 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2712 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2713 Ast::Group(ast::Group {
2714 span: span_range(pat, 4..pat.len()),
2715 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2716 span: span_range(pat, 9..12),
2717 name: s("foo"),
2718 index: 1,
2719 }),
2720 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2721 }),
2722 ])));
2723 let pat = "(?x)( a )";
2724 assert_eq!(
2725 parser(pat).parse(),
2726 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2727 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2728 Ast::Group(ast::Group {
2729 span: span_range(pat, 4..pat.len()),
2730 kind: ast::GroupKind::CaptureIndex(1),
2731 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2732 }),
2733 ])));
2734 let pat = "(?x)( ?: a )";
2735 assert_eq!(
2736 parser(pat).parse(),
2737 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2738 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2739 Ast::Group(ast::Group {
2740 span: span_range(pat, 4..pat.len()),
2741 kind: ast::GroupKind::NonCapturing(ast::Flags {
2742 span: span_range(pat, 8..8),
2743 items: vec![],
2744 }),
2745 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2746 }),
2747 ])));
2748 let pat = r"(?x)\x { 53 }";
2749 assert_eq!(
2750 parser(pat).parse(),
2751 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2752 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2753 Ast::Literal(ast::Literal {
2754 span: span(4..13),
2755 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
2756 c: 'S',
2757 }),
2758 ])));
2759
2760 // Test that whitespace after an escape is OK.
2761 let pat = r"(?x)\ ";
2762 assert_eq!(
2763 parser(pat).parse(),
2764 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2765 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2766 Ast::Literal(ast::Literal {
2767 span: span_range(pat, 4..6),
2768 kind: ast::LiteralKind::Special(
2769 ast::SpecialLiteralKind::Space),
2770 c: ' ',
2771 }),
2772 ])));
2773 // ... but only when `x` mode is enabled.
2774 let pat = r"\ ";
2775 assert_eq!(
2776 parser(pat).parse().unwrap_err(),
2777 TestError {
2778 span: span_range(pat, 0..2),
2779 kind: ast::ErrorKind::EscapeUnrecognized,
2780 });
2781 }
2782
2783 #[test]
2784 fn parse_newlines() {
2785 let pat = ".\n.";
2786 assert_eq!(
2787 parser(pat).parse(),
2788 Ok(concat_with(span_range(pat, 0..3), vec![
2789 Ast::Dot(span_range(pat, 0..1)),
2790 lit_with('\n', span_range(pat, 1..2)),
2791 Ast::Dot(span_range(pat, 2..3)),
2792 ])));
2793
2794 let pat = "foobar\nbaz\nquux\n";
2795 assert_eq!(
2796 parser(pat).parse(),
2797 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2798 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2799 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2800 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2801 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2802 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2803 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2804 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2805 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2806 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2807 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2808 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2809 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2810 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2811 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2812 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2813 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2814 ])));
2815 }
2816
2817 #[test]
2818 fn parse_uncounted_repetition() {
2819 assert_eq!(
2820 parser(r"a*").parse(),
2821 Ok(Ast::Repetition(ast::Repetition {
2822 span: span(0..2),
2823 op: ast::RepetitionOp {
2824 span: span(1..2),
2825 kind: ast::RepetitionKind::ZeroOrMore,
2826 },
2827 greedy: true,
2828 ast: Box::new(lit('a', 0)),
2829 })));
2830 assert_eq!(
2831 parser(r"a+").parse(),
2832 Ok(Ast::Repetition(ast::Repetition {
2833 span: span(0..2),
2834 op: ast::RepetitionOp {
2835 span: span(1..2),
2836 kind: ast::RepetitionKind::OneOrMore,
2837 },
2838 greedy: true,
2839 ast: Box::new(lit('a', 0)),
2840 })));
2841
2842 assert_eq!(
2843 parser(r"a?").parse(),
2844 Ok(Ast::Repetition(ast::Repetition {
2845 span: span(0..2),
2846 op: ast::RepetitionOp {
2847 span: span(1..2),
2848 kind: ast::RepetitionKind::ZeroOrOne,
2849 },
2850 greedy: true,
2851 ast: Box::new(lit('a', 0)),
2852 })));
2853 assert_eq!(
2854 parser(r"a??").parse(),
2855 Ok(Ast::Repetition(ast::Repetition {
2856 span: span(0..3),
2857 op: ast::RepetitionOp {
2858 span: span(1..3),
2859 kind: ast::RepetitionKind::ZeroOrOne,
2860 },
2861 greedy: false,
2862 ast: Box::new(lit('a', 0)),
2863 })));
2864 assert_eq!(
2865 parser(r"a?").parse(),
2866 Ok(Ast::Repetition(ast::Repetition {
2867 span: span(0..2),
2868 op: ast::RepetitionOp {
2869 span: span(1..2),
2870 kind: ast::RepetitionKind::ZeroOrOne,
2871 },
2872 greedy: true,
2873 ast: Box::new(lit('a', 0)),
2874 })));
2875 assert_eq!(
2876 parser(r"a?b").parse(),
2877 Ok(concat(0..3, vec![
2878 Ast::Repetition(ast::Repetition {
2879 span: span(0..2),
2880 op: ast::RepetitionOp {
2881 span: span(1..2),
2882 kind: ast::RepetitionKind::ZeroOrOne,
2883 },
2884 greedy: true,
2885 ast: Box::new(lit('a', 0)),
2886 }),
2887 lit('b', 2),
2888 ])));
2889 assert_eq!(
2890 parser(r"a??b").parse(),
2891 Ok(concat(0..4, vec![
2892 Ast::Repetition(ast::Repetition {
2893 span: span(0..3),
2894 op: ast::RepetitionOp {
2895 span: span(1..3),
2896 kind: ast::RepetitionKind::ZeroOrOne,
2897 },
2898 greedy: false,
2899 ast: Box::new(lit('a', 0)),
2900 }),
2901 lit('b', 3),
2902 ])));
2903 assert_eq!(
2904 parser(r"ab?").parse(),
2905 Ok(concat(0..3, vec![
2906 lit('a', 0),
2907 Ast::Repetition(ast::Repetition {
2908 span: span(1..3),
2909 op: ast::RepetitionOp {
2910 span: span(2..3),
2911 kind: ast::RepetitionKind::ZeroOrOne,
2912 },
2913 greedy: true,
2914 ast: Box::new(lit('b', 1)),
2915 }),
2916 ])));
2917 assert_eq!(
2918 parser(r"(ab)?").parse(),
2919 Ok(Ast::Repetition(ast::Repetition {
2920 span: span(0..5),
2921 op: ast::RepetitionOp {
2922 span: span(4..5),
2923 kind: ast::RepetitionKind::ZeroOrOne,
2924 },
2925 greedy: true,
2926 ast: Box::new(group(0..4, 1, concat(1..3, vec![
2927 lit('a', 1),
2928 lit('b', 2),
2929 ]))),
2930 })));
2931 assert_eq!(
2932 parser(r"|a?").parse(),
2933 Ok(alt(0..3, vec![
2934 Ast::Empty(span(0..0)),
2935 Ast::Repetition(ast::Repetition {
2936 span: span(1..3),
2937 op: ast::RepetitionOp {
2938 span: span(2..3),
2939 kind: ast::RepetitionKind::ZeroOrOne,
2940 },
2941 greedy: true,
2942 ast: Box::new(lit('a', 1)),
2943 }),
2944 ])));
2945
2946 assert_eq!(
2947 parser(r"*").parse().unwrap_err(),
2948 TestError {
2949 span: span(0..0),
2950 kind: ast::ErrorKind::RepetitionMissing,
2951 });
94b46f34
XL
2952 assert_eq!(
2953 parser(r"(?i)*").parse().unwrap_err(),
2954 TestError {
2955 span: span(4..4),
2956 kind: ast::ErrorKind::RepetitionMissing,
2957 });
0531ce1d
XL
2958 assert_eq!(
2959 parser(r"(*)").parse().unwrap_err(),
2960 TestError {
2961 span: span(1..1),
2962 kind: ast::ErrorKind::RepetitionMissing,
2963 });
2964 assert_eq!(
2965 parser(r"(?:?)").parse().unwrap_err(),
2966 TestError {
2967 span: span(3..3),
2968 kind: ast::ErrorKind::RepetitionMissing,
2969 });
2970 assert_eq!(
2971 parser(r"+").parse().unwrap_err(),
2972 TestError {
2973 span: span(0..0),
2974 kind: ast::ErrorKind::RepetitionMissing,
2975 });
2976 assert_eq!(
2977 parser(r"?").parse().unwrap_err(),
2978 TestError {
2979 span: span(0..0),
2980 kind: ast::ErrorKind::RepetitionMissing,
2981 });
2982 assert_eq!(
2983 parser(r"(?)").parse().unwrap_err(),
2984 TestError {
2985 span: span(1..1),
2986 kind: ast::ErrorKind::RepetitionMissing,
2987 });
2988 assert_eq!(
2989 parser(r"|*").parse().unwrap_err(),
2990 TestError {
2991 span: span(1..1),
2992 kind: ast::ErrorKind::RepetitionMissing,
2993 });
2994 assert_eq!(
2995 parser(r"|+").parse().unwrap_err(),
2996 TestError {
2997 span: span(1..1),
2998 kind: ast::ErrorKind::RepetitionMissing,
2999 });
3000 assert_eq!(
3001 parser(r"|?").parse().unwrap_err(),
3002 TestError {
3003 span: span(1..1),
3004 kind: ast::ErrorKind::RepetitionMissing,
3005 });
3006 }
3007
3008 #[test]
3009 fn parse_counted_repetition() {
3010 assert_eq!(
3011 parser(r"a{5}").parse(),
3012 Ok(Ast::Repetition(ast::Repetition {
3013 span: span(0..4),
3014 op: ast::RepetitionOp {
3015 span: span(1..4),
3016 kind: ast::RepetitionKind::Range(
3017 ast::RepetitionRange::Exactly(5)),
3018 },
3019 greedy: true,
3020 ast: Box::new(lit('a', 0)),
3021 })));
3022 assert_eq!(
3023 parser(r"a{5,}").parse(),
3024 Ok(Ast::Repetition(ast::Repetition {
3025 span: span(0..5),
3026 op: ast::RepetitionOp {
3027 span: span(1..5),
3028 kind: ast::RepetitionKind::Range(
3029 ast::RepetitionRange::AtLeast(5)),
3030 },
3031 greedy: true,
3032 ast: Box::new(lit('a', 0)),
3033 })));
3034 assert_eq!(
3035 parser(r"a{5,9}").parse(),
3036 Ok(Ast::Repetition(ast::Repetition {
3037 span: span(0..6),
3038 op: ast::RepetitionOp {
3039 span: span(1..6),
3040 kind: ast::RepetitionKind::Range(
3041 ast::RepetitionRange::Bounded(5, 9)),
3042 },
3043 greedy: true,
3044 ast: Box::new(lit('a', 0)),
3045 })));
3046 assert_eq!(
3047 parser(r"a{5}?").parse(),
3048 Ok(Ast::Repetition(ast::Repetition {
3049 span: span(0..5),
3050 op: ast::RepetitionOp {
3051 span: span(1..5),
3052 kind: ast::RepetitionKind::Range(
3053 ast::RepetitionRange::Exactly(5)),
3054 },
3055 greedy: false,
3056 ast: Box::new(lit('a', 0)),
3057 })));
3058 assert_eq!(
3059 parser(r"ab{5}").parse(),
3060 Ok(concat(0..5, vec![
3061 lit('a', 0),
3062 Ast::Repetition(ast::Repetition {
3063 span: span(1..5),
3064 op: ast::RepetitionOp {
3065 span: span(2..5),
3066 kind: ast::RepetitionKind::Range(
3067 ast::RepetitionRange::Exactly(5)),
3068 },
3069 greedy: true,
3070 ast: Box::new(lit('b', 1)),
3071 }),
3072 ])));
3073 assert_eq!(
3074 parser(r"ab{5}c").parse(),
3075 Ok(concat(0..6, vec![
3076 lit('a', 0),
3077 Ast::Repetition(ast::Repetition {
3078 span: span(1..5),
3079 op: ast::RepetitionOp {
3080 span: span(2..5),
3081 kind: ast::RepetitionKind::Range(
3082 ast::RepetitionRange::Exactly(5)),
3083 },
3084 greedy: true,
3085 ast: Box::new(lit('b', 1)),
3086 }),
3087 lit('c', 5),
3088 ])));
3089
3090 assert_eq!(
3091 parser(r"a{ 5 }").parse(),
3092 Ok(Ast::Repetition(ast::Repetition {
3093 span: span(0..6),
3094 op: ast::RepetitionOp {
3095 span: span(1..6),
3096 kind: ast::RepetitionKind::Range(
3097 ast::RepetitionRange::Exactly(5)),
3098 },
3099 greedy: true,
3100 ast: Box::new(lit('a', 0)),
3101 })));
3102 assert_eq!(
3103 parser(r"a{ 5 , 9 }").parse(),
3104 Ok(Ast::Repetition(ast::Repetition {
3105 span: span(0..10),
3106 op: ast::RepetitionOp {
3107 span: span(1..10),
3108 kind: ast::RepetitionKind::Range(
3109 ast::RepetitionRange::Bounded(5, 9)),
3110 },
3111 greedy: true,
3112 ast: Box::new(lit('a', 0)),
3113 })));
3114 assert_eq!(
3115 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3116 Ok(Ast::Repetition(ast::Repetition {
3117 span: span(0..8),
3118 op: ast::RepetitionOp {
3119 span: span(1..8),
3120 kind: ast::RepetitionKind::Range(
3121 ast::RepetitionRange::Bounded(5, 9)),
3122 },
3123 greedy: false,
3124 ast: Box::new(lit('a', 0)),
3125 })));
3126
3127 assert_eq!(
3128 parser(r"a{").parse().unwrap_err(),
3129 TestError {
3130 span: span(1..2),
3131 kind: ast::ErrorKind::RepetitionCountUnclosed,
3132 });
3133 assert_eq!(
3134 parser(r"a{}").parse().unwrap_err(),
3135 TestError {
3136 span: span(2..2),
3137 kind: ast::ErrorKind::DecimalEmpty,
3138 });
3139 assert_eq!(
3140 parser(r"a{a").parse().unwrap_err(),
3141 TestError {
3142 span: span(2..2),
3143 kind: ast::ErrorKind::DecimalEmpty,
3144 });
3145 assert_eq!(
3146 parser(r"a{9999999999}").parse().unwrap_err(),
3147 TestError {
3148 span: span(2..12),
3149 kind: ast::ErrorKind::DecimalInvalid,
3150 });
3151 assert_eq!(
3152 parser(r"a{9").parse().unwrap_err(),
3153 TestError {
3154 span: span(1..3),
3155 kind: ast::ErrorKind::RepetitionCountUnclosed,
3156 });
3157 assert_eq!(
3158 parser(r"a{9,a").parse().unwrap_err(),
3159 TestError {
3160 span: span(4..4),
3161 kind: ast::ErrorKind::DecimalEmpty,
3162 });
3163 assert_eq!(
3164 parser(r"a{9,9999999999}").parse().unwrap_err(),
3165 TestError {
3166 span: span(4..14),
3167 kind: ast::ErrorKind::DecimalInvalid,
3168 });
3169 assert_eq!(
3170 parser(r"a{9,").parse().unwrap_err(),
3171 TestError {
3172 span: span(1..4),
3173 kind: ast::ErrorKind::RepetitionCountUnclosed,
3174 });
3175 assert_eq!(
3176 parser(r"a{9,11").parse().unwrap_err(),
3177 TestError {
3178 span: span(1..6),
3179 kind: ast::ErrorKind::RepetitionCountUnclosed,
3180 });
3181 assert_eq!(
3182 parser(r"a{2,1}").parse().unwrap_err(),
3183 TestError {
3184 span: span(1..6),
3185 kind: ast::ErrorKind::RepetitionCountInvalid,
3186 });
3187 assert_eq!(
3188 parser(r"{5}").parse().unwrap_err(),
3189 TestError {
3190 span: span(0..0),
3191 kind: ast::ErrorKind::RepetitionMissing,
3192 });
3193 assert_eq!(
3194 parser(r"|{5}").parse().unwrap_err(),
3195 TestError {
3196 span: span(1..1),
3197 kind: ast::ErrorKind::RepetitionMissing,
3198 });
3199 }
3200
3201 #[test]
3202 fn parse_alternate() {
3203 assert_eq!(
3204 parser(r"a|b").parse(),
3205 Ok(Ast::Alternation(ast::Alternation {
3206 span: span(0..3),
3207 asts: vec![lit('a', 0), lit('b', 2)],
3208 })));
3209 assert_eq!(
3210 parser(r"(a|b)").parse(),
3211 Ok(group(0..5, 1, Ast::Alternation(ast::Alternation {
3212 span: span(1..4),
3213 asts: vec![lit('a', 1), lit('b', 3)],
3214 }))));
3215
3216 assert_eq!(
3217 parser(r"a|b|c").parse(),
3218 Ok(Ast::Alternation(ast::Alternation {
3219 span: span(0..5),
3220 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3221 })));
3222 assert_eq!(
3223 parser(r"ax|by|cz").parse(),
3224 Ok(Ast::Alternation(ast::Alternation {
3225 span: span(0..8),
3226 asts: vec![
3227 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3228 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3229 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3230 ],
3231 })));
3232 assert_eq!(
3233 parser(r"(ax|by|cz)").parse(),
3234 Ok(group(0..10, 1, Ast::Alternation(ast::Alternation {
3235 span: span(1..9),
3236 asts: vec![
3237 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3238 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3239 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3240 ],
3241 }))));
3242 assert_eq!(
3243 parser(r"(ax|(by|(cz)))").parse(),
3244 Ok(group(0..14, 1, alt(1..13, vec![
3245 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3246 group(4..13, 2, alt(5..12, vec![
3247 concat(5..7, vec![lit('b', 5), lit('y', 6)]),
3248 group(8..12, 3, concat(9..11, vec![
3249 lit('c', 9),
3250 lit('z', 10),
3251 ])),
3252 ])),
3253 ]))));
3254
3255 assert_eq!(
3256 parser(r"|").parse(), Ok(alt(0..1, vec![
3257 Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),
3258 ])));
3259 assert_eq!(
3260 parser(r"||").parse(), Ok(alt(0..2, vec![
3261 Ast::Empty(span(0..0)),
3262 Ast::Empty(span(1..1)),
3263 Ast::Empty(span(2..2)),
3264 ])));
3265 assert_eq!(
3266 parser(r"a|").parse(), Ok(alt(0..2, vec![
3267 lit('a', 0), Ast::Empty(span(2..2)),
3268 ])));
3269 assert_eq!(
3270 parser(r"|a").parse(), Ok(alt(0..2, vec![
3271 Ast::Empty(span(0..0)), lit('a', 1),
3272 ])));
3273
3274 assert_eq!(
3275 parser(r"(|)").parse(), Ok(group(0..3, 1, alt(1..2, vec![
3276 Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),
3277 ]))));
3278 assert_eq!(
3279 parser(r"(a|)").parse(), Ok(group(0..4, 1, alt(1..3, vec![
3280 lit('a', 1), Ast::Empty(span(3..3)),
3281 ]))));
3282 assert_eq!(
3283 parser(r"(|a)").parse(), Ok(group(0..4, 1, alt(1..3, vec![
3284 Ast::Empty(span(1..1)), lit('a', 2),
3285 ]))));
3286
3287 assert_eq!(
3288 parser(r"a|b)").parse().unwrap_err(),
3289 TestError {
3290 span: span(3..4),
3291 kind: ast::ErrorKind::GroupUnopened,
3292 });
3293 assert_eq!(
3294 parser(r"(a|b").parse().unwrap_err(),
3295 TestError {
3296 span: span(0..1),
3297 kind: ast::ErrorKind::GroupUnclosed,
3298 });
3299 }
3300
3301 #[test]
3302 fn parse_unsupported_lookaround() {
3303 assert_eq!(
3304 parser(r"(?=a)").parse().unwrap_err(),
3305 TestError {
3306 span: span(0..3),
3307 kind: ast::ErrorKind::UnsupportedLookAround,
3308 });
3309 assert_eq!(
3310 parser(r"(?!a)").parse().unwrap_err(),
3311 TestError {
3312 span: span(0..3),
3313 kind: ast::ErrorKind::UnsupportedLookAround,
3314 });
3315 assert_eq!(
3316 parser(r"(?<=a)").parse().unwrap_err(),
3317 TestError {
3318 span: span(0..4),
3319 kind: ast::ErrorKind::UnsupportedLookAround,
3320 });
3321 assert_eq!(
3322 parser(r"(?<!a)").parse().unwrap_err(),
3323 TestError {
3324 span: span(0..4),
3325 kind: ast::ErrorKind::UnsupportedLookAround,
3326 });
3327 }
3328
3329 #[test]
3330 fn parse_group() {
3331 assert_eq!(parser("(?i)").parse(), Ok(Ast::Flags(ast::SetFlags {
3332 span: span(0..4),
3333 flags: ast::Flags {
3334 span: span(2..3),
3335 items: vec![ast::FlagsItem {
3336 span: span(2..3),
3337 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3338 }],
3339 },
3340 })));
3341 assert_eq!(parser("(?iU)").parse(), Ok(Ast::Flags(ast::SetFlags {
3342 span: span(0..5),
3343 flags: ast::Flags {
3344 span: span(2..4),
3345 items: vec![
3346 ast::FlagsItem {
3347 span: span(2..3),
3348 kind: ast::FlagsItemKind::Flag(
3349 ast::Flag::CaseInsensitive),
3350 },
3351 ast::FlagsItem {
3352 span: span(3..4),
3353 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3354 },
3355 ],
3356 },
3357 })));
3358 assert_eq!(parser("(?i-U)").parse(), Ok(Ast::Flags(ast::SetFlags {
3359 span: span(0..6),
3360 flags: ast::Flags {
3361 span: span(2..5),
3362 items: vec![
3363 ast::FlagsItem {
3364 span: span(2..3),
3365 kind: ast::FlagsItemKind::Flag(
3366 ast::Flag::CaseInsensitive),
3367 },
3368 ast::FlagsItem {
3369 span: span(3..4),
3370 kind: ast::FlagsItemKind::Negation,
3371 },
3372 ast::FlagsItem {
3373 span: span(4..5),
3374 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3375 },
3376 ],
3377 },
3378 })));
3379
3380 assert_eq!(parser("()").parse(), Ok(Ast::Group(ast::Group {
3381 span: span(0..2),
3382 kind: ast::GroupKind::CaptureIndex(1),
3383 ast: Box::new(Ast::Empty(span(1..1))),
3384 })));
3385 assert_eq!(parser("(a)").parse(), Ok(Ast::Group(ast::Group {
3386 span: span(0..3),
3387 kind: ast::GroupKind::CaptureIndex(1),
3388 ast: Box::new(lit('a', 1)),
3389 })));
3390 assert_eq!(parser("(())").parse(), Ok(Ast::Group(ast::Group {
3391 span: span(0..4),
3392 kind: ast::GroupKind::CaptureIndex(1),
3393 ast: Box::new(Ast::Group(ast::Group {
3394 span: span(1..3),
3395 kind: ast::GroupKind::CaptureIndex(2),
3396 ast: Box::new(Ast::Empty(span(2..2))),
3397 })),
3398 })));
3399
3400 assert_eq!(parser("(?:a)").parse(), Ok(Ast::Group(ast::Group {
3401 span: span(0..5),
3402 kind: ast::GroupKind::NonCapturing(ast::Flags {
3403 span: span(2..2),
3404 items: vec![],
3405 }),
3406 ast: Box::new(lit('a', 3)),
3407 })));
3408
3409 assert_eq!(parser("(?i:a)").parse(), Ok(Ast::Group(ast::Group {
3410 span: span(0..6),
3411 kind: ast::GroupKind::NonCapturing(ast::Flags {
3412 span: span(2..3),
3413 items: vec![
3414 ast::FlagsItem {
3415 span: span(2..3),
3416 kind: ast::FlagsItemKind::Flag(
3417 ast::Flag::CaseInsensitive),
3418 },
3419 ],
3420 }),
3421 ast: Box::new(lit('a', 4)),
3422 })));
3423 assert_eq!(parser("(?i-U:a)").parse(), Ok(Ast::Group(ast::Group {
3424 span: span(0..8),
3425 kind: ast::GroupKind::NonCapturing(ast::Flags {
3426 span: span(2..5),
3427 items: vec![
3428 ast::FlagsItem {
3429 span: span(2..3),
3430 kind: ast::FlagsItemKind::Flag(
3431 ast::Flag::CaseInsensitive),
3432 },
3433 ast::FlagsItem {
3434 span: span(3..4),
3435 kind: ast::FlagsItemKind::Negation,
3436 },
3437 ast::FlagsItem {
3438 span: span(4..5),
3439 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3440 },
3441 ],
3442 }),
3443 ast: Box::new(lit('a', 6)),
3444 })));
3445
3446 assert_eq!(
3447 parser("(").parse().unwrap_err(),
3448 TestError {
3449 span: span(0..1),
3450 kind: ast::ErrorKind::GroupUnclosed,
3451 });
3452 assert_eq!(
3453 parser("(?").parse().unwrap_err(),
3454 TestError {
3455 span: span(0..1),
3456 kind: ast::ErrorKind::GroupUnclosed,
3457 });
3458 assert_eq!(
3459 parser("(?P").parse().unwrap_err(),
3460 TestError {
3461 span: span(2..3),
3462 kind: ast::ErrorKind::FlagUnrecognized,
3463 });
3464 assert_eq!(
3465 parser("(?P<").parse().unwrap_err(),
3466 TestError {
3467 span: span(4..4),
3468 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3469 });
3470 assert_eq!(
3471 parser("(a").parse().unwrap_err(),
3472 TestError {
3473 span: span(0..1),
3474 kind: ast::ErrorKind::GroupUnclosed,
3475 });
3476 assert_eq!(
3477 parser("(()").parse().unwrap_err(),
3478 TestError {
3479 span: span(0..1),
3480 kind: ast::ErrorKind::GroupUnclosed,
3481 });
3482 assert_eq!(
3483 parser(")").parse().unwrap_err(),
3484 TestError {
3485 span: span(0..1),
3486 kind: ast::ErrorKind::GroupUnopened,
3487 });
3488 assert_eq!(
3489 parser("a)").parse().unwrap_err(),
3490 TestError {
3491 span: span(1..2),
3492 kind: ast::ErrorKind::GroupUnopened,
3493 });
3494 }
3495
3496 #[test]
3497 fn parse_capture_name() {
3498 assert_eq!(parser("(?P<a>z)").parse(), Ok(Ast::Group(ast::Group {
3499 span: span(0..8),
3500 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3501 span: span(4..5),
3502 name: s("a"),
3503 index: 1,
3504 }),
3505 ast: Box::new(lit('z', 6)),
3506 })));
3507 assert_eq!(parser("(?P<abc>z)").parse(), Ok(Ast::Group(ast::Group {
3508 span: span(0..10),
3509 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3510 span: span(4..7),
3511 name: s("abc"),
3512 index: 1,
3513 }),
3514 ast: Box::new(lit('z', 8)),
3515 })));
3516
3517 assert_eq!(
3518 parser("(?P<").parse().unwrap_err(),
3519 TestError {
3520 span: span(4..4),
3521 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3522 });
3523 assert_eq!(
3524 parser("(?P<>z)").parse().unwrap_err(),
3525 TestError {
3526 span: span(4..4),
3527 kind: ast::ErrorKind::GroupNameEmpty,
3528 });
3529 assert_eq!(
3530 parser("(?P<a").parse().unwrap_err(),
3531 TestError {
3532 span: span(5..5),
3533 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3534 });
3535 assert_eq!(
3536 parser("(?P<ab").parse().unwrap_err(),
3537 TestError {
3538 span: span(6..6),
3539 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3540 });
3541 assert_eq!(
3542 parser("(?P<0a").parse().unwrap_err(),
3543 TestError {
3544 span: span(4..5),
3545 kind: ast::ErrorKind::GroupNameInvalid,
3546 });
3547 assert_eq!(
3548 parser("(?P<~").parse().unwrap_err(),
3549 TestError {
3550 span: span(4..5),
3551 kind: ast::ErrorKind::GroupNameInvalid,
3552 });
3553 assert_eq!(
3554 parser("(?P<abc~").parse().unwrap_err(),
3555 TestError {
3556 span: span(7..8),
3557 kind: ast::ErrorKind::GroupNameInvalid,
3558 });
3559 assert_eq!(
3560 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3561 TestError {
3562 span: span(12..13),
3563 kind: ast::ErrorKind::GroupNameDuplicate {
3564 original: span(4..5),
3565 },
3566 });
3567 }
3568
3569 #[test]
3570 fn parse_flags() {
3571 assert_eq!(parser("i:").parse_flags(), Ok(ast::Flags {
3572 span: span(0..1),
3573 items: vec![ast::FlagsItem {
3574 span: span(0..1),
3575 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3576 }],
3577 }));
3578 assert_eq!(parser("i)").parse_flags(), Ok(ast::Flags {
3579 span: span(0..1),
3580 items: vec![ast::FlagsItem {
3581 span: span(0..1),
3582 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3583 }],
3584 }));
3585
3586 assert_eq!(parser("isU:").parse_flags(), Ok(ast::Flags {
3587 span: span(0..3),
3588 items: vec![
3589 ast::FlagsItem {
3590 span: span(0..1),
3591 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3592 },
3593 ast::FlagsItem {
3594 span: span(1..2),
3595 kind: ast::FlagsItemKind::Flag(
3596 ast::Flag::DotMatchesNewLine),
3597 },
3598 ast::FlagsItem {
3599 span: span(2..3),
3600 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3601 },
3602 ],
3603 }));
3604
3605 assert_eq!(parser("-isU:").parse_flags(), Ok(ast::Flags {
3606 span: span(0..4),
3607 items: vec![
3608 ast::FlagsItem {
3609 span: span(0..1),
3610 kind: ast::FlagsItemKind::Negation,
3611 },
3612 ast::FlagsItem {
3613 span: span(1..2),
3614 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3615 },
3616 ast::FlagsItem {
3617 span: span(2..3),
3618 kind: ast::FlagsItemKind::Flag(
3619 ast::Flag::DotMatchesNewLine),
3620 },
3621 ast::FlagsItem {
3622 span: span(3..4),
3623 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3624 },
3625 ],
3626 }));
3627 assert_eq!(parser("i-sU:").parse_flags(), Ok(ast::Flags {
3628 span: span(0..4),
3629 items: vec![
3630 ast::FlagsItem {
3631 span: span(0..1),
3632 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3633 },
3634 ast::FlagsItem {
3635 span: span(1..2),
3636 kind: ast::FlagsItemKind::Negation,
3637 },
3638 ast::FlagsItem {
3639 span: span(2..3),
3640 kind: ast::FlagsItemKind::Flag(
3641 ast::Flag::DotMatchesNewLine),
3642 },
3643 ast::FlagsItem {
3644 span: span(3..4),
3645 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3646 },
3647 ],
3648 }));
3649
3650 assert_eq!(
3651 parser("isU").parse_flags().unwrap_err(),
3652 TestError {
3653 span: span(3..3),
3654 kind: ast::ErrorKind::FlagUnexpectedEof,
3655 });
3656 assert_eq!(
3657 parser("isUa:").parse_flags().unwrap_err(),
3658 TestError {
3659 span: span(3..4),
3660 kind: ast::ErrorKind::FlagUnrecognized,
3661 });
3662 assert_eq!(
3663 parser("isUi:").parse_flags().unwrap_err(),
3664 TestError {
3665 span: span(3..4),
3666 kind: ast::ErrorKind::FlagDuplicate {
3667 original: span(0..1),
3668 },
3669 });
3670 assert_eq!(
3671 parser("i-sU-i:").parse_flags().unwrap_err(),
3672 TestError {
3673 span: span(4..5),
3674 kind: ast::ErrorKind::FlagRepeatedNegation {
3675 original: span(1..2),
3676 },
3677 });
3678 assert_eq!(
3679 parser("-)").parse_flags().unwrap_err(),
3680 TestError {
3681 span: span(0..1),
3682 kind: ast::ErrorKind::FlagDanglingNegation,
3683 });
3684 assert_eq!(
3685 parser("i-)").parse_flags().unwrap_err(),
3686 TestError {
3687 span: span(1..2),
3688 kind: ast::ErrorKind::FlagDanglingNegation,
3689 });
3690 assert_eq!(
3691 parser("iU-)").parse_flags().unwrap_err(),
3692 TestError {
3693 span: span(2..3),
3694 kind: ast::ErrorKind::FlagDanglingNegation,
3695 });
3696 }
3697
3698 #[test]
3699 fn parse_flag() {
3700 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
3701 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
3702 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
3703 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
3704 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
3705 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
3706
3707 assert_eq!(
3708 parser("a").parse_flag().unwrap_err(),
3709 TestError {
3710 span: span(0..1),
3711 kind: ast::ErrorKind::FlagUnrecognized,
3712 });
3713 assert_eq!(
3714 parser("☃").parse_flag().unwrap_err(),
3715 TestError {
3716 span: span_range("☃", 0..3),
3717 kind: ast::ErrorKind::FlagUnrecognized,
3718 });
3719 }
3720
3721 #[test]
3722 fn parse_primitive_non_escape() {
3723 assert_eq!(
3724 parser(r".").parse_primitive(),
3725 Ok(Primitive::Dot(span(0..1))));
3726 assert_eq!(
3727 parser(r"^").parse_primitive(),
3728 Ok(Primitive::Assertion(ast::Assertion {
3729 span: span(0..1),
3730 kind: ast::AssertionKind::StartLine,
3731 })));
3732 assert_eq!(
3733 parser(r"$").parse_primitive(),
3734 Ok(Primitive::Assertion(ast::Assertion {
3735 span: span(0..1),
3736 kind: ast::AssertionKind::EndLine,
3737 })));
3738
3739 assert_eq!(
3740 parser(r"a").parse_primitive(),
3741 Ok(Primitive::Literal(ast::Literal {
3742 span: span(0..1),
3743 kind: ast::LiteralKind::Verbatim,
3744 c: 'a',
3745 })));
3746 assert_eq!(
3747 parser(r"|").parse_primitive(),
3748 Ok(Primitive::Literal(ast::Literal {
3749 span: span(0..1),
3750 kind: ast::LiteralKind::Verbatim,
3751 c: '|',
3752 })));
3753 assert_eq!(
3754 parser(r"☃").parse_primitive(),
3755 Ok(Primitive::Literal(ast::Literal {
3756 span: span_range("☃", 0..3),
3757 kind: ast::LiteralKind::Verbatim,
3758 c: '☃',
3759 })));
3760 }
3761
3762 #[test]
3763 fn parse_escape() {
3764 assert_eq!(
3765 parser(r"\|").parse_primitive(),
3766 Ok(Primitive::Literal(ast::Literal {
3767 span: span(0..2),
3768 kind: ast::LiteralKind::Punctuation,
3769 c: '|',
3770 })));
3771 let specials = &[
3772 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
3773 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
3774 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
3775 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
3776 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
3777 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
3778 ];
3779 for &(pat, c, ref kind) in specials {
3780 assert_eq!(
3781 parser(pat).parse_primitive(),
3782 Ok(Primitive::Literal(ast::Literal {
3783 span: span(0..2),
3784 kind: ast::LiteralKind::Special(kind.clone()),
3785 c: c,
3786 })));
3787 }
3788 assert_eq!(
3789 parser(r"\A").parse_primitive(),
3790 Ok(Primitive::Assertion(ast::Assertion {
3791 span: span(0..2),
3792 kind: ast::AssertionKind::StartText,
3793 })));
3794 assert_eq!(
3795 parser(r"\z").parse_primitive(),
3796 Ok(Primitive::Assertion(ast::Assertion {
3797 span: span(0..2),
3798 kind: ast::AssertionKind::EndText,
3799 })));
3800 assert_eq!(
3801 parser(r"\b").parse_primitive(),
3802 Ok(Primitive::Assertion(ast::Assertion {
3803 span: span(0..2),
3804 kind: ast::AssertionKind::WordBoundary,
3805 })));
3806 assert_eq!(
3807 parser(r"\B").parse_primitive(),
3808 Ok(Primitive::Assertion(ast::Assertion {
3809 span: span(0..2),
3810 kind: ast::AssertionKind::NotWordBoundary,
3811 })));
3812
3813 assert_eq!(
3814 parser(r"\").parse_escape().unwrap_err(),
3815 TestError {
3816 span: span(0..1),
3817 kind: ast::ErrorKind::EscapeUnexpectedEof,
3818 });
3819 assert_eq!(
3820 parser(r"\y").parse_escape().unwrap_err(),
3821 TestError {
3822 span: span(0..2),
3823 kind: ast::ErrorKind::EscapeUnrecognized,
3824 });
3825 }
3826
3827 #[test]
3828 fn parse_unsupported_backreference() {
3829 assert_eq!(
3830 parser(r"\0").parse_escape().unwrap_err(),
3831 TestError {
3832 span: span(0..2),
3833 kind: ast::ErrorKind::UnsupportedBackreference,
3834 });
3835 assert_eq!(
3836 parser(r"\9").parse_escape().unwrap_err(),
3837 TestError {
3838 span: span(0..2),
3839 kind: ast::ErrorKind::UnsupportedBackreference,
3840 });
3841 }
3842
3843 #[test]
3844 fn parse_octal() {
3845 for i in 0..511 {
3846 let pat = format!(r"\{:o}", i);
3847 assert_eq!(
3848 parser_octal(&pat).parse_escape(),
3849 Ok(Primitive::Literal(ast::Literal {
3850 span: span(0..pat.len()),
3851 kind: ast::LiteralKind::Octal,
3852 c: ::std::char::from_u32(i).unwrap(),
3853 })));
3854 }
3855 assert_eq!(
3856 parser_octal(r"\778").parse_escape(),
3857 Ok(Primitive::Literal(ast::Literal {
3858 span: span(0..3),
3859 kind: ast::LiteralKind::Octal,
3860 c: '?',
3861 })));
3862 assert_eq!(
3863 parser_octal(r"\7777").parse_escape(),
3864 Ok(Primitive::Literal(ast::Literal {
3865 span: span(0..4),
3866 kind: ast::LiteralKind::Octal,
3867 c: '\u{01FF}',
3868 })));
3869 assert_eq!(
3870 parser_octal(r"\778").parse(),
3871 Ok(Ast::Concat(ast::Concat {
3872 span: span(0..4),
3873 asts: vec![
3874 Ast::Literal(ast::Literal {
3875 span: span(0..3),
3876 kind: ast::LiteralKind::Octal,
3877 c: '?',
3878 }),
3879 Ast::Literal(ast::Literal {
3880 span: span(3..4),
3881 kind: ast::LiteralKind::Verbatim,
3882 c: '8',
3883 }),
3884 ],
3885 })));
3886 assert_eq!(
3887 parser_octal(r"\7777").parse(),
3888 Ok(Ast::Concat(ast::Concat {
3889 span: span(0..5),
3890 asts: vec![
3891 Ast::Literal(ast::Literal {
3892 span: span(0..4),
3893 kind: ast::LiteralKind::Octal,
3894 c: '\u{01FF}',
3895 }),
3896 Ast::Literal(ast::Literal {
3897 span: span(4..5),
3898 kind: ast::LiteralKind::Verbatim,
3899 c: '7',
3900 }),
3901 ],
3902 })));
3903
3904 assert_eq!(
3905 parser_octal(r"\8").parse_escape().unwrap_err(),
3906 TestError {
3907 span: span(0..2),
3908 kind: ast::ErrorKind::EscapeUnrecognized,
3909 });
3910 }
3911
3912 #[test]
3913 fn parse_hex_two() {
3914 for i in 0..256 {
3915 let pat = format!(r"\x{:02x}", i);
3916 assert_eq!(
3917 parser(&pat).parse_escape(),
3918 Ok(Primitive::Literal(ast::Literal {
3919 span: span(0..pat.len()),
3920 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
3921 c: ::std::char::from_u32(i).unwrap(),
3922 })));
3923 }
3924
3925 assert_eq!(
3926 parser(r"\xF").parse_escape().unwrap_err(),
3927 TestError {
3928 span: span(3..3),
3929 kind: ast::ErrorKind::EscapeUnexpectedEof,
3930 });
3931 assert_eq!(
3932 parser(r"\xG").parse_escape().unwrap_err(),
3933 TestError {
3934 span: span(2..3),
3935 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3936 });
3937 assert_eq!(
3938 parser(r"\xFG").parse_escape().unwrap_err(),
3939 TestError {
3940 span: span(3..4),
3941 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3942 });
3943 }
3944
3945 #[test]
3946 fn parse_hex_four() {
3947 for i in 0..65536 {
3948 let c = match ::std::char::from_u32(i) {
3949 None => continue,
3950 Some(c) => c,
3951 };
3952 let pat = format!(r"\u{:04x}", i);
3953 assert_eq!(
3954 parser(&pat).parse_escape(),
3955 Ok(Primitive::Literal(ast::Literal {
3956 span: span(0..pat.len()),
3957 kind: ast::LiteralKind::HexFixed(
3958 ast::HexLiteralKind::UnicodeShort),
3959 c: c,
3960 })));
3961 }
3962
3963 assert_eq!(
3964 parser(r"\uF").parse_escape().unwrap_err(),
3965 TestError {
3966 span: span(3..3),
3967 kind: ast::ErrorKind::EscapeUnexpectedEof,
3968 });
3969 assert_eq!(
3970 parser(r"\uG").parse_escape().unwrap_err(),
3971 TestError {
3972 span: span(2..3),
3973 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3974 });
3975 assert_eq!(
3976 parser(r"\uFG").parse_escape().unwrap_err(),
3977 TestError {
3978 span: span(3..4),
3979 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3980 });
3981 assert_eq!(
3982 parser(r"\uFFG").parse_escape().unwrap_err(),
3983 TestError {
3984 span: span(4..5),
3985 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3986 });
3987 assert_eq!(
3988 parser(r"\uFFFG").parse_escape().unwrap_err(),
3989 TestError {
3990 span: span(5..6),
3991 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3992 });
3993 assert_eq!(
3994 parser(r"\uD800").parse_escape().unwrap_err(),
3995 TestError {
3996 span: span(2..6),
3997 kind: ast::ErrorKind::EscapeHexInvalid,
3998 });
3999 }
4000
4001 #[test]
4002 fn parse_hex_eight() {
4003 for i in 0..65536 {
4004 let c = match ::std::char::from_u32(i) {
4005 None => continue,
4006 Some(c) => c,
4007 };
4008 let pat = format!(r"\U{:08x}", i);
4009 assert_eq!(
4010 parser(&pat).parse_escape(),
4011 Ok(Primitive::Literal(ast::Literal {
4012 span: span(0..pat.len()),
4013 kind: ast::LiteralKind::HexFixed(
4014 ast::HexLiteralKind::UnicodeLong),
4015 c: c,
4016 })));
4017 }
4018
4019 assert_eq!(
4020 parser(r"\UF").parse_escape().unwrap_err(),
4021 TestError {
4022 span: span(3..3),
4023 kind: ast::ErrorKind::EscapeUnexpectedEof,
4024 });
4025 assert_eq!(
4026 parser(r"\UG").parse_escape().unwrap_err(),
4027 TestError {
4028 span: span(2..3),
4029 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4030 });
4031 assert_eq!(
4032 parser(r"\UFG").parse_escape().unwrap_err(),
4033 TestError {
4034 span: span(3..4),
4035 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4036 });
4037 assert_eq!(
4038 parser(r"\UFFG").parse_escape().unwrap_err(),
4039 TestError {
4040 span: span(4..5),
4041 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4042 });
4043 assert_eq!(
4044 parser(r"\UFFFG").parse_escape().unwrap_err(),
4045 TestError {
4046 span: span(5..6),
4047 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4048 });
4049 assert_eq!(
4050 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4051 TestError {
4052 span: span(6..7),
4053 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4054 });
4055 assert_eq!(
4056 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4057 TestError {
4058 span: span(7..8),
4059 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4060 });
4061 assert_eq!(
4062 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4063 TestError {
4064 span: span(8..9),
4065 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4066 });
4067 assert_eq!(
4068 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4069 TestError {
4070 span: span(9..10),
4071 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4072 });
4073 }
4074
4075 #[test]
4076 fn parse_hex_brace() {
4077 assert_eq!(
4078 parser(r"\u{26c4}").parse_escape(),
4079 Ok(Primitive::Literal(ast::Literal {
4080 span: span(0..8),
4081 kind: ast::LiteralKind::HexBrace(
4082 ast::HexLiteralKind::UnicodeShort),
4083 c: '⛄',
4084 })));
4085 assert_eq!(
4086 parser(r"\U{26c4}").parse_escape(),
4087 Ok(Primitive::Literal(ast::Literal {
4088 span: span(0..8),
4089 kind: ast::LiteralKind::HexBrace(
4090 ast::HexLiteralKind::UnicodeLong),
4091 c: '⛄',
4092 })));
4093 assert_eq!(
4094 parser(r"\x{26c4}").parse_escape(),
4095 Ok(Primitive::Literal(ast::Literal {
4096 span: span(0..8),
4097 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4098 c: '⛄',
4099 })));
4100 assert_eq!(
4101 parser(r"\x{26C4}").parse_escape(),
4102 Ok(Primitive::Literal(ast::Literal {
4103 span: span(0..8),
4104 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4105 c: '⛄',
4106 })));
4107 assert_eq!(
4108 parser(r"\x{10fFfF}").parse_escape(),
4109 Ok(Primitive::Literal(ast::Literal {
4110 span: span(0..10),
4111 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4112 c: '\u{10FFFF}',
4113 })));
4114
4115 assert_eq!(
4116 parser(r"\x").parse_escape().unwrap_err(),
4117 TestError {
4118 span: span(2..2),
4119 kind: ast::ErrorKind::EscapeUnexpectedEof,
4120 });
4121 assert_eq!(
4122 parser(r"\x{").parse_escape().unwrap_err(),
4123 TestError {
4124 span: span(2..3),
4125 kind: ast::ErrorKind::EscapeUnexpectedEof,
4126 });
4127 assert_eq!(
4128 parser(r"\x{FF").parse_escape().unwrap_err(),
4129 TestError {
4130 span: span(2..5),
4131 kind: ast::ErrorKind::EscapeUnexpectedEof,
4132 });
4133 assert_eq!(
4134 parser(r"\x{}").parse_escape().unwrap_err(),
4135 TestError {
4136 span: span(2..4),
4137 kind: ast::ErrorKind::EscapeHexEmpty,
4138 });
4139 assert_eq!(
4140 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4141 TestError {
4142 span: span(4..5),
4143 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4144 });
4145 assert_eq!(
4146 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4147 TestError {
4148 span: span(3..9),
4149 kind: ast::ErrorKind::EscapeHexInvalid,
4150 });
4151 assert_eq!(
4152 parser(r"\x{D800}").parse_escape().unwrap_err(),
4153 TestError {
4154 span: span(3..7),
4155 kind: ast::ErrorKind::EscapeHexInvalid,
4156 });
4157 assert_eq!(
4158 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4159 TestError {
4160 span: span(3..12),
4161 kind: ast::ErrorKind::EscapeHexInvalid,
4162 });
4163 }
4164
4165 #[test]
4166 fn parse_decimal() {
4167 assert_eq!(parser("123").parse_decimal(), Ok(123));
4168 assert_eq!(parser("0").parse_decimal(), Ok(0));
4169 assert_eq!(parser("01").parse_decimal(), Ok(1));
4170
4171 assert_eq!(
4172 parser("-1").parse_decimal().unwrap_err(),
4173 TestError {
4174 span: span(0..0),
4175 kind: ast::ErrorKind::DecimalEmpty,
4176 });
4177 assert_eq!(
4178 parser("").parse_decimal().unwrap_err(),
4179 TestError {
4180 span: span(0..0),
4181 kind: ast::ErrorKind::DecimalEmpty,
4182 });
4183 assert_eq!(
4184 parser("9999999999").parse_decimal().unwrap_err(),
4185 TestError {
4186 span: span(0..10),
4187 kind: ast::ErrorKind::DecimalInvalid,
4188 });
4189 }
4190
4191 #[test]
4192 fn parse_set_class() {
4193 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4194 ast::ClassSet::union(ast::ClassSetUnion {
4195 span: span,
4196 items: items,
4197 })
4198 }
4199
4200 fn intersection(
4201 span: Span,
4202 lhs: ast::ClassSet,
4203 rhs: ast::ClassSet,
4204 ) -> ast::ClassSet {
4205 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4206 span: span,
4207 kind: ast::ClassSetBinaryOpKind::Intersection,
4208 lhs: Box::new(lhs),
4209 rhs: Box::new(rhs),
4210 })
4211 }
4212
4213 fn difference(
4214 span: Span,
4215 lhs: ast::ClassSet,
4216 rhs: ast::ClassSet,
4217 ) -> ast::ClassSet {
4218 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4219 span: span,
4220 kind: ast::ClassSetBinaryOpKind::Difference,
4221 lhs: Box::new(lhs),
4222 rhs: Box::new(rhs),
4223 })
4224 }
4225
4226 fn symdifference(
4227 span: Span,
4228 lhs: ast::ClassSet,
4229 rhs: ast::ClassSet,
4230 ) -> ast::ClassSet {
4231 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4232 span: span,
4233 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4234 lhs: Box::new(lhs),
4235 rhs: Box::new(rhs),
4236 })
4237 }
4238
4239 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4240 ast::ClassSet::Item(item)
4241 }
4242
4243 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4244 ast::ClassSetItem::Ascii(cls)
4245 }
4246
4247 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4248 ast::ClassSetItem::Unicode(cls)
4249 }
4250
4251 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4252 ast::ClassSetItem::Perl(cls)
4253 }
4254
4255 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4256 ast::ClassSetItem::Bracketed(Box::new(cls))
4257 }
4258
4259 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4260 ast::ClassSetItem::Literal(ast::Literal {
4261 span: span,
4262 kind: ast::LiteralKind::Verbatim,
4263 c: c,
4264 })
4265 }
4266
4267 fn empty(span: Span) -> ast::ClassSetItem {
4268 ast::ClassSetItem::Empty(span)
4269 }
4270
4271 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4272 let pos1 = Position {
4273 offset: span.start.offset + start.len_utf8(),
4274 column: span.start.column + 1,
4275 ..span.start
4276 };
4277 let pos2 = Position {
4278 offset: span.end.offset - end.len_utf8(),
4279 column: span.end.column - 1,
4280 ..span.end
4281 };
4282 ast::ClassSetItem::Range(ast::ClassSetRange {
4283 span: span,
4284 start: ast::Literal {
4285 span: Span { end: pos1, ..span },
4286 kind: ast::LiteralKind::Verbatim,
4287 c: start,
4288 },
4289 end: ast::Literal {
4290 span: Span { start: pos2, ..span },
4291 kind: ast::LiteralKind::Verbatim,
4292 c: end,
4293 },
4294 })
4295 }
4296
4297 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4298 ast::ClassAscii {
4299 span: span,
4300 kind: ast::ClassAsciiKind::Alnum,
4301 negated: negated,
4302 }
4303 }
4304
4305 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4306 ast::ClassAscii {
4307 span: span,
4308 kind: ast::ClassAsciiKind::Lower,
4309 negated: negated,
4310 }
4311 }
4312
4313 assert_eq!(
4314 parser("[[:alnum:]]").parse(),
4315 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4316 span: span(0..11),
4317 negated: false,
4318 kind: itemset(item_ascii(alnum(span(1..10), false))),
4319 }))));
4320 assert_eq!(
4321 parser("[[[:alnum:]]]").parse(),
4322 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4323 span: span(0..13),
4324 negated: false,
4325 kind: itemset(item_bracket(ast::ClassBracketed {
4326 span: span(1..12),
4327 negated: false,
4328 kind: itemset(item_ascii(alnum(span(2..11), false))),
4329 })),
4330 }))));
4331 assert_eq!(
4332 parser("[[:alnum:]&&[:lower:]]").parse(),
4333 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4334 span: span(0..22),
4335 negated: false,
4336 kind: intersection(
4337 span(1..21),
4338 itemset(item_ascii(alnum(span(1..10), false))),
4339 itemset(item_ascii(lower(span(12..21), false))),
4340 ),
4341 }))));
4342 assert_eq!(
4343 parser("[[:alnum:]--[:lower:]]").parse(),
4344 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4345 span: span(0..22),
4346 negated: false,
4347 kind: difference(
4348 span(1..21),
4349 itemset(item_ascii(alnum(span(1..10), false))),
4350 itemset(item_ascii(lower(span(12..21), false))),
4351 ),
4352 }))));
4353 assert_eq!(
4354 parser("[[:alnum:]~~[:lower:]]").parse(),
4355 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4356 span: span(0..22),
4357 negated: false,
4358 kind: symdifference(
4359 span(1..21),
4360 itemset(item_ascii(alnum(span(1..10), false))),
4361 itemset(item_ascii(lower(span(12..21), false))),
4362 ),
4363 }))));
4364
4365 assert_eq!(
4366 parser("[a]").parse(),
4367 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4368 span: span(0..3),
4369 negated: false,
4370 kind: itemset(lit(span(1..2), 'a')),
4371 }))));
4372 assert_eq!(
4373 parser(r"[a\]]").parse(),
4374 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4375 span: span(0..5),
4376 negated: false,
4377 kind: union(span(1..4), vec![
4378 lit(span(1..2), 'a'),
4379 ast::ClassSetItem::Literal(ast::Literal {
4380 span: span(2..4),
4381 kind: ast::LiteralKind::Punctuation,
4382 c: ']',
4383 }),
4384 ]),
4385 }))));
4386 assert_eq!(
4387 parser(r"[a\-z]").parse(),
4388 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4389 span: span(0..6),
4390 negated: false,
4391 kind: union(span(1..5), vec![
4392 lit(span(1..2), 'a'),
4393 ast::ClassSetItem::Literal(ast::Literal {
4394 span: span(2..4),
4395 kind: ast::LiteralKind::Punctuation,
4396 c: '-',
4397 }),
4398 lit(span(4..5), 'z'),
4399 ]),
4400 }))));
4401 assert_eq!(
4402 parser("[ab]").parse(),
4403 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4404 span: span(0..4),
4405 negated: false,
4406 kind: union(span(1..3), vec![
4407 lit(span(1..2), 'a'),
4408 lit(span(2..3), 'b'),
4409 ]),
4410 }))));
4411 assert_eq!(
4412 parser("[a-]").parse(),
4413 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4414 span: span(0..4),
4415 negated: false,
4416 kind: union(span(1..3), vec![
4417 lit(span(1..2), 'a'),
4418 lit(span(2..3), '-'),
4419 ]),
4420 }))));
4421 assert_eq!(
4422 parser("[-a]").parse(),
4423 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4424 span: span(0..4),
4425 negated: false,
4426 kind: union(span(1..3), vec![
4427 lit(span(1..2), '-'),
4428 lit(span(2..3), 'a'),
4429 ]),
4430 }))));
4431 assert_eq!(
4432 parser(r"[\pL]").parse(),
4433 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4434 span: span(0..5),
4435 negated: false,
4436 kind: itemset(item_unicode(ast::ClassUnicode {
4437 span: span(1..4),
4438 negated: false,
4439 kind: ast::ClassUnicodeKind::OneLetter('L'),
4440 })),
4441 }))));
4442 assert_eq!(
4443 parser(r"[\w]").parse(),
4444 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4445 span: span(0..4),
4446 negated: false,
4447 kind: itemset(item_perl(ast::ClassPerl {
4448 span: span(1..3),
4449 kind: ast::ClassPerlKind::Word,
4450 negated: false,
4451 })),
4452 }))));
4453 assert_eq!(
4454 parser(r"[a\wz]").parse(),
4455 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4456 span: span(0..6),
4457 negated: false,
4458 kind: union(span(1..5), vec![
4459 lit(span(1..2), 'a'),
4460 item_perl(ast::ClassPerl {
4461 span: span(2..4),
4462 kind: ast::ClassPerlKind::Word,
4463 negated: false,
4464 }),
4465 lit(span(4..5), 'z'),
4466 ]),
4467 }))));
4468
4469 assert_eq!(
4470 parser("[a-z]").parse(),
4471 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4472 span: span(0..5),
4473 negated: false,
4474 kind: itemset(range(span(1..4), 'a', 'z')),
4475 }))));
4476 assert_eq!(
4477 parser("[a-cx-z]").parse(),
4478 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4479 span: span(0..8),
4480 negated: false,
4481 kind: union(span(1..7), vec![
4482 range(span(1..4), 'a', 'c'),
4483 range(span(4..7), 'x', 'z'),
4484 ]),
4485 }))));
4486 assert_eq!(
4487 parser(r"[\w&&a-cx-z]").parse(),
4488 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4489 span: span(0..12),
4490 negated: false,
4491 kind: intersection(
4492 span(1..11),
4493 itemset(item_perl(ast::ClassPerl {
4494 span: span(1..3),
4495 kind: ast::ClassPerlKind::Word,
4496 negated: false,
4497 })),
4498 union(span(5..11), vec![
4499 range(span(5..8), 'a', 'c'),
4500 range(span(8..11), 'x', 'z'),
4501 ]),
4502 ),
4503 }))));
4504 assert_eq!(
4505 parser(r"[a-cx-z&&\w]").parse(),
4506 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4507 span: span(0..12),
4508 negated: false,
4509 kind: intersection(
4510 span(1..11),
4511 union(span(1..7), vec![
4512 range(span(1..4), 'a', 'c'),
4513 range(span(4..7), 'x', 'z'),
4514 ]),
4515 itemset(item_perl(ast::ClassPerl {
4516 span: span(9..11),
4517 kind: ast::ClassPerlKind::Word,
4518 negated: false,
4519 })),
4520 ),
4521 }))));
4522 assert_eq!(
4523 parser(r"[a--b--c]").parse(),
4524 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4525 span: span(0..9),
4526 negated: false,
4527 kind: difference(
4528 span(1..8),
4529 difference(
4530 span(1..5),
4531 itemset(lit(span(1..2), 'a')),
4532 itemset(lit(span(4..5), 'b')),
4533 ),
4534 itemset(lit(span(7..8), 'c')),
4535 ),
4536 }))));
4537 assert_eq!(
4538 parser(r"[a~~b~~c]").parse(),
4539 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4540 span: span(0..9),
4541 negated: false,
4542 kind: symdifference(
4543 span(1..8),
4544 symdifference(
4545 span(1..5),
4546 itemset(lit(span(1..2), 'a')),
4547 itemset(lit(span(4..5), 'b')),
4548 ),
4549 itemset(lit(span(7..8), 'c')),
4550 ),
4551 }))));
4552 assert_eq!(
4553 parser(r"[\^&&^]").parse(),
4554 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4555 span: span(0..7),
4556 negated: false,
4557 kind: intersection(
4558 span(1..6),
4559 itemset(ast::ClassSetItem::Literal(ast::Literal {
4560 span: span(1..3),
4561 kind: ast::LiteralKind::Punctuation,
4562 c: '^',
4563 })),
4564 itemset(lit(span(5..6), '^')),
4565 ),
4566 }))));
4567 assert_eq!(
4568 parser(r"[\&&&&]").parse(),
4569 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4570 span: span(0..7),
4571 negated: false,
4572 kind: intersection(
4573 span(1..6),
4574 itemset(ast::ClassSetItem::Literal(ast::Literal {
4575 span: span(1..3),
4576 kind: ast::LiteralKind::Punctuation,
4577 c: '&',
4578 })),
4579 itemset(lit(span(5..6), '&')),
4580 ),
4581 }))));
4582 assert_eq!(
4583 parser(r"[&&&&]").parse(),
4584 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4585 span: span(0..6),
4586 negated: false,
4587 kind: intersection(
4588 span(1..5),
4589 intersection(
4590 span(1..3),
4591 itemset(empty(span(1..1))),
4592 itemset(empty(span(3..3))),
4593 ),
4594 itemset(empty(span(5..5))),
4595 ),
4596 }))));
4597
4598 let pat = "[☃-⛄]";
4599 assert_eq!(
4600 parser(pat).parse(),
4601 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4602 span: span_range(pat, 0..9),
4603 negated: false,
4604 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
4605 span: span_range(pat, 1..8),
4606 start: ast::Literal {
4607 span: span_range(pat, 1..4),
4608 kind: ast::LiteralKind::Verbatim,
4609 c: '☃',
4610 },
4611 end: ast::Literal {
4612 span: span_range(pat, 5..8),
4613 kind: ast::LiteralKind::Verbatim,
4614 c: '⛄',
4615 },
4616 })),
4617 }))));
4618
4619 assert_eq!(
4620 parser(r"[]]").parse(),
4621 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4622 span: span(0..3),
4623 negated: false,
4624 kind: itemset(lit(span(1..2), ']')),
4625 }))));
4626 assert_eq!(
4627 parser(r"[]\[]").parse(),
4628 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4629 span: span(0..5),
4630 negated: false,
4631 kind: union(span(1..4), vec![
4632 lit(span(1..2), ']'),
4633 ast::ClassSetItem::Literal(ast::Literal {
4634 span: span(2..4),
4635 kind: ast::LiteralKind::Punctuation,
4636 c: '[',
4637 }),
4638 ]),
4639 }))));
4640 assert_eq!(
4641 parser(r"[\[]]").parse(),
4642 Ok(concat(0..5, vec![
4643 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4644 span: span(0..4),
4645 negated: false,
4646 kind: itemset(ast::ClassSetItem::Literal(ast::Literal {
4647 span: span(1..3),
4648 kind: ast::LiteralKind::Punctuation,
4649 c: '[',
4650 })),
4651 })),
4652 Ast::Literal(ast::Literal {
4653 span: span(4..5),
4654 kind: ast::LiteralKind::Verbatim,
4655 c: ']',
4656 }),
4657 ])));
4658
4659 assert_eq!(
4660 parser("[").parse().unwrap_err(),
4661 TestError {
4662 span: span(0..1),
4663 kind: ast::ErrorKind::ClassUnclosed,
4664 });
4665 assert_eq!(
4666 parser("[[").parse().unwrap_err(),
4667 TestError {
4668 span: span(1..2),
4669 kind: ast::ErrorKind::ClassUnclosed,
4670 });
4671 assert_eq!(
4672 parser("[[-]").parse().unwrap_err(),
4673 TestError {
4674 span: span(0..1),
4675 kind: ast::ErrorKind::ClassUnclosed,
4676 });
4677 assert_eq!(
4678 parser("[[[:alnum:]").parse().unwrap_err(),
4679 TestError {
4680 span: span(1..2),
4681 kind: ast::ErrorKind::ClassUnclosed,
4682 });
4683 assert_eq!(
4684 parser(r"[\b]").parse().unwrap_err(),
4685 TestError {
4686 span: span(1..3),
4687 kind: ast::ErrorKind::ClassEscapeInvalid,
4688 });
4689 assert_eq!(
4690 parser(r"[\w-a]").parse().unwrap_err(),
4691 TestError {
4692 span: span(1..3),
94b46f34 4693 kind: ast::ErrorKind::ClassRangeLiteral,
0531ce1d
XL
4694 });
4695 assert_eq!(
4696 parser(r"[a-\w]").parse().unwrap_err(),
4697 TestError {
4698 span: span(3..5),
94b46f34 4699 kind: ast::ErrorKind::ClassRangeLiteral,
0531ce1d
XL
4700 });
4701 assert_eq!(
4702 parser(r"[z-a]").parse().unwrap_err(),
4703 TestError {
4704 span: span(1..4),
4705 kind: ast::ErrorKind::ClassRangeInvalid,
4706 });
4707
4708 assert_eq!(
4709 parser_ignore_whitespace("[a ").parse().unwrap_err(),
4710 TestError {
4711 span: span(0..1),
4712 kind: ast::ErrorKind::ClassUnclosed,
4713 });
4714 assert_eq!(
4715 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
4716 TestError {
4717 span: span(0..1),
4718 kind: ast::ErrorKind::ClassUnclosed,
4719 });
4720 }
4721
4722 #[test]
4723 fn parse_set_class_open() {
4724 assert_eq!(
4725 parser("[a]").parse_set_class_open(), {
4726 let set = ast::ClassBracketed {
4727 span: span(0..1),
4728 negated: false,
4729 kind: ast::ClassSet::union(ast::ClassSetUnion {
4730 span: span(1..1),
4731 items: vec![],
4732 }),
4733 };
4734 let union = ast::ClassSetUnion {
4735 span: span(1..1),
4736 items: vec![],
4737 };
4738 Ok((set, union))
4739 });
4740 assert_eq!(
4741 parser_ignore_whitespace("[ a]").parse_set_class_open(), {
4742 let set = ast::ClassBracketed {
4743 span: span(0..4),
4744 negated: false,
4745 kind: ast::ClassSet::union(ast::ClassSetUnion {
4746 span: span(4..4),
4747 items: vec![],
4748 }),
4749 };
4750 let union = ast::ClassSetUnion {
4751 span: span(4..4),
4752 items: vec![],
4753 };
4754 Ok((set, union))
4755 });
4756 assert_eq!(
4757 parser("[^a]").parse_set_class_open(), {
4758 let set = ast::ClassBracketed {
4759 span: span(0..2),
4760 negated: true,
4761 kind: ast::ClassSet::union(ast::ClassSetUnion {
4762 span: span(2..2),
4763 items: vec![],
4764 }),
4765 };
4766 let union = ast::ClassSetUnion {
4767 span: span(2..2),
4768 items: vec![],
4769 };
4770 Ok((set, union))
4771 });
4772 assert_eq!(
4773 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), {
4774 let set = ast::ClassBracketed {
4775 span: span(0..4),
4776 negated: true,
4777 kind: ast::ClassSet::union(ast::ClassSetUnion {
4778 span: span(4..4),
4779 items: vec![],
4780 }),
4781 };
4782 let union = ast::ClassSetUnion {
4783 span: span(4..4),
4784 items: vec![],
4785 };
4786 Ok((set, union))
4787 });
4788 assert_eq!(
4789 parser("[-a]").parse_set_class_open(), {
4790 let set = ast::ClassBracketed {
4791 span: span(0..2),
4792 negated: false,
4793 kind: ast::ClassSet::union(ast::ClassSetUnion {
4794 span: span(1..1),
4795 items: vec![],
4796 }),
4797 };
4798 let union = ast::ClassSetUnion {
4799 span: span(1..2),
4800 items: vec![
4801 ast::ClassSetItem::Literal(ast::Literal {
4802 span: span(1..2),
4803 kind: ast::LiteralKind::Verbatim,
4804 c: '-',
4805 }),
4806 ],
4807 };
4808 Ok((set, union))
4809 });
4810 assert_eq!(
4811 parser_ignore_whitespace("[ - a]").parse_set_class_open(), {
4812 let set = ast::ClassBracketed {
4813 span: span(0..4),
4814 negated: false,
4815 kind: ast::ClassSet::union(ast::ClassSetUnion {
4816 span: span(2..2),
4817 items: vec![],
4818 }),
4819 };
4820 let union = ast::ClassSetUnion {
4821 span: span(2..3),
4822 items: vec![
4823 ast::ClassSetItem::Literal(ast::Literal {
4824 span: span(2..3),
4825 kind: ast::LiteralKind::Verbatim,
4826 c: '-',
4827 }),
4828 ],
4829 };
4830 Ok((set, union))
4831 });
4832 assert_eq!(
4833 parser("[^-a]").parse_set_class_open(), {
4834 let set = ast::ClassBracketed {
4835 span: span(0..3),
4836 negated: true,
4837 kind: ast::ClassSet::union(ast::ClassSetUnion {
4838 span: span(2..2),
4839 items: vec![],
4840 }),
4841 };
4842 let union = ast::ClassSetUnion {
4843 span: span(2..3),
4844 items: vec![
4845 ast::ClassSetItem::Literal(ast::Literal {
4846 span: span(2..3),
4847 kind: ast::LiteralKind::Verbatim,
4848 c: '-',
4849 }),
4850 ],
4851 };
4852 Ok((set, union))
4853 });
4854 assert_eq!(
4855 parser("[--a]").parse_set_class_open(), {
4856 let set = ast::ClassBracketed {
4857 span: span(0..3),
4858 negated: false,
4859 kind: ast::ClassSet::union(ast::ClassSetUnion {
4860 span: span(1..1),
4861 items: vec![],
4862 }),
4863 };
4864 let union = ast::ClassSetUnion {
4865 span: span(1..3),
4866 items: vec![
4867 ast::ClassSetItem::Literal(ast::Literal {
4868 span: span(1..2),
4869 kind: ast::LiteralKind::Verbatim,
4870 c: '-',
4871 }),
4872 ast::ClassSetItem::Literal(ast::Literal {
4873 span: span(2..3),
4874 kind: ast::LiteralKind::Verbatim,
4875 c: '-',
4876 }),
4877 ],
4878 };
4879 Ok((set, union))
4880 });
4881 assert_eq!(
4882 parser("[]a]").parse_set_class_open(), {
4883 let set = ast::ClassBracketed {
4884 span: span(0..2),
4885 negated: false,
4886 kind: ast::ClassSet::union(ast::ClassSetUnion {
4887 span: span(1..1),
4888 items: vec![],
4889 }),
4890 };
4891 let union = ast::ClassSetUnion {
4892 span: span(1..2),
4893 items: vec![
4894 ast::ClassSetItem::Literal(ast::Literal {
4895 span: span(1..2),
4896 kind: ast::LiteralKind::Verbatim,
4897 c: ']',
4898 }),
4899 ],
4900 };
4901 Ok((set, union))
4902 });
4903 assert_eq!(
4904 parser_ignore_whitespace("[ ] a]").parse_set_class_open(), {
4905 let set = ast::ClassBracketed {
4906 span: span(0..4),
4907 negated: false,
4908 kind: ast::ClassSet::union(ast::ClassSetUnion {
4909 span: span(2..2),
4910 items: vec![],
4911 }),
4912 };
4913 let union = ast::ClassSetUnion {
4914 span: span(2..3),
4915 items: vec![
4916 ast::ClassSetItem::Literal(ast::Literal {
4917 span: span(2..3),
4918 kind: ast::LiteralKind::Verbatim,
4919 c: ']',
4920 }),
4921 ],
4922 };
4923 Ok((set, union))
4924 });
4925 assert_eq!(
4926 parser("[^]a]").parse_set_class_open(), {
4927 let set = ast::ClassBracketed {
4928 span: span(0..3),
4929 negated: true,
4930 kind: ast::ClassSet::union(ast::ClassSetUnion {
4931 span: span(2..2),
4932 items: vec![],
4933 }),
4934 };
4935 let union = ast::ClassSetUnion {
4936 span: span(2..3),
4937 items: vec![
4938 ast::ClassSetItem::Literal(ast::Literal {
4939 span: span(2..3),
4940 kind: ast::LiteralKind::Verbatim,
4941 c: ']',
4942 }),
4943 ],
4944 };
4945 Ok((set, union))
4946 });
4947 assert_eq!(
4948 parser("[-]a]").parse_set_class_open(), {
4949 let set = ast::ClassBracketed {
4950 span: span(0..2),
4951 negated: false,
4952 kind: ast::ClassSet::union(ast::ClassSetUnion {
4953 span: span(1..1),
4954 items: vec![],
4955 }),
4956 };
4957 let union = ast::ClassSetUnion {
4958 span: span(1..2),
4959 items: vec![
4960 ast::ClassSetItem::Literal(ast::Literal {
4961 span: span(1..2),
4962 kind: ast::LiteralKind::Verbatim,
4963 c: '-',
4964 }),
4965 ],
4966 };
4967 Ok((set, union))
4968 });
4969
4970 assert_eq!(
4971 parser("[").parse_set_class_open().unwrap_err(),
4972 TestError {
4973 span: span(0..1),
4974 kind: ast::ErrorKind::ClassUnclosed,
4975 });
4976 assert_eq!(
4977 parser_ignore_whitespace("[ ")
4978 .parse_set_class_open()
4979 .unwrap_err(),
4980 TestError {
4981 span: span(0..5),
4982 kind: ast::ErrorKind::ClassUnclosed,
4983 });
4984 assert_eq!(
4985 parser("[^").parse_set_class_open().unwrap_err(),
4986 TestError {
4987 span: span(0..2),
4988 kind: ast::ErrorKind::ClassUnclosed,
4989 });
4990 assert_eq!(
4991 parser("[]").parse_set_class_open().unwrap_err(),
4992 TestError {
4993 span: span(0..2),
4994 kind: ast::ErrorKind::ClassUnclosed,
4995 });
4996 assert_eq!(
4997 parser("[-").parse_set_class_open().unwrap_err(),
4998 TestError {
4999 span: span(0..2),
5000 kind: ast::ErrorKind::ClassUnclosed,
5001 });
5002 assert_eq!(
5003 parser("[--").parse_set_class_open().unwrap_err(),
5004 TestError {
5005 span: span(0..3),
5006 kind: ast::ErrorKind::ClassUnclosed,
5007 });
5008 }
5009
5010 #[test]
5011 fn maybe_parse_ascii_class() {
5012 assert_eq!(
5013 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5014 Some(ast::ClassAscii {
5015 span: span(0..9),
5016 kind: ast::ClassAsciiKind::Alnum,
5017 negated: false,
5018 }));
5019 assert_eq!(
5020 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5021 Some(ast::ClassAscii {
5022 span: span(0..9),
5023 kind: ast::ClassAsciiKind::Alnum,
5024 negated: false,
5025 }));
5026 assert_eq!(
5027 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5028 Some(ast::ClassAscii {
5029 span: span(0..10),
5030 kind: ast::ClassAsciiKind::Alnum,
5031 negated: true,
5032 }));
5033
5034 let p = parser(r"[:");
5035 assert_eq!(p.maybe_parse_ascii_class(), None);
5036 assert_eq!(p.offset(), 0);
5037
5038 let p = parser(r"[:^");
5039 assert_eq!(p.maybe_parse_ascii_class(), None);
5040 assert_eq!(p.offset(), 0);
5041
5042 let p = parser(r"[^:alnum:]");
5043 assert_eq!(p.maybe_parse_ascii_class(), None);
5044 assert_eq!(p.offset(), 0);
5045
5046 let p = parser(r"[:alnnum:]");
5047 assert_eq!(p.maybe_parse_ascii_class(), None);
5048 assert_eq!(p.offset(), 0);
5049
5050 let p = parser(r"[:alnum]");
5051 assert_eq!(p.maybe_parse_ascii_class(), None);
5052 assert_eq!(p.offset(), 0);
5053
5054 let p = parser(r"[:alnum:");
5055 assert_eq!(p.maybe_parse_ascii_class(), None);
5056 assert_eq!(p.offset(), 0);
5057 }
5058
5059 #[test]
5060 fn parse_unicode_class() {
5061 assert_eq!(
5062 parser(r"\pN").parse_escape(),
5063 Ok(Primitive::Unicode(ast::ClassUnicode {
5064 span: span(0..3),
5065 negated: false,
5066 kind: ast::ClassUnicodeKind::OneLetter('N'),
5067 })));
5068 assert_eq!(
5069 parser(r"\PN").parse_escape(),
5070 Ok(Primitive::Unicode(ast::ClassUnicode {
5071 span: span(0..3),
5072 negated: true,
5073 kind: ast::ClassUnicodeKind::OneLetter('N'),
5074 })));
5075 assert_eq!(
5076 parser(r"\p{N}").parse_escape(),
5077 Ok(Primitive::Unicode(ast::ClassUnicode {
5078 span: span(0..5),
5079 negated: false,
5080 kind: ast::ClassUnicodeKind::Named(s("N")),
5081 })));
5082 assert_eq!(
5083 parser(r"\P{N}").parse_escape(),
5084 Ok(Primitive::Unicode(ast::ClassUnicode {
5085 span: span(0..5),
5086 negated: true,
5087 kind: ast::ClassUnicodeKind::Named(s("N")),
5088 })));
5089 assert_eq!(
5090 parser(r"\p{Greek}").parse_escape(),
5091 Ok(Primitive::Unicode(ast::ClassUnicode {
5092 span: span(0..9),
5093 negated: false,
5094 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5095 })));
5096
5097 assert_eq!(
5098 parser(r"\p{scx:Katakana}").parse_escape(),
5099 Ok(Primitive::Unicode(ast::ClassUnicode {
5100 span: span(0..16),
5101 negated: false,
5102 kind: ast::ClassUnicodeKind::NamedValue {
5103 op: ast::ClassUnicodeOpKind::Colon,
5104 name: s("scx"),
5105 value: s("Katakana"),
5106 },
5107 })));
5108 assert_eq!(
5109 parser(r"\p{scx=Katakana}").parse_escape(),
5110 Ok(Primitive::Unicode(ast::ClassUnicode {
5111 span: span(0..16),
5112 negated: false,
5113 kind: ast::ClassUnicodeKind::NamedValue {
5114 op: ast::ClassUnicodeOpKind::Equal,
5115 name: s("scx"),
5116 value: s("Katakana"),
5117 },
5118 })));
5119 assert_eq!(
5120 parser(r"\p{scx!=Katakana}").parse_escape(),
5121 Ok(Primitive::Unicode(ast::ClassUnicode {
5122 span: span(0..17),
5123 negated: false,
5124 kind: ast::ClassUnicodeKind::NamedValue {
5125 op: ast::ClassUnicodeOpKind::NotEqual,
5126 name: s("scx"),
5127 value: s("Katakana"),
5128 },
5129 })));
5130
5131 assert_eq!(
5132 parser(r"\p{:}").parse_escape(),
5133 Ok(Primitive::Unicode(ast::ClassUnicode {
5134 span: span(0..5),
5135 negated: false,
5136 kind: ast::ClassUnicodeKind::NamedValue {
5137 op: ast::ClassUnicodeOpKind::Colon,
5138 name: s(""),
5139 value: s(""),
5140 },
5141 })));
5142 assert_eq!(
5143 parser(r"\p{=}").parse_escape(),
5144 Ok(Primitive::Unicode(ast::ClassUnicode {
5145 span: span(0..5),
5146 negated: false,
5147 kind: ast::ClassUnicodeKind::NamedValue {
5148 op: ast::ClassUnicodeOpKind::Equal,
5149 name: s(""),
5150 value: s(""),
5151 },
5152 })));
5153 assert_eq!(
5154 parser(r"\p{!=}").parse_escape(),
5155 Ok(Primitive::Unicode(ast::ClassUnicode {
5156 span: span(0..6),
5157 negated: false,
5158 kind: ast::ClassUnicodeKind::NamedValue {
5159 op: ast::ClassUnicodeOpKind::NotEqual,
5160 name: s(""),
5161 value: s(""),
5162 },
5163 })));
5164
5165 assert_eq!(
5166 parser(r"\p").parse_escape().unwrap_err(),
5167 TestError {
5168 span: span(2..2),
5169 kind: ast::ErrorKind::EscapeUnexpectedEof,
5170 });
5171 assert_eq!(
5172 parser(r"\p{").parse_escape().unwrap_err(),
5173 TestError {
5174 span: span(3..3),
5175 kind: ast::ErrorKind::EscapeUnexpectedEof,
5176 });
5177 assert_eq!(
5178 parser(r"\p{N").parse_escape().unwrap_err(),
5179 TestError {
5180 span: span(4..4),
5181 kind: ast::ErrorKind::EscapeUnexpectedEof,
5182 });
5183 assert_eq!(
5184 parser(r"\p{Greek").parse_escape().unwrap_err(),
5185 TestError {
5186 span: span(8..8),
5187 kind: ast::ErrorKind::EscapeUnexpectedEof,
5188 });
5189
5190 assert_eq!(
5191 parser(r"\pNz").parse(),
5192 Ok(Ast::Concat(ast::Concat {
5193 span: span(0..4),
5194 asts: vec![
5195 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5196 span: span(0..3),
5197 negated: false,
5198 kind: ast::ClassUnicodeKind::OneLetter('N'),
5199 })),
5200 Ast::Literal(ast::Literal {
5201 span: span(3..4),
5202 kind: ast::LiteralKind::Verbatim,
5203 c: 'z',
5204 }),
5205 ],
5206 })));
5207 assert_eq!(
5208 parser(r"\p{Greek}z").parse(),
5209 Ok(Ast::Concat(ast::Concat {
5210 span: span(0..10),
5211 asts: vec![
5212 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5213 span: span(0..9),
5214 negated: false,
5215 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5216 })),
5217 Ast::Literal(ast::Literal {
5218 span: span(9..10),
5219 kind: ast::LiteralKind::Verbatim,
5220 c: 'z',
5221 }),
5222 ],
5223 })));
5224 }
5225
5226 #[test]
5227 fn parse_perl_class() {
5228 assert_eq!(
5229 parser(r"\d").parse_escape(),
5230 Ok(Primitive::Perl(ast::ClassPerl {
5231 span: span(0..2),
5232 kind: ast::ClassPerlKind::Digit,
5233 negated: false,
5234 })));
5235 assert_eq!(
5236 parser(r"\D").parse_escape(),
5237 Ok(Primitive::Perl(ast::ClassPerl {
5238 span: span(0..2),
5239 kind: ast::ClassPerlKind::Digit,
5240 negated: true,
5241 })));
5242 assert_eq!(
5243 parser(r"\s").parse_escape(),
5244 Ok(Primitive::Perl(ast::ClassPerl {
5245 span: span(0..2),
5246 kind: ast::ClassPerlKind::Space,
5247 negated: false,
5248 })));
5249 assert_eq!(
5250 parser(r"\S").parse_escape(),
5251 Ok(Primitive::Perl(ast::ClassPerl {
5252 span: span(0..2),
5253 kind: ast::ClassPerlKind::Space,
5254 negated: true,
5255 })));
5256 assert_eq!(
5257 parser(r"\w").parse_escape(),
5258 Ok(Primitive::Perl(ast::ClassPerl {
5259 span: span(0..2),
5260 kind: ast::ClassPerlKind::Word,
5261 negated: false,
5262 })));
5263 assert_eq!(
5264 parser(r"\W").parse_escape(),
5265 Ok(Primitive::Perl(ast::ClassPerl {
5266 span: span(0..2),
5267 kind: ast::ClassPerlKind::Word,
5268 negated: true,
5269 })));
5270
5271 assert_eq!(
5272 parser(r"\d").parse(),
5273 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5274 span: span(0..2),
5275 kind: ast::ClassPerlKind::Digit,
5276 negated: false,
5277 }))));
5278 assert_eq!(
5279 parser(r"\dz").parse(),
5280 Ok(Ast::Concat(ast::Concat {
5281 span: span(0..3),
5282 asts: vec![
5283 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5284 span: span(0..2),
5285 kind: ast::ClassPerlKind::Digit,
5286 negated: false,
5287 })),
5288 Ast::Literal(ast::Literal {
5289 span: span(2..3),
5290 kind: ast::LiteralKind::Verbatim,
5291 c: 'z',
5292 }),
5293 ],
5294 })));
5295 }
5296
5297 // This tests a bug fix where the nest limit checker wasn't decrementing
5298 // its depth during post-traversal, which causes long regexes to trip
5299 // the default limit too aggressively.
5300 #[test]
5301 fn regression_454_nest_too_big() {
5302 let pattern = r#"
5303 2(?:
5304 [45]\d{3}|
5305 7(?:
5306 1[0-267]|
5307 2[0-289]|
5308 3[0-29]|
5309 4[01]|
5310 5[1-3]|
5311 6[013]|
5312 7[0178]|
5313 91
5314 )|
5315 8(?:
5316 0[125]|
5317 [139][1-6]|
5318 2[0157-9]|
5319 41|
5320 6[1-35]|
5321 7[1-5]|
5322 8[1-8]|
5323 90
5324 )|
5325 9(?:
5326 0[0-2]|
5327 1[0-4]|
5328 2[568]|
5329 3[3-6]|
5330 5[5-7]|
5331 6[0167]|
5332 7[15]|
5333 8[0146-9]
5334 )
5335 )\d{4}
5336 "#;
5337 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5338 }
5339
5340 // This tests that we treat a trailing `-` in a character class as a
5341 // literal `-` even when whitespace mode is enabled and there is whitespace
5342 // after the trailing `-`.
5343 #[test]
5344 fn regression_455_trailing_dash_ignore_whitespace() {
5345 assert!(parser("(?x)[ / - ]").parse().is_ok());
5346 assert!(parser("(?x)[ a - ]").parse().is_ok());
5347 assert!(parser("(?x)[
5348 a
5349 - ]
5350 ").parse().is_ok());
5351 assert!(parser("(?x)[
5352 a # wat
5353 - ]
5354 ").parse().is_ok());
5355
5356 assert!(parser("(?x)[ / -").parse().is_err());
5357 assert!(parser("(?x)[ / - ").parse().is_err());
5358 assert!(parser("(?x)[
5359 / -
5360 ").parse().is_err());
5361 assert!(parser("(?x)[
5362 / - # wat
5363 ").parse().is_err());
5364 }
5365}