]> git.proxmox.com Git - rustc.git/blame - src/vendor/regex-syntax/src/ast/parse.rs
New upstream version 1.26.0+dfsg1
[rustc.git] / src / vendor / regex-syntax / src / ast / parse.rs
CommitLineData
0531ce1d
XL
1// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11/*!
12This module provides a regular expression parser.
13*/
14
15use std::borrow::Borrow;
16use std::cell::{Cell, RefCell};
17use std::mem;
18use std::result;
19
20use ast::{self, Ast, Position, Span};
21use either::Either;
22
23use is_meta_character;
24
25type Result<T> = result::Result<T, ast::Error>;
26
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal; converted to `Ast::Literal` by `into_ast`.
    Literal(ast::Literal),
    /// An assertion; converted to `Ast::Assertion` by `into_ast`.
    Assertion(ast::Assertion),
    /// A dot, identified only by its span; converted to `Ast::Dot`.
    Dot(Span),
    /// A Perl character class; converted to `ast::Class::Perl`.
    Perl(ast::ClassPerl),
    /// A Unicode character class; converted to `ast::Class::Unicode`.
    Unicode(ast::ClassUnicode),
}
41
42impl Primitive {
43 /// Return the span of this primitive.
44 fn span(&self) -> &Span {
45 match *self {
46 Primitive::Literal(ref x) => &x.span,
47 Primitive::Assertion(ref x) => &x.span,
48 Primitive::Dot(ref span) => span,
49 Primitive::Perl(ref x) => &x.span,
50 Primitive::Unicode(ref x) => &x.span,
51 }
52 }
53
54 /// Convert this primitive into a proper AST.
55 fn into_ast(self) -> Ast {
56 match self {
57 Primitive::Literal(lit) => Ast::Literal(lit),
58 Primitive::Assertion(assert) => Ast::Assertion(assert),
59 Primitive::Dot(span) => Ast::Dot(span),
60 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
61 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
62 }
63 }
64
65 /// Convert this primitive into an item in a character class.
66 ///
67 /// If this primitive is not a legal item (i.e., an assertion or a dot),
68 /// then return an error.
69 fn into_class_set_item<P: Borrow<Parser>>(
70 self,
71 p: &ParserI<P>,
72 ) -> Result<ast::ClassSetItem> {
73 use ast::ClassSetItem;
74 use self::Primitive::*;
75
76 match self {
77 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
78 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
79 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
80 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
81 }
82 }
83
84 /// Convert this primitive into a literal in a character class. In
85 /// particular, literals are the only valid items that can appear in
86 /// ranges.
87 ///
88 /// If this primitive is not a legal item (i.e., a class, assertion or a
89 /// dot), then return an error.
90 fn into_class_literal<P: Borrow<Parser>>(
91 self,
92 p: &ParserI<P>,
93 ) -> Result<ast::Literal> {
94 use self::Primitive::*;
95
96 match self {
97 Literal(lit) => Ok(lit),
98 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
99 }
100 }
101}
102
/// Returns true if and only if `c` is an ASCII hexadecimal digit, i.e., one
/// of `0-9`, `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    // `char::is_digit` only recognizes ASCII digits and letters for the
    // given radix, which is exactly the set accepted here.
    c.is_digit(16)
}
107
/// Returns true if the given character is valid in a capture group name.
///
/// Valid characters are `_`, ASCII letters and ASCII digits. When `first` is
/// true, `c` is treated as the first character of the name, where a digit
/// is not allowed.
fn is_capture_char(c: char, first: bool) -> bool {
    let is_digit = '0' <= c && c <= '9';
    if is_digit {
        // Digits are fine anywhere except in the leading position.
        return !first;
    }
    let is_lower = 'a' <= c && c <= 'z';
    let is_upper = 'A' <= c && c <= 'Z';
    c == '_' || is_lower || is_upper
}
116
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// Whether verbose mode (insignificant whitespace and comments) is
    /// enabled from the start of the pattern. Set via `ignore_whitespace`.
    ignore_whitespace: bool,
    /// The nesting limit handed to the built parser. Set via `nest_limit`.
    nest_limit: u32,
    /// Whether octal syntax is supported. Set via `octal`.
    octal: bool,
}
126
127impl Default for ParserBuilder {
128 fn default() -> ParserBuilder {
129 ParserBuilder::new()
130 }
131}
132
133impl ParserBuilder {
134 /// Create a new parser builder with a default configuration.
135 pub fn new() -> ParserBuilder {
136 ParserBuilder {
137 ignore_whitespace: false,
138 nest_limit: 250,
139 octal: false,
140 }
141 }
142
143 /// Build a parser from this configuration with the given pattern.
144 pub fn build(&self) -> Parser {
145 Parser {
146 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
147 capture_index: Cell::new(0),
148 nest_limit: self.nest_limit,
149 octal: self.octal,
150 initial_ignore_whitespace: self.ignore_whitespace,
151 ignore_whitespace: Cell::new(self.ignore_whitespace),
152 comments: RefCell::new(vec![]),
153 stack_group: RefCell::new(vec![]),
154 stack_class: RefCell::new(vec![]),
155 capture_names: RefCell::new(vec![]),
156 scratch: RefCell::new(String::new()),
157 }
158 }
159
160 /// Set the nesting limit for this parser.
161 ///
162 /// The nesting limit controls how deep the abstract syntax tree is allowed
163 /// to be. If the AST exceeds the given limit (e.g., with too many nested
164 /// groups), then an error is returned by the parser.
165 ///
166 /// The purpose of this limit is to act as a heuristic to prevent stack
167 /// overflow for consumers that do structural induction on an `Ast` using
168 /// explicit recursion. While this crate never does this (instead using
169 /// constant stack space and moving the call stack to the heap), other
170 /// crates may.
171 ///
172 /// This limit is not checked until the entire Ast is parsed. Therefore,
173 /// if callers want to put a limit on the amount of heap space used, then
174 /// they should impose a limit on the length, in bytes, of the concrete
175 /// pattern string. In particular, this is viable since this parser
176 /// implementation will limit itself to heap space proportional to the
177 /// lenth of the pattern string.
178 ///
179 /// Note that a nest limit of `0` will return a nest limit error for most
180 /// patterns but not all. For example, a nest limit of `0` permits `a` but
181 /// not `ab`, since `ab` requires a concatenation, which results in a nest
182 /// depth of `1`. In general, a nest limit is not something that manifests
183 /// in an obvious way in the concrete syntax, therefore, it should not be
184 /// used in a granular way.
185 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
186 self.nest_limit = limit;
187 self
188 }
189
190 /// Whether to support octal syntax or not.
191 ///
192 /// Octal syntax is a little-known way of uttering Unicode codepoints in
193 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
194 /// `\141` are all equivalent regular expressions, where the last example
195 /// shows octal syntax.
196 ///
197 /// While supporting octal syntax isn't in and of itself a problem, it does
198 /// make good error messages harder. That is, in PCRE based regex engines,
199 /// syntax like `\0` invokes a backreference, which is explicitly
200 /// unsupported in Rust's regex engine. However, many users expect it to
201 /// be supported. Therefore, when octal support is disabled, the error
202 /// message will explicitly mention that backreferences aren't supported.
203 ///
204 /// Octal syntax is disabled by default.
205 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
206 self.octal = yes;
207 self
208 }
209
210 /// Enable verbose mode in the regular expression.
211 ///
212 /// When enabled, verbose mode permits insigificant whitespace in many
213 /// places in the regular expression, as well as comments. Comments are
214 /// started using `#` and continue until the end of the line.
215 ///
216 /// By default, this is disabled. It may be selectively enabled in the
217 /// regular expression by using the `x` flag regardless of this setting.
218 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
219 self.ignore_whitespace = yes;
220 self
221 }
222}
223
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. This is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sorted sequence of capture names. This is used to detect duplicate
    /// capture names and report an error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
265
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
282
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// Whether this group has the `x` flag enabled or not.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
303
/// ClassState represents a single stack frame while parsing character
/// classes. Each frame records the state up to an intersection, difference,
/// symmetric difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the
    /// stored set becomes the left hand side of the operator.
    Op {
        /// The type of the operation, i.e., &&, -- or ~~.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
330
331impl Parser {
332 /// Create a new parser with a default configuration.
333 ///
334 /// The parser can be run with either the `parse` or `parse_with_comments`
335 /// methods. The parse methods return an abstract syntax tree.
336 ///
337 /// To set configuration options on the parser, use
338 /// [`ParserBuilder`](struct.ParserBuilder.html).
339 pub fn new() -> Parser {
340 ParserBuilder::new().build()
341 }
342
343 /// Parse the regular expression into an abstract syntax tree.
344 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
345 ParserI::new(self, pattern).parse()
346 }
347
348 /// Parse the regular expression and return an abstract syntax tree with
349 /// all of the comments found in the pattern.
350 pub fn parse_with_comments(
351 &mut self,
352 pattern: &str,
353 ) -> Result<ast::WithComments> {
354 ParserI::new(self, pattern).parse_with_comments()
355 }
356
357 /// Reset the internal state of a parser.
358 ///
359 /// This is called at the beginning of every parse. This prevents the
360 /// parser from running with inconsistent state (say, if a previous
361 /// invocation returned an error and the parser is reused).
362 fn reset(&self) {
363 // These settings should be in line with the construction
364 // in `ParserBuilder::build`.
365 self.pos.set(Position { offset: 0, line: 1, column: 1});
366 self.ignore_whitespace.set(self.initial_ignore_whitespace);
367 self.comments.borrow_mut().clear();
368 self.stack_group.borrow_mut().clear();
369 self.stack_class.borrow_mut().clear();
370 }
371}
372
373impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
374 /// Build an internal parser from a parser configuration and a pattern.
375 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
376 ParserI { parser: parser, pattern: pattern }
377 }
378
379 /// Return a reference to the parser state.
380 fn parser(&self) -> &Parser {
381 self.parser.borrow()
382 }
383
384 /// Return a reference to the pattern being parsed.
385 fn pattern(&self) -> &str {
386 self.pattern.borrow()
387 }
388
389 /// Create a new error with the given span and error type.
390 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
391 ast::Error {
392 kind: kind,
393 pattern: self.pattern().to_string(),
394 span: span,
395 }
396 }
397
398 /// Return the current offset of the parser.
399 ///
400 /// The offset starts at `0` from the beginning of the regular expression
401 /// pattern string.
402 fn offset(&self) -> usize {
403 self.parser().pos.get().offset
404 }
405
406 /// Return the current line number of the parser.
407 ///
408 /// The line number starts at `1`.
409 fn line(&self) -> usize {
410 self.parser().pos.get().line
411 }
412
413 /// Return the current column of the parser.
414 ///
415 /// The column number starts at `1` and is reset whenever a `\n` is seen.
416 fn column(&self) -> usize {
417 self.parser().pos.get().column
418 }
419
420 /// Return the next capturing index. Each subsequent call increments the
421 /// internal index.
422 ///
423 /// The span given should correspond to the location of the opening
424 /// parenthesis.
425 ///
426 /// If the capture limit is exceeded, then an error is returned.
427 fn next_capture_index(&self, span: Span) -> Result<u32> {
428 let current = self.parser().capture_index.get();
429 let i = try!(current.checked_add(1).ok_or_else(|| {
430 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
431 }));
432 self.parser().capture_index.set(i);
433 Ok(i)
434 }
435
436 /// Adds the given capture name to this parser. If this capture name has
437 /// already been used, then an error is returned.
438 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
439 let mut names = self.parser().capture_names.borrow_mut();
440 match names.binary_search_by_key(
441 &cap.name.as_str(),
442 |c| c.name.as_str(),
443 ) {
444 Err(i) => {
445 names.insert(i, cap.clone());
446 Ok(())
447 }
448 Ok(i) => {
449 Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate {
450 original: names[i].span,
451 }))
452 }
453 }
454 }
455
456 /// Return whether the parser should ignore whitespace or not.
457 fn ignore_whitespace(&self) -> bool {
458 self.parser().ignore_whitespace.get()
459 }
460
461 /// Return the character at the current position of the parser.
462 ///
463 /// This panics if the current position does not point to a valid char.
464 fn char(&self) -> char {
465 self.char_at(self.offset())
466 }
467
468 /// Return the character at the given position.
469 ///
470 /// This panics if the given position does not point to a valid char.
471 fn char_at(&self, i: usize) -> char {
472 self.pattern()[i..].chars().next()
473 .unwrap_or_else(|| {
474 panic!("expected char at offset {}", i)
475 })
476 }
477
478 /// Bump the parser to the next Unicode scalar value.
479 ///
480 /// If the end of the input has been reached, then `false` is returned.
481 fn bump(&self) -> bool {
482 if self.is_eof() {
483 return false;
484 }
485 let Position { mut offset, mut line, mut column } = self.pos();
486 if self.char() == '\n' {
487 line = line.checked_add(1).unwrap();
488 column = 1;
489 } else {
490 column = column.checked_add(1).unwrap();
491 }
492 offset += self.char().len_utf8();
493 self.parser().pos.set(Position {
494 offset: offset,
495 line: line,
496 column: column,
497 });
498 self.pattern()[self.offset()..].chars().next().is_some()
499 }
500
501 /// If the substring starting at the current position of the parser has
502 /// the given prefix, then bump the parser to the character immediately
503 /// following the prefix and return true. Otherwise, don't bump the parser
504 /// and return false.
505 fn bump_if(&self, prefix: &str) -> bool {
506 if self.pattern()[self.offset()..].starts_with(prefix) {
507 for _ in 0..prefix.chars().count() {
508 self.bump();
509 }
510 true
511 } else {
512 false
513 }
514 }
515
516 /// Returns true if and only if the parser is positioned at a look-around
517 /// prefix. The conditions under which this returns true must always
518 /// correspond to a regular expression that would otherwise be consider
519 /// invalid.
520 ///
521 /// This should only be called immediately after parsing the opening of
522 /// a group or a set of flags.
523 fn is_lookaround_prefix(&self) -> bool {
524 self.bump_if("?=")
525 || self.bump_if("?!")
526 || self.bump_if("?<=")
527 || self.bump_if("?<!")
528 }
529
530 /// Bump the parser, and if the `x` flag is enabled, bump through any
531 /// subsequent spaces. Return true if and only if the parser is not at
532 /// EOF.
533 fn bump_and_bump_space(&self) -> bool {
534 if !self.bump() {
535 return false;
536 }
537 self.bump_space();
538 !self.is_eof()
539 }
540
541 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
542 /// comments), then this will advance the parser through all whitespace
543 /// and comments to the next non-whitespace non-comment byte.
544 ///
545 /// If the `x` flag is disabled, then this is a no-op.
546 ///
547 /// This should be used selectively throughout the parser where
548 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
549 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
550 fn bump_space(&self) {
551 if !self.ignore_whitespace() {
552 return;
553 }
554 while !self.is_eof() {
555 if self.char().is_whitespace() {
556 self.bump();
557 } else if self.char() == '#' {
558 let start = self.pos();
559 let mut comment_text = String::new();
560 self.bump();
561 while !self.is_eof() {
562 let c = self.char();
563 self.bump();
564 if c == '\n' {
565 break;
566 }
567 comment_text.push(c);
568 }
569 let comment = ast::Comment {
570 span: Span::new(start, self.pos()),
571 comment: comment_text,
572 };
573 self.parser().comments.borrow_mut().push(comment);
574 } else {
575 break;
576 }
577 }
578 }
579
580 /// Peek at the next character in the input without advancing the parser.
581 ///
582 /// If the input has been exhausted, then this returns `None`.
583 fn peek(&self) -> Option<char> {
584 if self.is_eof() {
585 return None;
586 }
587 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
588 }
589
590 /// Like peek, but will ignore spaces when the parser is in whitespace
591 /// insensitive mode.
592 fn peek_space(&self) -> Option<char> {
593 if !self.ignore_whitespace() {
594 return self.peek();
595 }
596 if self.is_eof() {
597 return None;
598 }
599 let mut start = self.offset() + self.char().len_utf8();
600 let mut in_comment = false;
601 for (i, c) in self.pattern()[start..].char_indices() {
602 if c.is_whitespace() {
603 continue;
604 } else if !in_comment && c == '#' {
605 in_comment = true;
606 } else if in_comment && c == '\n' {
607 in_comment = false;
608 } else {
609 start += i;
610 break;
611 }
612 }
613 self.pattern()[start..].chars().next()
614 }
615
616 /// Returns true if the next call to `bump` would return false.
617 fn is_eof(&self) -> bool {
618 self.offset() == self.pattern().len()
619 }
620
621 /// Return the current position of the parser, which includes the offset,
622 /// line and column.
623 fn pos(&self) -> Position {
624 self.parser().pos.get()
625 }
626
627 /// Create a span at the current position of the parser. Both the start
628 /// and end of the span are set.
629 fn span(&self) -> Span {
630 Span::splat(self.pos())
631 }
632
633 /// Create a span that covers the current character.
634 fn span_char(&self) -> Span {
635 let mut next = Position {
636 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
637 line: self.line(),
638 column: self.column().checked_add(1).unwrap(),
639 };
640 if self.char() == '\n' {
641 next.line += 1;
642 next.column = 1;
643 }
644 Span::new(self.pos(), next)
645 }
646
647 /// Parse and push a single alternation on to the parser's internal stack.
648 /// If the top of the stack already has an alternation, then add to that
649 /// instead of pushing a new one.
650 ///
651 /// The concatenation given corresponds to a single alternation branch.
652 /// The concatenation returned starts the next branch and is empty.
653 ///
654 /// This assumes the parser is currently positioned at `|` and will advance
655 /// the parser to the character following `|`.
656 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
657 assert_eq!(self.char(), '|');
658 concat.span.end = self.pos();
659 self.push_or_add_alternation(concat);
660 self.bump();
661 Ok(ast::Concat {
662 span: self.span(),
663 asts: vec![],
664 })
665 }
666
667 /// Pushes or adds the given branch of an alternation to the parser's
668 /// internal stack of state.
669 fn push_or_add_alternation(&self, concat: ast::Concat) {
670 use self::GroupState::*;
671
672 let mut stack = self.parser().stack_group.borrow_mut();
673 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
674 alts.asts.push(concat.into_ast());
675 return;
676 }
677 stack.push(Alternation(ast::Alternation {
678 span: Span::new(concat.span.start, self.pos()),
679 asts: vec![concat.into_ast()],
680 }));
681 }
682
683 /// Parse and push a group AST (and its parent concatenation) on to the
684 /// parser's internal stack. Return a fresh concatenation corresponding
685 /// to the group's sub-AST.
686 ///
687 /// If a set of flags was found (with no group), then the concatenation
688 /// is returned with that set of flags added.
689 ///
690 /// This assumes that the parser is currently positioned on the opening
691 /// parenthesis. It advances the parser to the character at the start
692 /// of the sub-expression (or adjoining expression).
693 ///
694 /// If there was a problem parsing the start of the group, then an error
695 /// is returned.
696 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
697 assert_eq!(self.char(), '(');
698 match try!(self.parse_group()) {
699 Either::Left(set) => {
700 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
701 if let Some(v) = ignore {
702 self.parser().ignore_whitespace.set(v);
703 }
704
705 concat.asts.push(Ast::Flags(set));
706 Ok(concat)
707 }
708 Either::Right(group) => {
709 let old_ignore_whitespace = self.ignore_whitespace();
710 let new_ignore_whitespace = group
711 .flags()
712 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
713 .unwrap_or(old_ignore_whitespace);
714 self.parser().stack_group.borrow_mut().push(GroupState::Group {
715 concat: concat,
716 group: group,
717 ignore_whitespace: old_ignore_whitespace,
718 });
719 self.parser().ignore_whitespace.set(new_ignore_whitespace);
720 Ok(ast::Concat {
721 span: self.span(),
722 asts: vec![],
723 })
724 }
725 }
726 }
727
/// Pop a group AST from the parser's internal stack and set the group's
/// AST to the given concatenation. Return the concatenation containing
/// the group.
///
/// If an alternation sits on top of the stack, it is popped first and the
/// given concatenation becomes its final branch; the alternation then
/// becomes the group's AST.
///
/// This assumes that the parser is currently positioned on the closing
/// parenthesis and advances the parser to the character following the `)`.
///
/// If no such group could be popped, then an unopened group error is
/// returned.
fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
    use self::GroupState::*;

    assert_eq!(self.char(), ')');
    let mut stack = self.parser().stack_group.borrow_mut();
    // Find the group being closed, looking through at most one alternation
    // frame that may sit above it.
    let (mut prior_concat, mut group, ignore_whitespace, alt) =
        match stack.pop() {
            Some(Group { concat, group, ignore_whitespace }) => {
                (concat, group, ignore_whitespace, None)
            }
            Some(Alternation(alt)) => {
                match stack.pop() {
                    Some(Group { concat, group, ignore_whitespace }) => {
                        (concat, group, ignore_whitespace, Some(alt))
                    }
                    None | Some(Alternation(_)) => {
                        return Err(self.error(
                            self.span_char(),
                            ast::ErrorKind::GroupUnopened,
                        ));
                    }
                }
            }
            None => {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::GroupUnopened,
                ));
            }
        };
    // Restore the whitespace mode stored in the popped group frame.
    self.parser().ignore_whitespace.set(ignore_whitespace);
    // The inner concatenation ends just before the `)`, while the group
    // itself ends just after it; hence the bump between the two.
    group_concat.span.end = self.pos();
    self.bump();
    group.span.end = self.pos();
    match alt {
        Some(mut alt) => {
            alt.span.end = group_concat.span.end;
            alt.asts.push(group_concat.into_ast());
            group.ast = Box::new(alt.into_ast());
        }
        None => {
            group.ast = Box::new(group_concat.into_ast());
        }
    }
    prior_concat.asts.push(Ast::Group(group));
    Ok(prior_concat)
}
784
785 /// Pop the last state from the parser's internal stack, if it exists, and
786 /// add the given concatenation to it. There either must be no state or a
787 /// single alternation item on the stack. Any other scenario produces an
788 /// error.
789 ///
790 /// This assumes that the parser has advanced to the end.
791 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
792 concat.span.end = self.pos();
793 let mut stack = self.parser().stack_group.borrow_mut();
794 let ast = match stack.pop() {
795 None => Ok(concat.into_ast()),
796 Some(GroupState::Alternation(mut alt)) => {
797 alt.span.end = self.pos();
798 alt.asts.push(concat.into_ast());
799 Ok(Ast::Alternation(alt))
800 }
801 Some(GroupState::Group { group, .. }) => {
802 return Err(self.error(
803 group.span,
804 ast::ErrorKind::GroupUnclosed,
805 ));
806 }
807 };
808 // If we try to pop again, there should be nothing.
809 match stack.pop() {
810 None => ast,
811 Some(GroupState::Alternation(_)) => {
812 // This unreachable is unfortunate. This case can't happen
813 // because the only way we can be here is if there were two
814 // `GroupState::Alternation`s adjacent in the parser's stack,
815 // which we guarantee to never happen because we never push a
816 // `GroupState::Alternation` if one is already at the top of
817 // the stack.
818 unreachable!()
819 }
820 Some(GroupState::Group { group, .. }) => {
821 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
822 }
823 }
824 }
825
826 /// Parse the opening of a character class and push the current class
827 /// parsing context onto the parser's stack. This assumes that the parser
828 /// is positioned at an opening `[`. The given union should correspond to
829 /// the union of set items built up before seeing the `[`.
830 ///
831 /// If there was a problem parsing the opening of the class, then an error
832 /// is returned. Otherwise, a new union of set items for the class is
833 /// returned (which may be populated with either a `]` or a `-`).
834 fn push_class_open(
835 &self,
836 parent_union: ast::ClassSetUnion,
837 ) -> Result<ast::ClassSetUnion> {
838 assert_eq!(self.char(), '[');
839
840 let (nested_set, nested_union) = try!(self.parse_set_class_open());
841 self.parser().stack_class.borrow_mut().push(ClassState::Open {
842 union: parent_union,
843 set: nested_set,
844 });
845 Ok(nested_union)
846 }
847
/// Parse the end of a character class set and pop the character class
/// parser stack. The union given corresponds to the last union built
/// before seeing the closing `]`. The union returned corresponds to the
/// parent character class set with the nested class added to it.
///
/// This assumes that the parser is positioned at a `]` and will advance
/// the parser to the byte immediately following the `]`.
///
/// If the stack is empty after popping, then this returns the final
/// "top-level" character class AST (where a "top-level" character class
/// is one that is not nested inside any other character class).
///
/// If there is no corresponding opening bracket on the parser's stack,
/// then this panics.
fn pop_class(
    &self,
    nested_union: ast::ClassSetUnion,
) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
    assert_eq!(self.char(), ']');

    let item = ast::ClassSet::Item(nested_union.into_item());
    // Resolve a pending binary operator first. This must finish before the
    // stack is borrowed below, since `pop_class_op` borrows it as well.
    let prevset = self.pop_class_op(item);
    let mut stack = self.parser().stack_class.borrow_mut();
    match stack.pop() {
        None => {
            // We can never observe an empty stack:
            //
            // 1) We are guaranteed to start with a non-empty stack since
            //    the character class parser is only initiated when it sees
            //    a `[`.
            // 2) If we ever observe an empty stack while popping after
            //    seeing a `]`, then we signal the character class parser
            //    to terminate.
            panic!("unexpected empty character class stack")
        },
        Some(ClassState::Op { .. }) => {
            // This panic is unfortunate, but this case is impossible
            // since we already popped the Op state if one exists above.
            // Namely, every push to the class parser stack is guarded by
            // whether an existing Op is already on the top of the stack.
            // If it is, the existing Op is modified. That is, the stack
            // can never have consecutive Op states.
            panic!("unexpected ClassState::Op")
        }
        Some(ClassState::Open { mut union, mut set }) => {
            // Step past the `]` so that the class span includes it.
            self.bump();
            set.span.end = self.pos();
            set.kind = prevset;
            if stack.is_empty() {
                // Outermost class: hand back the finished class.
                Ok(Either::Right(ast::Class::Bracketed(set)))
            } else {
                // Nested class: it becomes an item of the parent union.
                union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
                Ok(Either::Left(union))
            }
        }
    }
}
905
906 /// Return an "unclosed class" error whose span points to the most
907 /// recently opened class.
908 ///
909 /// This should only be called while parsing a character class.
910 fn unclosed_class_error(&self) -> ast::Error {
911 for state in self.parser().stack_class.borrow().iter().rev() {
912 match *state {
913 ClassState::Open { ref set, .. } => {
914 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
915 }
916 _ => {}
917 }
918 }
919 // We are guaranteed to have a non-empty stack with at least
920 // one open bracket, so we should never get here.
921 panic!("no open character class found")
922 }
923
924 /// Push the current set of class items on to the class parser's stack as
925 /// the left hand side of the given operator.
926 ///
927 /// A fresh set union is returned, which should be used to build the right
928 /// hand side of this operator.
929 fn push_class_op(
930 &self,
931 next_kind: ast::ClassSetBinaryOpKind,
932 next_union: ast::ClassSetUnion,
933 ) -> ast::ClassSetUnion {
934
935 let item = ast::ClassSet::Item(next_union.into_item());
936 let new_lhs = self.pop_class_op(item);
937 self.parser().stack_class.borrow_mut().push(ClassState::Op {
938 kind: next_kind,
939 lhs: new_lhs,
940 });
941 ast::ClassSetUnion { span: self.span(), items: vec![] }
942 }
943
944 /// Pop a character class set from the character class parser stack. If the
945 /// top of the stack is just an item (not an operation), then return the
946 /// given set unchanged. If the top of the stack is an operation, then the
947 /// given set will be used as the rhs of the operation on the top of the
948 /// stack. In that case, the binary operation is returned as a set.
949 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
950 let mut stack = self.parser().stack_class.borrow_mut();
951 let (kind, lhs) = match stack.pop() {
952 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
953 Some(state @ ClassState::Open { .. }) => {
954 stack.push(state);
955 return rhs;
956 }
957 None => unreachable!(),
958 };
959 let span = Span::new(lhs.span().start, rhs.span().end);
960 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
961 span: span,
962 kind: kind,
963 lhs: Box::new(lhs),
964 rhs: Box::new(rhs),
965 })
966 }
967}
968
969impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
970 /// Parse the regular expression into an abstract syntax tree.
971 fn parse(&self) -> Result<Ast> {
972 self.parse_with_comments().map(|astc| astc.ast)
973 }
974
975 /// Parse the regular expression and return an abstract syntax tree with
976 /// all of the comments found in the pattern.
977 fn parse_with_comments(&self) -> Result<ast::WithComments> {
978 assert_eq!(self.offset(), 0, "parser can only be used once");
979 self.parser().reset();
980 let mut concat = ast::Concat {
981 span: self.span(),
982 asts: vec![],
983 };
984 loop {
985 self.bump_space();
986 if self.is_eof() {
987 break;
988 }
989 match self.char() {
990 '(' => concat = try!(self.push_group(concat)),
991 ')' => concat = try!(self.pop_group(concat)),
992 '|' => concat = try!(self.push_alternate(concat)),
993 '[' => {
994 let class = try!(self.parse_set_class());
995 concat.asts.push(Ast::Class(class));
996 }
997 '?' => {
998 concat = try!(self.parse_uncounted_repetition(
999 concat, ast::RepetitionKind::ZeroOrOne));
1000 }
1001 '*' => {
1002 concat = try!(self.parse_uncounted_repetition(
1003 concat, ast::RepetitionKind::ZeroOrMore));
1004 }
1005 '+' => {
1006 concat = try!(self.parse_uncounted_repetition(
1007 concat, ast::RepetitionKind::OneOrMore));
1008 }
1009 '{' => {
1010 concat = try!(self.parse_counted_repetition(concat));
1011 }
1012 _ => concat.asts.push(try!(self.parse_primitive()).into_ast()),
1013 }
1014 }
1015 let ast = try!(self.pop_group_end(concat));
1016 try!(NestLimiter::new(self).check(&ast));
1017 Ok(ast::WithComments {
1018 ast: ast,
1019 comments: mem::replace(
1020 &mut *self.parser().comments.borrow_mut(),
1021 vec![],
1022 ),
1023 })
1024 }
1025
1026 /// Parses an uncounted repetition operation. An uncounted repetition
1027 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1028 /// The given `kind` should correspond to the operator observed by the
1029 /// caller.
1030 ///
1031 /// This assumes that the paser is currently positioned at the repetition
1032 /// operator and advances the parser to the first character after the
1033 /// operator. (Note that the operator may include a single additional `?`,
1034 /// which makes the operator ungreedy.)
1035 ///
1036 /// The caller should include the concatenation that is being built. The
1037 /// concatenation returned includes the repetition operator applied to the
1038 /// last expression in the given concatenation.
1039 fn parse_uncounted_repetition(
1040 &self,
1041 mut concat: ast::Concat,
1042 kind: ast::RepetitionKind,
1043 ) -> Result<ast::Concat> {
1044 assert!(
1045 self.char() == '?' || self.char() == '*' || self.char() == '+');
1046 let op_start = self.pos();
1047 let ast = match concat.asts.pop() {
1048 Some(ast) => ast,
1049 None => return Err(self.error(
1050 self.span(),
1051 ast::ErrorKind::RepetitionMissing,
1052 )),
1053 };
1054 let mut greedy = true;
1055 if self.bump() && self.char() == '?' {
1056 greedy = false;
1057 self.bump();
1058 }
1059 concat.asts.push(Ast::Repetition(ast::Repetition {
1060 span: ast.span().with_end(self.pos()),
1061 op: ast::RepetitionOp {
1062 span: Span::new(op_start, self.pos()),
1063 kind: kind,
1064 },
1065 greedy: greedy,
1066 ast: Box::new(ast),
1067 }));
1068 Ok(concat)
1069 }
1070
1071 /// Parses a counted repetition operation. A counted repetition operator
1072 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1073 /// operators.
1074 ///
1075 /// This assumes that the paser is currently positioned at the opening `{`
1076 /// and advances the parser to the first character after the operator.
1077 /// (Note that the operator may include a single additional `?`, which
1078 /// makes the operator ungreedy.)
1079 ///
1080 /// The caller should include the concatenation that is being built. The
1081 /// concatenation returned includes the repetition operator applied to the
1082 /// last expression in the given concatenation.
1083 fn parse_counted_repetition(
1084 &self,
1085 mut concat: ast::Concat,
1086 ) -> Result<ast::Concat> {
1087 assert!(self.char() == '{');
1088 let start = self.pos();
1089 let ast = match concat.asts.pop() {
1090 Some(ast) => ast,
1091 None => return Err(self.error(
1092 self.span(),
1093 ast::ErrorKind::RepetitionMissing,
1094 )),
1095 };
1096 if !self.bump_and_bump_space() {
1097 return Err(self.error(
1098 Span::new(start, self.pos()),
1099 ast::ErrorKind::RepetitionCountUnclosed,
1100 ));
1101 }
1102 let count_start = try!(self.parse_decimal());
1103 let mut range = ast::RepetitionRange::Exactly(count_start);
1104 if self.is_eof() {
1105 return Err(self.error(
1106 Span::new(start, self.pos()),
1107 ast::ErrorKind::RepetitionCountUnclosed,
1108 ));
1109 }
1110 if self.char() == ',' {
1111 if !self.bump_and_bump_space() {
1112 return Err(self.error(
1113 Span::new(start, self.pos()),
1114 ast::ErrorKind::RepetitionCountUnclosed,
1115 ));
1116 }
1117 if self.char() != '}' {
1118 let count_end = try!(self.parse_decimal());
1119 range = ast::RepetitionRange::Bounded(count_start, count_end);
1120 } else {
1121 range = ast::RepetitionRange::AtLeast(count_start);
1122 }
1123 }
1124 if self.is_eof() || self.char() != '}' {
1125 return Err(self.error(
1126 Span::new(start, self.pos()),
1127 ast::ErrorKind::RepetitionCountUnclosed,
1128 ));
1129 }
1130
1131 let mut greedy = true;
1132 if self.bump_and_bump_space() && self.char() == '?' {
1133 greedy = false;
1134 self.bump();
1135 }
1136
1137 let op_span = Span::new(start, self.pos());
1138 if !range.is_valid() {
1139 return Err(self.error(
1140 op_span,
1141 ast::ErrorKind::RepetitionCountInvalid,
1142 ));
1143 }
1144 concat.asts.push(Ast::Repetition(ast::Repetition {
1145 span: ast.span().with_end(self.pos()),
1146 op: ast::RepetitionOp {
1147 span: op_span,
1148 kind: ast::RepetitionKind::Range(range),
1149 },
1150 greedy: greedy,
1151 ast: Box::new(ast),
1152 }));
1153 Ok(concat)
1154 }
1155
1156 /// Parse a group (which contains a sub-expression) or a set of flags.
1157 ///
1158 /// If a group was found, then it is returned with an empty AST. If a set
1159 /// of flags is found, then that set is returned.
1160 ///
1161 /// The parser should be positioned at the opening parenthesis.
1162 ///
1163 /// This advances the parser to the character before the start of the
1164 /// sub-expression (in the case of a group) or to the closing parenthesis
1165 /// immediately following the set of flags.
1166 ///
1167 /// # Errors
1168 ///
1169 /// If flags are given and incorrectly specified, then a corresponding
1170 /// error is returned.
1171 ///
1172 /// If a capture name is given and it is incorrectly specified, then a
1173 /// corresponding error is returned.
1174 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1175 assert_eq!(self.char(), '(');
1176 let open_span = self.span_char();
1177 self.bump();
1178 self.bump_space();
1179 if self.is_lookaround_prefix() {
1180 return Err(self.error(
1181 Span::new(open_span.start, self.span().end),
1182 ast::ErrorKind::UnsupportedLookAround,
1183 ));
1184 }
1185 let inner_span = self.span();
1186 if self.bump_if("?P<") {
1187 let capture_index = try!(self.next_capture_index(open_span));
1188 let cap = try!(self.parse_capture_name(capture_index));
1189 Ok(Either::Right(ast::Group {
1190 span: open_span,
1191 kind: ast::GroupKind::CaptureName(cap),
1192 ast: Box::new(Ast::Empty(self.span())),
1193 }))
1194 } else if self.bump_if("?") {
1195 if self.is_eof() {
1196 return Err(self.error(
1197 open_span,
1198 ast::ErrorKind::GroupUnclosed,
1199 ));
1200 }
1201 let flags = try!(self.parse_flags());
1202 let char_end = self.char();
1203 self.bump();
1204 if char_end == ')' {
1205 // We don't allow empty flags, e.g., `(?)`. We instead
1206 // interpret it as a repetition operator missing its argument.
1207 if flags.items.is_empty() {
1208 return Err(self.error(
1209 inner_span,
1210 ast::ErrorKind::RepetitionMissing,
1211 ));
1212 }
1213 Ok(Either::Left(ast::SetFlags {
1214 span: Span { end: self.pos(), ..open_span },
1215 flags: flags,
1216 }))
1217 } else {
1218 assert_eq!(char_end, ':');
1219 Ok(Either::Right(ast::Group {
1220 span: open_span,
1221 kind: ast::GroupKind::NonCapturing(flags),
1222 ast: Box::new(Ast::Empty(self.span())),
1223 }))
1224 }
1225 } else {
1226 let capture_index = try!(self.next_capture_index(open_span));
1227 Ok(Either::Right(ast::Group {
1228 span: open_span,
1229 kind: ast::GroupKind::CaptureIndex(capture_index),
1230 ast: Box::new(Ast::Empty(self.span())),
1231 }))
1232 }
1233 }
1234
1235 /// Parses a capture group name. Assumes that the parser is positioned at
1236 /// the first character in the name following the opening `<` (and may
1237 /// possibly be EOF). This advances the parser to the first character
1238 /// following the closing `>`.
1239 ///
1240 /// The caller must provide the capture index of the group for this name.
1241 fn parse_capture_name(
1242 &self,
1243 capture_index: u32,
1244 ) -> Result<ast::CaptureName> {
1245 if self.is_eof() {
1246 return Err(self.error(
1247 self.span(),
1248 ast::ErrorKind::GroupNameUnexpectedEof,
1249 ));
1250 }
1251 let start = self.pos();
1252 loop {
1253 if self.char() == '>' {
1254 break;
1255 }
1256 if !is_capture_char(self.char(), self.pos() == start) {
1257 return Err(self.error(
1258 self.span_char(),
1259 ast::ErrorKind::GroupNameInvalid,
1260 ));
1261 }
1262 if !self.bump() {
1263 break;
1264 }
1265 }
1266 let end = self.pos();
1267 if self.is_eof() {
1268 return Err(self.error(
1269 self.span(),
1270 ast::ErrorKind::GroupNameUnexpectedEof,
1271 ));
1272 }
1273 assert_eq!(self.char(), '>');
1274 self.bump();
1275 let name = &self.pattern()[start.offset..end.offset];
1276 if name.is_empty() {
1277 return Err(self.error(
1278 Span::new(start, start),
1279 ast::ErrorKind::GroupNameEmpty,
1280 ));
1281 }
1282 let capname = ast::CaptureName {
1283 span: Span::new(start, end),
1284 name: name.to_string(),
1285 index: capture_index,
1286 };
1287 try!(self.add_capture_name(&capname));
1288 Ok(capname)
1289 }
1290
1291 /// Parse a sequence of flags starting at the current character.
1292 ///
1293 /// This advances the parser to the character immediately following the
1294 /// flags, which is guaranteed to be either `:` or `)`.
1295 ///
1296 /// # Errors
1297 ///
1298 /// If any flags are duplicated, then an error is returned.
1299 ///
1300 /// If the negation operator is used more than once, then an error is
1301 /// returned.
1302 ///
1303 /// If no flags could be found or if the negation operation is not followed
1304 /// by any flags, then an error is returned.
1305 fn parse_flags(&self) -> Result<ast::Flags> {
1306 let mut flags = ast::Flags {
1307 span: self.span(),
1308 items: vec![],
1309 };
1310 let mut last_was_negation = None;
1311 while self.char() != ':' && self.char() != ')' {
1312 if self.char() == '-' {
1313 last_was_negation = Some(self.span_char());
1314 let item = ast::FlagsItem {
1315 span: self.span_char(),
1316 kind: ast::FlagsItemKind::Negation,
1317 };
1318 if let Some(i) = flags.add_item(item) {
1319 return Err(self.error(
1320 self.span_char(),
1321 ast::ErrorKind::FlagRepeatedNegation {
1322 original: flags.items[i].span,
1323 },
1324 ));
1325 }
1326 } else {
1327 last_was_negation = None;
1328 let item = ast::FlagsItem {
1329 span: self.span_char(),
1330 kind: ast::FlagsItemKind::Flag(try!(self.parse_flag())),
1331 };
1332 if let Some(i) = flags.add_item(item) {
1333 return Err(self.error(
1334 self.span_char(),
1335 ast::ErrorKind::FlagDuplicate {
1336 original: flags.items[i].span,
1337 },
1338 ));
1339 }
1340 }
1341 if !self.bump() {
1342 return Err(self.error(
1343 self.span(),
1344 ast::ErrorKind::FlagUnexpectedEof,
1345 ));
1346 }
1347 }
1348 if let Some(span) = last_was_negation {
1349 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1350 }
1351 flags.span.end = self.pos();
1352 Ok(flags)
1353 }
1354
1355 /// Parse the current character as a flag. Do not advance the parser.
1356 ///
1357 /// # Errors
1358 ///
1359 /// If the flag is not recognized, then an error is returned.
1360 fn parse_flag(&self) -> Result<ast::Flag> {
1361 match self.char() {
1362 'i' => Ok(ast::Flag::CaseInsensitive),
1363 'm' => Ok(ast::Flag::MultiLine),
1364 's' => Ok(ast::Flag::DotMatchesNewLine),
1365 'U' => Ok(ast::Flag::SwapGreed),
1366 'u' => Ok(ast::Flag::Unicode),
1367 'x' => Ok(ast::Flag::IgnoreWhitespace),
1368 _ => Err(self.error(
1369 self.span_char(),
1370 ast::ErrorKind::FlagUnrecognized,
1371 )),
1372 }
1373 }
1374
1375 /// Parse a primitive AST. e.g., A literal, non-set character class or
1376 /// assertion.
1377 ///
1378 /// This assumes that the parser expects a primitive at the current
1379 /// location. i.e., All other non-primitive cases have been handled.
1380 /// For example, if the parser's position is at `|`, then `|` will be
1381 /// treated as a literal (e.g., inside a character class).
1382 ///
1383 /// This advances the parser to the first character immediately following
1384 /// the primitive.
1385 fn parse_primitive(&self) -> Result<Primitive> {
1386 match self.char() {
1387 '\\' => self.parse_escape(),
1388 '.' => {
1389 let ast = Primitive::Dot(self.span_char());
1390 self.bump();
1391 Ok(ast)
1392 }
1393 '^' => {
1394 let ast = Primitive::Assertion(ast::Assertion {
1395 span: self.span_char(),
1396 kind: ast::AssertionKind::StartLine,
1397 });
1398 self.bump();
1399 Ok(ast)
1400 }
1401 '$' => {
1402 let ast = Primitive::Assertion(ast::Assertion {
1403 span: self.span_char(),
1404 kind: ast::AssertionKind::EndLine,
1405 });
1406 self.bump();
1407 Ok(ast)
1408 }
1409 c => {
1410 let ast = Primitive::Literal(ast::Literal {
1411 span: self.span_char(),
1412 kind: ast::LiteralKind::Verbatim,
1413 c: c,
1414 });
1415 self.bump();
1416 Ok(ast)
1417 }
1418 }
1419 }
1420
1421 /// Parse an escape sequence as a primitive AST.
1422 ///
1423 /// This assumes the parser is positioned at the start of the escape
1424 /// sequence, i.e., `\`. It advances the parser to the first position
1425 /// immediately following the escape sequence.
1426 fn parse_escape(&self) -> Result<Primitive> {
1427 assert_eq!(self.char(), '\\');
1428 let start = self.pos();
1429 if !self.bump() {
1430 return Err(self.error(
1431 Span::new(start, self.pos()),
1432 ast::ErrorKind::EscapeUnexpectedEof,
1433 ));
1434 }
1435 let c = self.char();
1436 // Put some of the more complicated routines into helpers.
1437 match c {
1438 '0'...'7' => {
1439 if !self.parser().octal {
1440 return Err(self.error(
1441 Span::new(start, self.span_char().end),
1442 ast::ErrorKind::UnsupportedBackreference,
1443 ));
1444 }
1445 let mut lit = self.parse_octal();
1446 lit.span.start = start;
1447 return Ok(Primitive::Literal(lit));
1448 }
1449 '8'...'9' if !self.parser().octal => {
1450 return Err(self.error(
1451 Span::new(start, self.span_char().end),
1452 ast::ErrorKind::UnsupportedBackreference,
1453 ));
1454 }
1455 'x' | 'u' | 'U' => {
1456 let mut lit = try!(self.parse_hex());
1457 lit.span.start = start;
1458 return Ok(Primitive::Literal(lit));
1459 }
1460 'p' | 'P' => {
1461 let mut cls = try!(self.parse_unicode_class());
1462 cls.span.start = start;
1463 return Ok(Primitive::Unicode(cls));
1464 }
1465 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1466 let mut cls = self.parse_perl_class();
1467 cls.span.start = start;
1468 return Ok(Primitive::Perl(cls));
1469 }
1470 _ => {}
1471 }
1472
1473 // Handle all of the one letter sequences inline.
1474 self.bump();
1475 let span = Span::new(start, self.pos());
1476 if is_meta_character(c) {
1477 return Ok(Primitive::Literal(ast::Literal {
1478 span: span,
1479 kind: ast::LiteralKind::Punctuation,
1480 c: c,
1481 }));
1482 }
1483 let special = |kind, c| Ok(Primitive::Literal(ast::Literal {
1484 span: span,
1485 kind: ast::LiteralKind::Special(kind),
1486 c: c,
1487 }));
1488 match c {
1489 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1490 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1491 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1492 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1493 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1494 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1495 ' ' if self.ignore_whitespace() => {
1496 special(ast::SpecialLiteralKind::Space, ' ')
1497 }
1498 'A' => Ok(Primitive::Assertion(ast::Assertion {
1499 span: span,
1500 kind: ast::AssertionKind::StartText,
1501 })),
1502 'z' => Ok(Primitive::Assertion(ast::Assertion {
1503 span: span,
1504 kind: ast::AssertionKind::EndText,
1505 })),
1506 'b' => Ok(Primitive::Assertion(ast::Assertion {
1507 span: span,
1508 kind: ast::AssertionKind::WordBoundary,
1509 })),
1510 'B' => Ok(Primitive::Assertion(ast::Assertion {
1511 span: span,
1512 kind: ast::AssertionKind::NotWordBoundary,
1513 })),
1514 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1515 }
1516 }
1517
1518 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1519 /// long. This expects the parser to be positioned at the first octal
1520 /// digit and advances the parser to the first character immediately
1521 /// following the octal number. This also assumes that parsing octal
1522 /// escapes is enabled.
1523 ///
1524 /// Assuming the preconditions are met, this routine can never fail.
1525 fn parse_octal(&self) -> ast::Literal {
1526 use std::char;
1527 use std::u32;
1528
1529 assert!(self.parser().octal);
1530 assert!('0' <= self.char() && self.char() <= '7');
1531 let start = self.pos();
1532 // Parse up to two more digits.
1533 while
1534 self.bump() &&
1535 '0' <= self.char() && self.char() <= '7' &&
1536 self.pos().offset - start.offset <= 2
1537 {}
1538 let end = self.pos();
1539 let octal = &self.pattern()[start.offset..end.offset];
1540 // Parsing the octal should never fail since the above guarantees a
1541 // valid number.
1542 let codepoint =
1543 u32::from_str_radix(octal, 8).expect("valid octal number");
1544 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1545 // invalid Unicode scalar values.
1546 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1547 ast::Literal {
1548 span: Span::new(start, end),
1549 kind: ast::LiteralKind::Octal,
1550 c: c,
1551 }
1552 }
1553
1554 /// Parse a hex representation of a Unicode codepoint. This handles both
1555 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1556 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1557 /// the first character immediately following the hexadecimal literal.
1558 fn parse_hex(&self) -> Result<ast::Literal> {
1559 assert!(self.char() == 'x'
1560 || self.char() == 'u'
1561 || self.char() == 'U');
1562
1563 let hex_kind = match self.char() {
1564 'x' => ast::HexLiteralKind::X,
1565 'u' => ast::HexLiteralKind::UnicodeShort,
1566 _ => ast::HexLiteralKind::UnicodeLong,
1567 };
1568 if !self.bump_and_bump_space() {
1569 return Err(self.error(
1570 self.span(),
1571 ast::ErrorKind::EscapeUnexpectedEof,
1572 ));
1573 }
1574 if self.char() == '{' {
1575 self.parse_hex_brace(hex_kind)
1576 } else {
1577 self.parse_hex_digits(hex_kind)
1578 }
1579 }
1580
1581 /// Parse an N-digit hex representation of a Unicode codepoint. This
1582 /// expects the parser to be positioned at the first digit and will advance
1583 /// the parser to the first character immediately following the escape
1584 /// sequence.
1585 ///
1586 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1587 /// or 8 (for `\UNNNNNNNN`).
1588 fn parse_hex_digits(
1589 &self,
1590 kind: ast::HexLiteralKind,
1591 ) -> Result<ast::Literal> {
1592 use std::char;
1593 use std::u32;
1594
1595 let mut scratch = self.parser().scratch.borrow_mut();
1596 scratch.clear();
1597
1598 let start = self.pos();
1599 for i in 0..kind.digits() {
1600 if i > 0 && !self.bump_and_bump_space() {
1601 return Err(self.error(
1602 self.span(),
1603 ast::ErrorKind::EscapeUnexpectedEof,
1604 ));
1605 }
1606 if !is_hex(self.char()) {
1607 return Err(self.error(
1608 self.span_char(),
1609 ast::ErrorKind::EscapeHexInvalidDigit,
1610 ));
1611 }
1612 scratch.push(self.char());
1613 }
1614 // The final bump just moves the parser past the literal, which may
1615 // be EOF.
1616 self.bump_and_bump_space();
1617 let end = self.pos();
1618 let hex = scratch.as_str();
1619 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1620 None => Err(self.error(
1621 Span::new(start, end),
1622 ast::ErrorKind::EscapeHexInvalid,
1623 )),
1624 Some(c) => Ok(ast::Literal {
1625 span: Span::new(start, end),
1626 kind: ast::LiteralKind::HexFixed(kind),
1627 c: c,
1628 }),
1629 }
1630 }
1631
1632 /// Parse a hex representation of any Unicode scalar value. This expects
1633 /// the parser to be positioned at the opening brace `{` and will advance
1634 /// the parser to the first character following the closing brace `}`.
1635 fn parse_hex_brace(
1636 &self,
1637 kind: ast::HexLiteralKind,
1638 ) -> Result<ast::Literal> {
1639 use std::char;
1640 use std::u32;
1641
1642 let mut scratch = self.parser().scratch.borrow_mut();
1643 scratch.clear();
1644
1645 let brace_pos = self.pos();
1646 let start = self.span_char().end;
1647 while self.bump_and_bump_space() && self.char() != '}' {
1648 if !is_hex(self.char()) {
1649 return Err(self.error(
1650 self.span_char(),
1651 ast::ErrorKind::EscapeHexInvalidDigit,
1652 ));
1653 }
1654 scratch.push(self.char());
1655 }
1656 if self.is_eof() {
1657 return Err(self.error(
1658 Span::new(brace_pos, self.pos()),
1659 ast::ErrorKind::EscapeUnexpectedEof,
1660 ));
1661 }
1662 let end = self.pos();
1663 let hex = scratch.as_str();
1664 assert_eq!(self.char(), '}');
1665 self.bump_and_bump_space();
1666
1667 if hex.is_empty() {
1668 return Err(self.error(
1669 Span::new(brace_pos, self.pos()),
1670 ast::ErrorKind::EscapeHexEmpty,
1671 ));
1672 }
1673 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1674 None => Err(self.error(
1675 Span::new(start, end),
1676 ast::ErrorKind::EscapeHexInvalid,
1677 )),
1678 Some(c) => Ok(ast::Literal {
1679 span: Span::new(start, self.pos()),
1680 kind: ast::LiteralKind::HexBrace(kind),
1681 c: c,
1682 }),
1683 }
1684 }
1685
1686 /// Parse a decimal number into a u32 while trimming leading and trailing
1687 /// whitespace.
1688 ///
1689 /// This expects the parser to be positioned at the first position where
1690 /// a decimal digit could occur. This will advance the parser to the byte
1691 /// immediately following the last contiguous decimal digit.
1692 ///
1693 /// If no decimal digit could be found or if there was a problem parsing
1694 /// the complete set of digits into a u32, then an error is returned.
1695 fn parse_decimal(&self) -> Result<u32> {
1696 let mut scratch = self.parser().scratch.borrow_mut();
1697 scratch.clear();
1698
1699 while !self.is_eof() && self.char().is_whitespace() {
1700 self.bump();
1701 }
1702 let start = self.pos();
1703 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1704 scratch.push(self.char());
1705 self.bump_and_bump_space();
1706 }
1707 let span = Span::new(start, self.pos());
1708 while !self.is_eof() && self.char().is_whitespace() {
1709 self.bump_and_bump_space();
1710 }
1711 let digits = scratch.as_str();
1712 if digits.is_empty() {
1713 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1714 }
1715 match u32::from_str_radix(digits, 10).ok() {
1716 Some(n) => Ok(n),
1717 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1718 }
1719 }
1720
1721 /// Parse a standard character class consisting primarily of characters or
1722 /// character ranges, but can also contain nested character classes of
1723 /// any type (sans `.`).
1724 ///
1725 /// This assumes the parser is positioned at the opening `[`. If parsing
1726 /// is successful, then the parser is advanced to the position immediately
1727 /// following the closing `]`.
1728 fn parse_set_class(&self) -> Result<ast::Class> {
1729 assert_eq!(self.char(), '[');
1730
1731 let mut union = ast::ClassSetUnion {
1732 span: self.span(),
1733 items: vec![],
1734 };
1735 loop {
1736 self.bump_space();
1737 if self.is_eof() {
1738 return Err(self.unclosed_class_error());
1739 }
1740 match self.char() {
1741 '[' => {
1742 // If we've already parsed the opening bracket, then
1743 // attempt to treat this as the beginning of an ASCII
1744 // class. If ASCII class parsing fails, then the parser
1745 // backs up to `[`.
1746 if !self.parser().stack_class.borrow().is_empty() {
1747 if let Some(cls) = self.maybe_parse_ascii_class() {
1748 union.push(ast::ClassSetItem::Ascii(cls));
1749 continue;
1750 }
1751 }
1752 union = try!(self.push_class_open(union));
1753 }
1754 ']' => {
1755 match try!(self.pop_class(union)) {
1756 Either::Left(nested_union) => { union = nested_union; }
1757 Either::Right(class) => return Ok(class),
1758 }
1759 }
1760 '&' if self.peek() == Some('&') => {
1761 assert!(self.bump_if("&&"));
1762 union = self.push_class_op(
1763 ast::ClassSetBinaryOpKind::Intersection, union);
1764 }
1765 '-' if self.peek() == Some('-') => {
1766 assert!(self.bump_if("--"));
1767 union = self.push_class_op(
1768 ast::ClassSetBinaryOpKind::Difference, union);
1769 }
1770 '~' if self.peek() == Some('~') => {
1771 assert!(self.bump_if("~~"));
1772 union = self.push_class_op(
1773 ast::ClassSetBinaryOpKind::SymmetricDifference, union);
1774 }
1775 _ => {
1776 union.push(try!(self.parse_set_class_range()));
1777 }
1778 }
1779 }
1780 }
1781
1782 /// Parse a single primitive item in a character class set. The item to
1783 /// be parsed can either be one of a simple literal character, a range
1784 /// between two simple literal characters or a "primitive" character
1785 /// class like \w or \p{Greek}.
1786 ///
1787 /// If an invalid escape is found, or if a character class is found where
1788 /// a simple literal is expected (e.g., in a range), then an error is
1789 /// returned.
1790 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1791 let prim1 = try!(self.parse_set_class_item());
1792 self.bump_space();
1793 if self.is_eof() {
1794 return Err(self.unclosed_class_error());
1795 }
1796 // If the next char isn't a `-`, then we don't have a range.
1797 // There are two exceptions. If the char after a `-` is a `]`, then
1798 // `-` is interpreted as a literal `-`. Alternatively, if the char
1799 // after a `-` is a `-`, then `--` corresponds to a "difference"
1800 // operation.
1801 if self.char() != '-'
1802 || self.peek_space() == Some(']')
1803 || self.peek_space() == Some('-')
1804 {
1805 return prim1.into_class_set_item(self);
1806 }
1807 // OK, now we're parsing a range, so bump past the `-` and parse the
1808 // second half of the range.
1809 if !self.bump_and_bump_space() {
1810 return Err(self.unclosed_class_error());
1811 }
1812 let prim2 = try!(self.parse_set_class_item());
1813 let range = ast::ClassSetRange {
1814 span: Span::new(prim1.span().start, prim2.span().end),
1815 start: try!(prim1.into_class_literal(self)),
1816 end: try!(prim2.into_class_literal(self)),
1817 };
1818 if !range.is_valid() {
1819 return Err(self.error(
1820 range.span,
1821 ast::ErrorKind::ClassRangeInvalid,
1822 ));
1823 }
1824 Ok(ast::ClassSetItem::Range(range))
1825 }
1826
1827 /// Parse a single item in a character class as a primitive, where the
1828 /// primitive either consists of a verbatim literal or a single escape
1829 /// sequence.
1830 ///
1831 /// This assumes the parser is positioned at the beginning of a primitive,
1832 /// and advances the parser to the first position after the primitive if
1833 /// successful.
1834 ///
1835 /// Note that it is the caller's responsibility to report an error if an
1836 /// illegal primitive was parsed.
1837 fn parse_set_class_item(&self) -> Result<Primitive> {
1838 if self.char() == '\\' {
1839 self.parse_escape()
1840 } else {
1841 let x = Primitive::Literal(ast::Literal {
1842 span: self.span_char(),
1843 kind: ast::LiteralKind::Verbatim,
1844 c: self.char(),
1845 });
1846 self.bump();
1847 Ok(x)
1848 }
1849 }
1850
1851 /// Parses the opening of a character class set. This includes the opening
1852 /// bracket along with `^` if present to indicate negation. This also
1853 /// starts parsing the opening set of unioned items if applicable, since
1854 /// there are special rules applied to certain characters in the opening
1855 /// of a character class. For example, `[^]]` is the class of all
1856 /// characters not equal to `]`. (`]` would need to be escaped in any other
1857 /// position.) Similarly for `-`.
1858 ///
1859 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1860 /// empty union. This empty union should be replaced with the actual item
1861 /// when it is popped from the parser's stack.
1862 ///
1863 /// This assumes the parser is positioned at the opening `[` and advances
1864 /// the parser to the first non-special byte of the character class.
1865 ///
1866 /// An error is returned if EOF is found.
1867 fn parse_set_class_open(
1868 &self,
1869 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1870 assert_eq!(self.char(), '[');
1871 let start = self.pos();
1872 if !self.bump_and_bump_space() {
1873 return Err(self.error(
1874 Span::new(start, self.pos()),
1875 ast::ErrorKind::ClassUnclosed,
1876 ));
1877 }
1878
1879 let negated =
1880 if self.char() != '^' {
1881 false
1882 } else {
1883 if !self.bump_and_bump_space() {
1884 return Err(self.error(
1885 Span::new(start, self.pos()),
1886 ast::ErrorKind::ClassUnclosed,
1887 ));
1888 }
1889 true
1890 };
1891 // Accept any number of `-` as literal `-`.
1892 let mut union = ast::ClassSetUnion {
1893 span: self.span(),
1894 items: vec![],
1895 };
1896 while self.char() == '-' {
1897 union.push(ast::ClassSetItem::Literal(ast::Literal {
1898 span: self.span_char(),
1899 kind: ast::LiteralKind::Verbatim,
1900 c: '-',
1901 }));
1902 if !self.bump_and_bump_space() {
1903 return Err(self.error(
1904 Span::new(start, self.pos()),
1905 ast::ErrorKind::ClassUnclosed,
1906 ));
1907 }
1908 }
1909 // If `]` is the *first* char in a set, then interpret it as a literal
1910 // `]`. That is, an empty class is impossible to write.
1911 if union.items.is_empty() && self.char() == ']' {
1912 union.push(ast::ClassSetItem::Literal(ast::Literal {
1913 span: self.span_char(),
1914 kind: ast::LiteralKind::Verbatim,
1915 c: ']',
1916 }));
1917 if !self.bump_and_bump_space() {
1918 return Err(self.error(
1919 Span::new(start, self.pos()),
1920 ast::ErrorKind::ClassUnclosed,
1921 ));
1922 }
1923 }
1924 let set = ast::ClassBracketed {
1925 span: Span::new(start, self.pos()),
1926 negated: negated,
1927 kind: ast::ClassSet::union(ast::ClassSetUnion {
1928 span: Span::new(union.span.start, union.span.start),
1929 items: vec![],
1930 }),
1931 };
1932 Ok((set, union))
1933 }
1934
1935 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1936 ///
1937 /// This assumes the parser is positioned at the opening `[`.
1938 ///
1939 /// If no valid ASCII character class could be found, then this does not
1940 /// advance the parser and `None` is returned. Otherwise, the parser is
1941 /// advanced to the first byte following the closing `]` and the
1942 /// corresponding ASCII class is returned.
1943 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1944 // ASCII character classes are interesting from a parsing perspective
1945 // because parsing cannot fail with any interesting error. For example,
1946 // in order to use an ASCII character class, it must be enclosed in
1947 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1948 // of it as "ASCII character characters have the syntax `[:NAME:]`
1949 // which can only appear within character brackets." This means that
1950 // things like `[[:lower:]A]` are legal constructs.
1951 //
1952 // However, if one types an incorrect ASCII character class, e.g.,
1953 // `[[:loower:]]`, then we treat that as a normal nested character
1954 // class containing the characters `:elorw`. One might argue that we
1955 // should return an error instead since the repeated colons give away
1956 // the intent to write an ASCII class. But what if the user typed
1957 // `[[:lower]]` instead? How can we tell that was intended to be an
1958 // ASCII class and not just a normal nested class?
1959 //
1960 // Reasonable people can probably disagree over this, but for better
1961 // or worse, we implement semantics that never fails at the expense
1962 // of better failure modes.
1963 assert_eq!(self.char(), '[');
1964 // If parsing fails, then we back up the parser to this starting point.
1965 let start = self.pos();
1966 let mut negated = false;
1967 if !self.bump() || self.char() != ':' {
1968 self.parser().pos.set(start);
1969 return None;
1970 }
1971 if !self.bump() {
1972 self.parser().pos.set(start);
1973 return None;
1974 }
1975 if self.char() == '^' {
1976 negated = true;
1977 if !self.bump() {
1978 self.parser().pos.set(start);
1979 return None;
1980 }
1981 }
1982 let name_start = self.offset();
1983 while self.char() != ':' && self.bump() {}
1984 if self.is_eof() {
1985 self.parser().pos.set(start);
1986 return None;
1987 }
1988 let name = &self.pattern()[name_start..self.offset()];
1989 if !self.bump_if(":]") {
1990 self.parser().pos.set(start);
1991 return None;
1992 }
1993 let kind = match ast::ClassAsciiKind::from_name(name) {
1994 Some(kind) => kind,
1995 None => {
1996 self.parser().pos.set(start);
1997 return None;
1998 }
1999 };
2000 Some(ast::ClassAscii {
2001 span: Span::new(start, self.pos()),
2002 kind: kind,
2003 negated: negated,
2004 })
2005 }
2006
2007 /// Parse a Unicode class in either the single character notation, `\pN`
2008 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2009 /// the parser is positioned at the `p` (or `P` for negation) and will
2010 /// advance the parser to the character immediately following the class.
2011 ///
2012 /// Note that this does not check whether the class name is valid or not.
2013 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2014 assert!(self.char() == 'p' || self.char() == 'P');
2015
2016 let mut scratch = self.parser().scratch.borrow_mut();
2017 scratch.clear();
2018
2019 let negated = self.char() == 'P';
2020 if !self.bump_and_bump_space() {
2021 return Err(self.error(
2022 self.span(),
2023 ast::ErrorKind::EscapeUnexpectedEof,
2024 ));
2025 }
2026 let (start, kind) =
2027 if self.char() == '{' {
2028 let start = self.span_char().end;
2029 while self.bump_and_bump_space() && self.char() != '}' {
2030 scratch.push(self.char());
2031 }
2032 if self.is_eof() {
2033 return Err(self.error(
2034 self.span(),
2035 ast::ErrorKind::EscapeUnexpectedEof,
2036 ));
2037 }
2038 assert_eq!(self.char(), '}');
2039 self.bump();
2040
2041 let name = scratch.as_str();
2042 if let Some(i) = name.find("!=") {
2043 (start, ast::ClassUnicodeKind::NamedValue {
2044 op: ast::ClassUnicodeOpKind::NotEqual,
2045 name: name[..i].to_string(),
2046 value: name[i+2..].to_string(),
2047 })
2048 } else if let Some(i) = name.find(':') {
2049 (start, ast::ClassUnicodeKind::NamedValue {
2050 op: ast::ClassUnicodeOpKind::Colon,
2051 name: name[..i].to_string(),
2052 value: name[i+1..].to_string(),
2053 })
2054 } else if let Some(i) = name.find('=') {
2055 (start, ast::ClassUnicodeKind::NamedValue {
2056 op: ast::ClassUnicodeOpKind::Equal,
2057 name: name[..i].to_string(),
2058 value: name[i+1..].to_string(),
2059 })
2060 } else {
2061 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2062 }
2063 } else {
2064 let start = self.pos();
2065 let c = self.char();
2066 self.bump_and_bump_space();
2067 let kind = ast::ClassUnicodeKind::OneLetter(c);
2068 (start, kind)
2069 };
2070 Ok(ast::ClassUnicode {
2071 span: Span::new(start, self.pos()),
2072 negated: negated,
2073 kind: kind,
2074 })
2075 }
2076
2077 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2078 /// parser is currently at a valid character class name and will be
2079 /// advanced to the character immediately following the class.
2080 fn parse_perl_class(&self) -> ast::ClassPerl {
2081 let c = self.char();
2082 let span = self.span_char();
2083 self.bump();
2084 let (negated, kind) = match c {
2085 'd' => (false, ast::ClassPerlKind::Digit),
2086 'D' => (true, ast::ClassPerlKind::Digit),
2087 's' => (false, ast::ClassPerlKind::Space),
2088 'S' => (true, ast::ClassPerlKind::Space),
2089 'w' => (false, ast::ClassPerlKind::Word),
2090 'W' => (true, ast::ClassPerlKind::Word),
2091 c => panic!("expected valid Perl class but got '{}'", c),
2092 };
2093 ast::ClassPerl { span: span, kind: kind, negated: negated }
2094 }
2095}
2096
2097/// A type that traverses a fully parsed Ast and checks whether its depth
2098/// exceeds the specified nesting limit. If it does, then an error is returned.
2099#[derive(Debug)]
2100struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2101 /// The parser that is checking the nest limit.
2102 p: &'p ParserI<'s, P>,
2103 /// The current depth while walking an Ast.
2104 depth: u32,
2105}
2106
2107impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2108 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2109 NestLimiter { p: p, depth: 0 }
2110 }
2111
2112 fn check(self, ast: &Ast) -> Result<()> {
2113 ast::visit(ast, self)
2114 }
2115
2116 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2117 let new = try!(self.depth.checked_add(1).ok_or_else(|| self.p.error(
2118 span.clone(),
2119 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2120 )));
2121 let limit = self.p.parser().nest_limit;
2122 if new > limit {
2123 return Err(self.p.error(
2124 span.clone(),
2125 ast::ErrorKind::NestLimitExceeded(limit),
2126 ));
2127 }
2128 self.depth = new;
2129 Ok(())
2130 }
2131
2132 fn decrement_depth(&mut self) {
2133 // Assuming the correctness of the visitor, this should never drop
2134 // below 0.
2135 self.depth = self.depth.checked_sub(1).unwrap();
2136 }
2137}
2138
2139impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2140 type Output = ();
2141 type Err = ast::Error;
2142
2143 fn finish(self) -> Result<()> {
2144 Ok(())
2145 }
2146
2147 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2148 let span = match *ast {
2149 Ast::Empty(_)
2150 | Ast::Flags(_)
2151 | Ast::Literal(_)
2152 | Ast::Dot(_)
2153 | Ast::Assertion(_)
2154 | Ast::Class(ast::Class::Unicode(_))
2155 | Ast::Class(ast::Class::Perl(_)) => {
2156 // These are all base cases, so we don't increment depth.
2157 return Ok(());
2158 }
2159 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2160 Ast::Repetition(ref x) => &x.span,
2161 Ast::Group(ref x) => &x.span,
2162 Ast::Alternation(ref x) => &x.span,
2163 Ast::Concat(ref x) => &x.span,
2164 };
2165 self.increment_depth(span)
2166 }
2167
2168 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2169 match *ast {
2170 Ast::Empty(_)
2171 | Ast::Flags(_)
2172 | Ast::Literal(_)
2173 | Ast::Dot(_)
2174 | Ast::Assertion(_)
2175 | Ast::Class(ast::Class::Unicode(_))
2176 | Ast::Class(ast::Class::Perl(_)) => {
2177 // These are all base cases, so we don't decrement depth.
2178 Ok(())
2179 }
2180 Ast::Class(ast::Class::Bracketed(_))
2181 | Ast::Repetition(_)
2182 | Ast::Group(_)
2183 | Ast::Alternation(_)
2184 | Ast::Concat(_) => {
2185 self.decrement_depth();
2186 Ok(())
2187 }
2188 }
2189 }
2190
2191 fn visit_class_set_item_pre(
2192 &mut self,
2193 ast: &ast::ClassSetItem,
2194 ) -> Result<()> {
2195 let span = match *ast {
2196 ast::ClassSetItem::Empty(_)
2197 | ast::ClassSetItem::Literal(_)
2198 | ast::ClassSetItem::Range(_)
2199 | ast::ClassSetItem::Ascii(_)
2200 | ast::ClassSetItem::Unicode(_)
2201 | ast::ClassSetItem::Perl(_) => {
2202 // These are all base cases, so we don't increment depth.
2203 return Ok(());
2204 }
2205 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2206 ast::ClassSetItem::Union(ref x) => &x.span,
2207 };
2208 self.increment_depth(span)
2209 }
2210
2211 fn visit_class_set_item_post(
2212 &mut self,
2213 ast: &ast::ClassSetItem,
2214 ) -> Result<()> {
2215 match *ast {
2216 ast::ClassSetItem::Empty(_)
2217 | ast::ClassSetItem::Literal(_)
2218 | ast::ClassSetItem::Range(_)
2219 | ast::ClassSetItem::Ascii(_)
2220 | ast::ClassSetItem::Unicode(_)
2221 | ast::ClassSetItem::Perl(_) => {
2222 // These are all base cases, so we don't decrement depth.
2223 Ok(())
2224 }
2225 ast::ClassSetItem::Bracketed(_)
2226 | ast::ClassSetItem::Union(_) => {
2227 self.decrement_depth();
2228 Ok(())
2229 }
2230 }
2231 }
2232
2233 fn visit_class_set_binary_op_pre(
2234 &mut self,
2235 ast: &ast::ClassSetBinaryOp,
2236 ) -> Result<()> {
2237 self.increment_depth(&ast.span)
2238 }
2239
2240 fn visit_class_set_binary_op_post(
2241 &mut self,
2242 _ast: &ast::ClassSetBinaryOp,
2243 ) -> Result<()> {
2244 self.decrement_depth();
2245 Ok(())
2246 }
2247}
2248
2249#[cfg(test)]
2250mod tests {
2251 use std::ops::Range;
2252
2253 use ast::{self, Ast, Position, Span};
2254 use super::{Parser, ParserI, ParserBuilder, Primitive};
2255
// A local replacement for the standard `assert_eq!` that prints each side
// on its own line, which is slightly easier to read (though still not
// great) for large debug output.
macro_rules! assert_eq {
    ($lhs:expr, $rhs:expr) => ({
        // Matching on a tuple of references evaluates each operand exactly
        // once and keeps temporaries alive for the comparison.
        match (&$lhs, &$rhs) {
            (lhs_val, rhs_val) => {
                if !(*lhs_val == *rhs_val) {
                    panic!(
                        "assertion failed: `(left == right)`\n\nleft: `{:?}`\nright: `{:?}`\n\n",
                        lhs_val, rhs_val)
                }
            }
        }
    });
}
2271
2272 // We create these errors to compare with real ast::Errors in the tests.
2273 // We define equality between TestError and ast::Error to disregard the
2274 // pattern string in ast::Error, which is annoying to provide in tests.
2275 #[derive(Clone, Debug)]
2276 struct TestError {
2277 span: Span,
2278 kind: ast::ErrorKind,
2279 }
2280
2281 impl PartialEq<ast::Error> for TestError {
2282 fn eq(&self, other: &ast::Error) -> bool {
2283 self.span == other.span && self.kind == other.kind
2284 }
2285 }
2286
2287 impl PartialEq<TestError> for ast::Error {
2288 fn eq(&self, other: &TestError) -> bool {
2289 self.span == other.span && self.kind == other.kind
2290 }
2291 }
2292
/// Short alias for turning a string slice into an owned `String`.
fn s(str: &str) -> String {
    String::from(str)
}
2296
2297 fn parser(pattern: &str) -> ParserI<Parser> {
2298 ParserI::new(Parser::new(), pattern)
2299 }
2300
2301 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2302 let parser = ParserBuilder::new().octal(true).build();
2303 ParserI::new(parser, pattern)
2304 }
2305
2306 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2307 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2308 ParserI::new(p, pattern)
2309 }
2310
2311 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2312 let p = ParserBuilder::new().ignore_whitespace(true).build();
2313 ParserI::new(p, pattern)
2314 }
2315
2316 /// Short alias for creating a new span.
2317 fn nspan(start: Position, end: Position) -> Span {
2318 Span::new(start, end)
2319 }
2320
2321 /// Short alias for creating a new position.
2322 fn npos(offset: usize, line: usize, column: usize) -> Position {
2323 Position::new(offset, line, column)
2324 }
2325
2326 /// Create a new span from the given offset range. This assumes a single
2327 /// line and sets the columns based on the offsets. i.e., This only works
2328 /// out of the box for ASCII, which is fine for most tests.
2329 fn span(range: Range<usize>) -> Span {
2330 let start = Position::new(range.start, 1, range.start + 1);
2331 let end = Position::new(range.end, 1, range.end + 1);
2332 Span::new(start, end)
2333 }
2334
2335 /// Create a new span for the corresponding byte range in the given string.
2336 fn span_range(subject: &str, range: Range<usize>) -> Span {
2337 let start = Position {
2338 offset: range.start,
2339 line: 1 + subject[..range.start].matches('\n').count(),
2340 column: 1 + subject[..range.start]
2341 .chars()
2342 .rev()
2343 .position(|c| c == '\n')
2344 .unwrap_or(subject[..range.start].chars().count()),
2345 };
2346 let end = Position {
2347 offset: range.end,
2348 line: 1 + subject[..range.end].matches('\n').count(),
2349 column: 1 + subject[..range.end]
2350 .chars()
2351 .rev()
2352 .position(|c| c == '\n')
2353 .unwrap_or(subject[..range.end].chars().count()),
2354 };
2355 Span::new(start, end)
2356 }
2357
2358 /// Create a verbatim literal starting at the given position.
2359 fn lit(c: char, start: usize) -> Ast {
2360 lit_with(c, span(start..start + c.len_utf8()))
2361 }
2362
2363 /// Create a punctuation literal starting at the given position.
2364 fn punct_lit(c: char, span: Span) -> Ast {
2365 Ast::Literal(ast::Literal {
2366 span: span,
2367 kind: ast::LiteralKind::Punctuation,
2368 c: c,
2369 })
2370 }
2371
2372 /// Create a verbatim literal with the given span.
2373 fn lit_with(c: char, span: Span) -> Ast {
2374 Ast::Literal(ast::Literal {
2375 span: span,
2376 kind: ast::LiteralKind::Verbatim,
2377 c: c,
2378 })
2379 }
2380
2381 /// Create a concatenation with the given range.
2382 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2383 concat_with(span(range), asts)
2384 }
2385
2386 /// Create a concatenation with the given span.
2387 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2388 Ast::Concat(ast::Concat { span: span, asts: asts })
2389 }
2390
2391 /// Create an alternation with the given span.
2392 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2393 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2394 }
2395
2396 /// Create a capturing group with the given span.
2397 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2398 Ast::Group(ast::Group {
2399 span: span(range),
2400 kind: ast::GroupKind::CaptureIndex(index),
2401 ast: Box::new(ast),
2402 })
2403 }
2404
2405 /// Create an ast::SetFlags.
2406 ///
2407 /// The given pattern should be the full pattern string. The range given
2408 /// should correspond to the byte offsets where the flag set occurs.
2409 ///
2410 /// If negated is true, then the set is interpreted as beginning with a
2411 /// negation.
2412 fn flag_set(
2413 pat: &str,
2414 range: Range<usize>,
2415 flag: ast::Flag,
2416 negated: bool,
2417 ) -> Ast {
2418 let mut items = vec![
2419 ast::FlagsItem {
2420 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2421 kind: ast::FlagsItemKind::Flag(flag),
2422 },
2423 ];
2424 if negated {
2425 items.insert(0, ast::FlagsItem {
2426 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2427 kind: ast::FlagsItemKind::Negation,
2428 });
2429 }
2430 Ast::Flags(ast::SetFlags {
2431 span: span_range(pat, range.clone()),
2432 flags: ast::Flags {
2433 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2434 items: items,
2435 },
2436 })
2437 }
2438
/// Checks how much nesting each kind of construct consumes and which span
/// is blamed when the limit is exceeded.
#[test]
fn parse_nest_limit() {
    // Parses `pattern` under `limit` and expects `NestLimitExceeded(limit)`
    // reported at `range`.
    let assert_too_deep = |pattern: &str, limit: u32, range: Range<usize>| {
        assert_eq!(
            parser_nest_limit(pattern, limit).parse().unwrap_err(),
            TestError {
                span: span(range),
                kind: ast::ErrorKind::NestLimitExceeded(limit),
            });
    };
    // Builds a greedy repetition node.
    let greedy_rep = |whole: Range<usize>,
                      op: Range<usize>,
                      kind: ast::RepetitionKind,
                      inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: true,
            ast: Box::new(inner),
        })
    };

    // Even with a limit of zero, leaf-only patterns are accepted.
    assert_eq!(
        parser_nest_limit("", 0).parse(),
        Ok(Ast::Empty(span(0..0))));
    assert_eq!(
        parser_nest_limit("a", 0).parse(),
        Ok(lit('a', 0)));

    // Each repetition operator costs one level.
    assert_too_deep("a+", 0, 0..2);
    assert_eq!(
        parser_nest_limit("a+", 1).parse(),
        Ok(greedy_rep(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            lit('a', 0))));
    assert_too_deep("(a)+", 1, 0..3);
    assert_too_deep("a+*", 1, 0..2);
    assert_eq!(
        parser_nest_limit("a+*", 2).parse(),
        Ok(greedy_rep(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            greedy_rep(
                0..2,
                1..2,
                ast::RepetitionKind::OneOrMore,
                lit('a', 0)))));

    // A concatenation costs one level no matter how many items it has.
    assert_too_deep("ab", 0, 0..2);
    assert_eq!(
        parser_nest_limit("ab", 1).parse(),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])));
    assert_eq!(
        parser_nest_limit("abc", 1).parse(),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])));

    // Likewise for an alternation.
    assert_too_deep("a|b", 0, 0..3);
    assert_eq!(
        parser_nest_limit("a|b", 1).parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])));
    assert_eq!(
        parser_nest_limit("a|b|c", 1).parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])));

    // Character classes have their own small recursive syntax.
    assert_too_deep("[a]", 0, 0..3);
    assert_eq!(
        parser_nest_limit("[a]", 1).parse(),
        Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
            span: span(0..3),
            negated: false,
            kind: ast::ClassSet::Item(
                ast::ClassSetItem::Literal(ast::Literal {
                    span: span(1..2),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'a',
                })
            ),
        }))));
    assert_too_deep("[ab]", 1, 1..3);
    assert_too_deep("[ab[cd]]", 2, 3..7);
    assert_too_deep("[ab[cd]]", 3, 4..6);
    assert_too_deep("[a--b]", 1, 1..5);
    assert_too_deep("[a--bc]", 2, 4..6);
}
2579
/// Checks that comments in whitespace-insensitive mode are stripped from
/// the AST and collected, with their spans, in the order they appear.
#[test]
fn parse_comments() {
    // Every offset asserted below depends on the exact bytes of this
    // pattern, including the two spaces that indent the third comment. The
    // pattern is assembled from one explicit piece per line so that editor
    // or tooling whitespace changes cannot silently shift those offsets.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "  # This is comment 3.\n",
        "bar\n",
        "# This is comment 4.",
    );
    // Guard against accidental edits to the pattern's whitespace.
    assert_eq!(pat.len(), 98);

    let astc = parser(pat).parse_with_comments().unwrap();
    assert_eq!(
        astc.ast,
        concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            lit_with('f', span_range(pat, 26..27)),
            lit_with('o', span_range(pat, 27..28)),
            lit_with('o', span_range(pat, 28..29)),
            lit_with('b', span_range(pat, 74..75)),
            lit_with('a', span_range(pat, 75..76)),
            lit_with('r', span_range(pat, 76..77)),
        ]));
    // Each comment span starts at the `#`. For all but the last comment it
    // also covers the terminating newline.
    assert_eq!(astc.comments, vec![
        ast::Comment {
            span: span_range(pat, 5..26),
            comment: s(" This is comment 1."),
        },
        ast::Comment {
            span: span_range(pat, 30..51),
            comment: s(" This is comment 2."),
        },
        ast::Comment {
            span: span_range(pat, 53..74),
            comment: s(" This is comment 3."),
        },
        ast::Comment {
            span: span_range(pat, 78..98),
            comment: s(" This is comment 4."),
        },
    ]);
}
2619
/// Whole-pattern checks: a lone `]` is a literal, and every escapable
/// punctuation character parses to a punctuation literal.
#[test]
fn parse_holistic() {
    assert_eq!(
        parser("]").parse(),
        Ok(lit(']', 0)));

    // The pattern escapes each of these characters in turn, so the i-th
    // literal occupies the two bytes starting at offset 2 * i.
    let escaped = r"\.+*?()|[]{}^$#&-~";
    let expected: Vec<Ast> = escaped
        .chars()
        .enumerate()
        .map(|(i, c)| punct_lit(c, span(2 * i..2 * i + 2)))
        .collect();
    assert_eq!(
        parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
        Ok(concat(0..36, expected)));
}
2648
/// Checks whitespace-insensitive (`x`) mode: basic behavior, toggling,
/// scoping to a group, whitespace after an opening parenthesis, whitespace
/// inside a braced hex escape, and an escaped space.
///
/// Several patterns contain runs of more than one space, and the offsets
/// asserted for them depend on the exact count. Those patterns are called
/// out where they are defined.
#[test]
fn parse_ignore_whitespace() {
    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
            lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
        ])));

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
            lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
            flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
            lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
            lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
            lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
        ])));

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..11), vec![
            lit_with('a', span_range(pat, 0..1)),
            lit_with(' ', span_range(pat, 1..2)),
            Ast::Group(ast::Group {
                span: span_range(pat, 2..9),
                kind: ast::GroupKind::NonCapturing(ast::Flags {
                    span: span_range(pat, 4..5),
                    items: vec![
                        ast::FlagsItem {
                            span: span_range(pat, 4..5),
                            kind: ast::FlagsItemKind::Flag(
                                ast::Flag::IgnoreWhitespace),
                        },
                    ],
                }),
                ast: Box::new(lit_with('a', span_range(pat, 6..7))),
            }),
            lit_with('a', span_range(pat, 9..10)),
            lit_with(' ', span_range(pat, 10..11)),
        ])));

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::CaptureName(ast::CaptureName {
                    span: span_range(pat, 9..12),
                    name: s("foo"),
                    index: 1,
                }),
                ast: Box::new(lit_with('a', span_range(pat, 14..15))),
            }),
        ])));
    // NOTE: exactly two spaces follow the `(`, which places `a` at byte 7.
    let pat = "(?x)(  a )";
    assert_eq!(pat.len(), 10);
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::CaptureIndex(1),
                ast: Box::new(lit_with('a', span_range(pat, 7..8))),
            }),
        ])));
    // NOTE: exactly two spaces follow the `(` and two more follow the `?:`,
    // which places the `:` at byte 8 and `a` at byte 11.
    let pat = "(?x)(  ?:  a )";
    assert_eq!(pat.len(), 14);
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::NonCapturing(ast::Flags {
                    span: span_range(pat, 8..8),
                    items: vec![],
                }),
                ast: Box::new(lit_with('a', span_range(pat, 11..12))),
            }),
        ])));
    let pat = r"(?x)\x { 53 }";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            Ast::Literal(ast::Literal {
                span: span(4..13),
                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
                c: 'S',
            }),
        ])));

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            Ast::Literal(ast::Literal {
                span: span_range(pat, 4..6),
                kind: ast::LiteralKind::Special(
                    ast::SpecialLiteralKind::Space),
                c: ' ',
            }),
        ])));
    // ... but only when `x` mode is enabled.
    let pat = r"\ ";
    assert_eq!(
        parser(pat).parse().unwrap_err(),
        TestError {
            span: span_range(pat, 0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        });
}
2775
/// Checks that newlines are ordinary literals and that line and column
/// numbers advance correctly across them.
#[test]
fn parse_newlines() {
    let pat = ".\n.";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..3), vec![
            Ast::Dot(span_range(pat, 0..1)),
            lit_with('\n', span_range(pat, 1..2)),
            Ast::Dot(span_range(pat, 2..3)),
        ])));

    let pat = "foobar\nbaz\nquux\n";
    // The (offset, line, column) boundary before each character, plus the
    // one after the last. Each literal spans two consecutive boundaries.
    let stops = [
        (0, 1, 1), (1, 1, 2), (2, 1, 3), (3, 1, 4), (4, 1, 5), (5, 1, 6),
        (6, 1, 7),
        (7, 2, 1), (8, 2, 2), (9, 2, 3), (10, 2, 4),
        (11, 3, 1), (12, 3, 2), (13, 3, 3), (14, 3, 4), (15, 3, 5),
        (16, 4, 1),
    ];
    let at = |&(offset, line, column): &(usize, usize, usize)| {
        npos(offset, line, column)
    };
    let expected: Vec<Ast> = pat
        .chars()
        .zip(stops.windows(2))
        .map(|(c, pair)| lit_with(c, nspan(at(&pair[0]), at(&pair[1]))))
        .collect();
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), expected)));
}
2809
/// Checks the `*`, `+` and `?` operators: greedy and lazy forms, which
/// sub-expression they bind to, and the error reported when there is
/// nothing to repeat.
#[test]
fn parse_uncounted_repetition() {
    // Builds a repetition node from its overall range, the operator's
    // range, the operator kind, greediness and the repeated expression.
    let rep = |whole: Range<usize>,
               op: Range<usize>,
               kind: ast::RepetitionKind,
               greedy: bool,
               inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: greedy,
            ast: Box::new(inner),
        })
    };

    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(
            0..2, 1..2, ast::RepetitionKind::ZeroOrMore, true,
            lit('a', 0))));
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(
            0..2, 1..2, ast::RepetitionKind::OneOrMore, true,
            lit('a', 0))));

    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(
            0..2, 1..2, ast::RepetitionKind::ZeroOrOne, true,
            lit('a', 0))));
    // A trailing `?` makes the operator lazy and is part of its span.
    assert_eq!(
        parser(r"a??").parse(),
        Ok(rep(
            0..3, 1..3, ast::RepetitionKind::ZeroOrOne, false,
            lit('a', 0))));
    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(
            0..2, 1..2, ast::RepetitionKind::ZeroOrOne, true,
            lit('a', 0))));
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(0..3, vec![
            rep(
                0..2, 1..2, ast::RepetitionKind::ZeroOrOne, true,
                lit('a', 0)),
            lit('b', 2),
        ])));
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(0..4, vec![
            rep(
                0..3, 1..3, ast::RepetitionKind::ZeroOrOne, false,
                lit('a', 0)),
            lit('b', 3),
        ])));
    // The operator binds only to the expression immediately before it.
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(0..3, vec![
            lit('a', 0),
            rep(
                1..3, 2..3, ast::RepetitionKind::ZeroOrOne, true,
                lit('b', 1)),
        ])));
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(rep(
            0..5, 4..5, ast::RepetitionKind::ZeroOrOne, true,
            group(0..4, 1, concat(1..3, vec![
                lit('a', 1),
                lit('b', 2),
            ])))));
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(0..3, vec![
            Ast::Empty(span(0..0)),
            rep(
                1..3, 2..3, ast::RepetitionKind::ZeroOrOne, true,
                lit('a', 1)),
        ])));

    // An operator with nothing before it is an error, reported as an empty
    // span at the offset where an expression was expected.
    let nothing_to_repeat: [(&str, usize); 9] = [
        (r"*", 0),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for &(pattern, offset) in nothing_to_repeat.iter() {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(offset..offset),
                kind: ast::ErrorKind::RepetitionMissing,
            });
    }
}
2994
/// Exercises counted repetitions (`{m}`, `{m,}`, `{m,n}`): the AST produced
/// for well-formed counts and the error reported for each malformed one.
#[test]
fn parse_counted_repetition() {
    // Builds the expected repetition node. `whole` is the span of the operand
    // together with its operator, `op` is the span of the operator alone.
    fn counted(
        whole: ::std::ops::Range<usize>,
        op: ::std::ops::Range<usize>,
        range: ast::RepetitionRange,
        greedy: bool,
        operand: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp {
                span: span(op),
                kind: ast::RepetitionKind::Range(range),
            },
            greedy: greedy,
            ast: Box::new(operand),
        })
    }

    // Patterns that must parse, paired with the AST they must produce.
    let accepted = vec![
        (
            r"a{5}",
            counted(
                0..4, 1..4,
                ast::RepetitionRange::Exactly(5), true, lit('a', 0)),
        ),
        (
            r"a{5,}",
            counted(
                0..5, 1..5,
                ast::RepetitionRange::AtLeast(5), true, lit('a', 0)),
        ),
        (
            r"a{5,9}",
            counted(
                0..6, 1..6,
                ast::RepetitionRange::Bounded(5, 9), true, lit('a', 0)),
        ),
        (
            r"a{5}?",
            counted(
                0..5, 1..5,
                ast::RepetitionRange::Exactly(5), false, lit('a', 0)),
        ),
        (
            r"ab{5}",
            concat(0..5, vec![
                lit('a', 0),
                counted(
                    1..5, 2..5,
                    ast::RepetitionRange::Exactly(5), true, lit('b', 1)),
            ]),
        ),
        (
            r"ab{5}c",
            concat(0..6, vec![
                lit('a', 0),
                counted(
                    1..5, 2..5,
                    ast::RepetitionRange::Exactly(5), true, lit('b', 1)),
                lit('c', 5),
            ]),
        ),
        // Spaces inside the braces are accepted.
        (
            r"a{ 5 }",
            counted(
                0..6, 1..6,
                ast::RepetitionRange::Exactly(5), true, lit('a', 0)),
        ),
        (
            r"a{ 5 , 9 }",
            counted(
                0..10, 1..10,
                ast::RepetitionRange::Bounded(5, 9), true, lit('a', 0)),
        ),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse(), Ok(expected));
    }

    // With the whitespace-ignoring parser, the `?` after the space still
    // makes the repetition non-greedy and is included in the operator span.
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(
            0..8, 1..8,
            ast::RepetitionRange::Bounded(5, 9), false, lit('a', 0))));

    // Patterns that must be rejected, with the expected error span and kind.
    let rejected = vec![
        (r"a{", 1..2, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{}", 2..2, ast::ErrorKind::DecimalEmpty),
        (r"a{a", 2..2, ast::ErrorKind::DecimalEmpty),
        (r"a{9999999999}", 2..12, ast::ErrorKind::DecimalInvalid),
        (r"a{9", 1..3, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,a", 4..4, ast::ErrorKind::DecimalEmpty),
        (r"a{9,9999999999}", 4..14, ast::ErrorKind::DecimalInvalid),
        (r"a{9,", 1..4, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,11", 1..6, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{2,1}", 1..6, ast::ErrorKind::RepetitionCountInvalid),
        (r"{5}", 0..0, ast::ErrorKind::RepetitionMissing),
        (r"|{5}", 1..1, ast::ErrorKind::RepetitionMissing),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3187
/// Exercises alternation: flat, grouped and nested alternatives, empty
/// branches, and the errors for unbalanced parentheses around them.
#[test]
fn parse_alternate() {
    // An explicitly constructed alternation node.
    fn branches(range: ::std::ops::Range<usize>, asts: Vec<Ast>) -> Ast {
        Ast::Alternation(ast::Alternation {
            span: span(range),
            asts: asts,
        })
    }

    // Concatenation of two adjacent one-byte literals starting at `at`.
    fn pair(first: char, second: char, at: usize) -> Ast {
        concat(at..at + 2, vec![lit(first, at), lit(second, at + 1)])
    }

    // An empty expression located at offset `at`.
    fn empty(at: usize) -> Ast {
        Ast::Empty(span(at..at))
    }

    let accepted = vec![
        (r"a|b", branches(0..3, vec![lit('a', 0), lit('b', 2)])),
        (
            r"(a|b)",
            group(0..5, 1, branches(1..4, vec![lit('a', 1), lit('b', 3)])),
        ),
        (
            r"a|b|c",
            branches(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]),
        ),
        (
            r"ax|by|cz",
            branches(0..8, vec![
                pair('a', 'x', 0),
                pair('b', 'y', 3),
                pair('c', 'z', 6),
            ]),
        ),
        (
            r"(ax|by|cz)",
            group(0..10, 1, branches(1..9, vec![
                pair('a', 'x', 1),
                pair('b', 'y', 4),
                pair('c', 'z', 7),
            ])),
        ),
        (
            r"(ax|(by|(cz)))",
            group(0..14, 1, alt(1..13, vec![
                pair('a', 'x', 1),
                group(4..13, 2, alt(5..12, vec![
                    pair('b', 'y', 5),
                    group(8..12, 3, pair('c', 'z', 9)),
                ])),
            ])),
        ),
        // Branches on either side of `|` may be empty.
        (r"|", alt(0..1, vec![empty(0), empty(1)])),
        (r"||", alt(0..2, vec![empty(0), empty(1), empty(2)])),
        (r"a|", alt(0..2, vec![lit('a', 0), empty(2)])),
        (r"|a", alt(0..2, vec![empty(0), lit('a', 1)])),
        (r"(|)", group(0..3, 1, alt(1..2, vec![empty(1), empty(2)]))),
        (r"(a|)", group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)]))),
        (r"(|a)", group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)]))),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse(), Ok(expected));
    }

    let rejected = vec![
        (r"a|b)", 3..4, ast::ErrorKind::GroupUnopened),
        (r"(a|b", 0..1, ast::ErrorKind::GroupUnclosed),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3287
/// Each look-around opener must be rejected with `UnsupportedLookAround`,
/// with the error span covering just the opening token.
#[test]
fn parse_unsupported_lookaround() {
    // (pattern, end offset of the opening token)
    let openers = [
        (r"(?=a)", 3),
        (r"(?!a)", 3),
        (r"(?<=a)", 4),
        (r"(?<!a)", 4),
    ];
    for &(pattern, end) in &openers {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(0..end),
                kind: ast::ErrorKind::UnsupportedLookAround,
            });
    }
}
3315
/// Exercises group syntax: bare flag settings, capturing groups,
/// non-capturing groups with and without flags, and unbalanced or truncated
/// group openers.
#[test]
fn parse_group() {
    fn case_insensitive() -> ast::FlagsItemKind {
        ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive)
    }

    fn swap_greed() -> ast::FlagsItemKind {
        ast::FlagsItemKind::Flag(ast::Flag::SwapGreed)
    }

    // A flag item occupying the single byte at offset `at`.
    fn item(at: usize, kind: ast::FlagsItemKind) -> ast::FlagsItem {
        ast::FlagsItem { span: span(at..at + 1), kind: kind }
    }

    fn flags(
        range: ::std::ops::Range<usize>,
        items: Vec<ast::FlagsItem>,
    ) -> ast::Flags {
        ast::Flags { span: span(range), items: items }
    }

    // A flag-setting directive such as `(?i)`.
    fn set(range: ::std::ops::Range<usize>, flags: ast::Flags) -> Ast {
        Ast::Flags(ast::SetFlags { span: span(range), flags: flags })
    }

    fn grp(
        range: ::std::ops::Range<usize>,
        kind: ast::GroupKind,
        inner: Ast,
    ) -> Ast {
        Ast::Group(ast::Group {
            span: span(range),
            kind: kind,
            ast: Box::new(inner),
        })
    }

    let accepted = vec![
        // Flag directives.
        ("(?i)", set(0..4, flags(2..3, vec![item(2, case_insensitive())]))),
        (
            "(?iU)",
            set(0..5, flags(2..4, vec![
                item(2, case_insensitive()),
                item(3, swap_greed()),
            ])),
        ),
        (
            "(?i-U)",
            set(0..6, flags(2..5, vec![
                item(2, case_insensitive()),
                item(3, ast::FlagsItemKind::Negation),
                item(4, swap_greed()),
            ])),
        ),
        // Capturing groups, numbered in order of their opening parenthesis.
        (
            "()",
            grp(
                0..2,
                ast::GroupKind::CaptureIndex(1),
                Ast::Empty(span(1..1))),
        ),
        ("(a)", grp(0..3, ast::GroupKind::CaptureIndex(1), lit('a', 1))),
        (
            "(())",
            grp(
                0..4,
                ast::GroupKind::CaptureIndex(1),
                grp(
                    1..3,
                    ast::GroupKind::CaptureIndex(2),
                    Ast::Empty(span(2..2)))),
        ),
        // Non-capturing groups.
        (
            "(?:a)",
            grp(
                0..5,
                ast::GroupKind::NonCapturing(flags(2..2, vec![])),
                lit('a', 3)),
        ),
        (
            "(?i:a)",
            grp(
                0..6,
                ast::GroupKind::NonCapturing(flags(2..3, vec![
                    item(2, case_insensitive()),
                ])),
                lit('a', 4)),
        ),
        (
            "(?i-U:a)",
            grp(
                0..8,
                ast::GroupKind::NonCapturing(flags(2..5, vec![
                    item(2, case_insensitive()),
                    item(3, ast::FlagsItemKind::Negation),
                    item(4, swap_greed()),
                ])),
                lit('a', 6)),
        ),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse(), Ok(expected));
    }

    let rejected = vec![
        ("(", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?P", 2..3, ast::ErrorKind::FlagUnrecognized),
        ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(a", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(()", 0..1, ast::ErrorKind::GroupUnclosed),
        (")", 0..1, ast::ErrorKind::GroupUnopened),
        ("a)", 1..2, ast::ErrorKind::GroupUnopened),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3482
/// Exercises `(?P<name>...)` groups: valid names, truncated names, invalid
/// name characters and duplicate names.
#[test]
fn parse_capture_name() {
    // A named group with capture index 1; `name_range` is the span of the
    // name itself, between the angle brackets.
    fn named(
        whole: ::std::ops::Range<usize>,
        name_range: ::std::ops::Range<usize>,
        name: &str,
        inner: Ast,
    ) -> Ast {
        Ast::Group(ast::Group {
            span: span(whole),
            kind: ast::GroupKind::CaptureName(ast::CaptureName {
                span: span(name_range),
                name: s(name),
                index: 1,
            }),
            ast: Box::new(inner),
        })
    }

    let accepted = vec![
        ("(?P<a>z)", named(0..8, 4..5, "a", lit('z', 6))),
        ("(?P<abc>z)", named(0..10, 4..7, "abc", lit('z', 8))),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse(), Ok(expected));
    }

    let rejected = vec![
        ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", 4..4, ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", 5..5, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", 6..6, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", 4..5, ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", 4..5, ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", 7..8, ast::ErrorKind::GroupNameInvalid),
        // The error points at the second use and records the first one.
        (
            "(?P<a>y)(?P<a>z)",
            12..13,
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3555
/// Exercises `parse_flags` directly on flag sequences terminated by `:` or
/// `)`, plus the errors for unterminated, unknown, duplicated and dangling
/// items.
#[test]
fn parse_flags() {
    // Builds the expected flag set for an input starting at offset 0 in
    // which every item is exactly one byte wide: item `i` spans `i..i+1`
    // and the whole set spans `0..kinds.len()`.
    fn flags_of(kinds: Vec<ast::FlagsItemKind>) -> ast::Flags {
        let end = kinds.len();
        let mut items = vec![];
        for (at, kind) in kinds.into_iter().enumerate() {
            items.push(ast::FlagsItem { span: span(at..at + 1), kind: kind });
        }
        ast::Flags { span: span(0..end), items: items }
    }

    fn on(flag: ast::Flag) -> ast::FlagsItemKind {
        ast::FlagsItemKind::Flag(flag)
    }

    let accepted = vec![
        ("i:", flags_of(vec![on(ast::Flag::CaseInsensitive)])),
        ("i)", flags_of(vec![on(ast::Flag::CaseInsensitive)])),
        (
            "isU:",
            flags_of(vec![
                on(ast::Flag::CaseInsensitive),
                on(ast::Flag::DotMatchesNewLine),
                on(ast::Flag::SwapGreed),
            ]),
        ),
        (
            "-isU:",
            flags_of(vec![
                ast::FlagsItemKind::Negation,
                on(ast::Flag::CaseInsensitive),
                on(ast::Flag::DotMatchesNewLine),
                on(ast::Flag::SwapGreed),
            ]),
        ),
        (
            "i-sU:",
            flags_of(vec![
                on(ast::Flag::CaseInsensitive),
                ast::FlagsItemKind::Negation,
                on(ast::Flag::DotMatchesNewLine),
                on(ast::Flag::SwapGreed),
            ]),
        ),
    ];
    for (input, expected) in accepted {
        assert_eq!(parser(input).parse_flags(), Ok(expected));
    }

    let rejected = vec![
        ("isU", 3..3, ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", 3..4, ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            3..4,
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            4..5,
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", 0..1, ast::ErrorKind::FlagDanglingNegation),
        ("i-)", 1..2, ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", 2..3, ast::ErrorKind::FlagDanglingNegation),
    ];
    for (input, range, kind) in rejected {
        assert_eq!(
            parser(input).parse_flags().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3684
/// Exercises `parse_flag` on every recognised flag letter and on two
/// unrecognised characters, one of them multi-byte.
#[test]
fn parse_flag() {
    let recognised = vec![
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (letter, flag) in recognised {
        assert_eq!(parser(letter).parse_flag(), Ok(flag));
    }

    // The snowman's expected span is built with `span_range` over 0..3,
    // whereas the ASCII case uses plain `span`.
    let unrecognised = vec![
        ("a", span(0..1)),
        ("☃", span_range("☃", 0..3)),
    ];
    for (input, at) in unrecognised {
        assert_eq!(
            parser(input).parse_flag().unwrap_err(),
            TestError {
                span: at,
                kind: ast::ErrorKind::FlagUnrecognized,
            });
    }
}
3707
/// Exercises `parse_primitive` on inputs that do not start with a
/// backslash: the dot, the line anchors and verbatim literals.
#[test]
fn parse_primitive_non_escape() {
    fn anchor(kind: ast::AssertionKind) -> Primitive {
        Primitive::Assertion(ast::Assertion {
            span: span(0..1),
            kind: kind,
        })
    }

    fn verbatim(at: Span, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c: c,
        })
    }

    let cases = vec![
        (r".", Primitive::Dot(span(0..1))),
        (r"^", anchor(ast::AssertionKind::StartLine)),
        (r"$", anchor(ast::AssertionKind::EndLine)),
        (r"a", verbatim(span(0..1), 'a')),
        // At the primitive level a bare `|` is expected to be a literal.
        (r"|", verbatim(span(0..1), '|')),
        (r"☃", verbatim(span_range("☃", 0..3), '☃')),
    ];
    for (pattern, expected) in cases {
        assert_eq!(parser(pattern).parse_primitive(), Ok(expected));
    }
}
3748
/// Exercises two-character escapes: escaped punctuation, the special
/// literal escapes, escape-style assertions, and the errors for a lone
/// backslash and for an unknown escape.
#[test]
fn parse_escape() {
    // Every successful case here is a two-byte escape at the start of input.
    fn literal(kind: ast::LiteralKind, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: span(0..2),
            kind: kind,
            c: c,
        })
    }

    fn assertion(kind: ast::AssertionKind) -> Primitive {
        Primitive::Assertion(ast::Assertion {
            span: span(0..2),
            kind: kind,
        })
    }

    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(literal(ast::LiteralKind::Punctuation, '|')));

    let specials = vec![
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pattern, c, kind) in specials {
        assert_eq!(
            parser(pattern).parse_primitive(),
            Ok(literal(ast::LiteralKind::Special(kind), c)));
    }

    let assertions = vec![
        (r"\A", ast::AssertionKind::StartText),
        (r"\z", ast::AssertionKind::EndText),
        (r"\b", ast::AssertionKind::WordBoundary),
        (r"\B", ast::AssertionKind::NotWordBoundary),
    ];
    for (pattern, kind) in assertions {
        assert_eq!(parser(pattern).parse_primitive(), Ok(assertion(kind)));
    }

    let rejected = vec![
        (r"\", 0..1, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\y", 0..2, ast::ErrorKind::EscapeUnrecognized),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3813
/// With the default test parser, a backslash followed by a decimal digit
/// must be rejected as an unsupported backreference spanning both bytes.
#[test]
fn parse_unsupported_backreference() {
    for &pattern in &[r"\0", r"\9"] {
        let expected = TestError {
            span: span(0..2),
            kind: ast::ErrorKind::UnsupportedBackreference,
        };
        assert_eq!(parser(pattern).parse_escape().unwrap_err(), expected);
    }
}
3829
/// Exercises octal escapes using the parser returned by `parser_octal`.
///
/// Every value expressible with one to three octal digits is checked, then
/// the cases where an escape is followed by a character that is not part
/// of it, and finally a non-octal digit directly after the backslash.
#[test]
fn parse_octal() {
    // Builds the expected octal literal occupying `range`.
    fn octal(range: ::std::ops::Range<usize>, c: char) -> ast::Literal {
        ast::Literal {
            span: span(range),
            kind: ast::LiteralKind::Octal,
            c: c,
        }
    }

    // The largest three-digit octal escape is `\777` (511). The range is
    // half-open, so the bound must be 512 for `\777` itself to be covered;
    // the previous bound of 511 stopped at `\776`.
    for i in 0..512 {
        let pat = format!(r"\{:o}", i);
        assert_eq!(
            parser_octal(&pat).parse_escape(),
            Ok(Primitive::Literal(octal(
                0..pat.len(),
                ::std::char::from_u32(i).unwrap()))));
    }

    // `8` is not an octal digit, so the escape stops after `\77`.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(octal(0..3, '?'))));
    // Only three digits are consumed; the fourth `7` is left over.
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(octal(0..4, '\u{01FF}'))));

    // In a full parse, the left-over character becomes a verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::Literal(octal(0..3, '?')),
                Ast::Literal(ast::Literal {
                    span: span(3..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '8',
                }),
            ],
        })));
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::Literal(octal(0..4, '\u{01FF}')),
                Ast::Literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: '7',
                }),
            ],
        })));

    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        });
}
3898
/// Exercises the fixed two-digit `\xNN` escape for every byte value, plus
/// the errors for truncated input and non-hex digits.
#[test]
fn parse_hex_two() {
    for code in 0..256 {
        let pattern = format!(r"\x{:02x}", code);
        let expected = Primitive::Literal(ast::Literal {
            span: span(0..pattern.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: ::std::char::from_u32(code).unwrap(),
        });
        assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
    }

    let rejected = vec![
        (r"\xF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\xG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\xFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3931
/// Exercises the fixed four-digit `\uNNNN` escape for every value below
/// 0x10000 that `char::from_u32` accepts, plus the errors for truncated
/// input, non-hex digits and a value that is not a valid `char`.
#[test]
fn parse_hex_four() {
    for code in 0..65536 {
        // Values that are not valid `char`s have no expected literal.
        if let Some(c) = ::std::char::from_u32(code) {
            let pattern = format!(r"\u{:04x}", code);
            let expected = Primitive::Literal(ast::Literal {
                span: span(0..pattern.len()),
                kind: ast::LiteralKind::HexFixed(
                    ast::HexLiteralKind::UnicodeShort),
                c: c,
            });
            assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
        }
    }

    let rejected = vec![
        (r"\uF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\uG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uD800", 2..6, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
3987
/// Exercises the fixed eight-digit `\UNNNNNNNN` escape for every value
/// below 0x10000 that `char::from_u32` accepts, plus the errors for
/// truncated input and for a non-hex digit at each digit position.
#[test]
fn parse_hex_eight() {
    for code in 0..65536 {
        // Values that are not valid `char`s have no expected literal.
        if let Some(c) = ::std::char::from_u32(code) {
            let pattern = format!(r"\U{:08x}", code);
            let expected = Primitive::Literal(ast::Literal {
                span: span(0..pattern.len()),
                kind: ast::LiteralKind::HexFixed(
                    ast::HexLiteralKind::UnicodeLong),
                c: c,
            });
            assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
        }
    }

    assert_eq!(
        parser(r"\UF").parse_escape().unwrap_err(),
        TestError {
            span: span(3..3),
            kind: ast::ErrorKind::EscapeUnexpectedEof,
        });

    // (pattern, offset of the offending `G`): one case per digit position.
    let bad_digits = [
        (r"\UG", 2),
        (r"\UFG", 3),
        (r"\UFFG", 4),
        (r"\UFFFG", 5),
        (r"\UFFFFG", 6),
        (r"\UFFFFFG", 7),
        (r"\UFFFFFFG", 8),
        (r"\UFFFFFFFG", 9),
    ];
    for &(pattern, at) in &bad_digits {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(at..at + 1),
                kind: ast::ErrorKind::EscapeHexInvalidDigit,
            });
    }
}
4061
/// Exercises brace-delimited hex escapes (`\x{...}`, `\u{...}`, `\U{...}`):
/// upper- and lower-case digits, the largest code point, and the errors
/// for truncated, empty, malformed and out-of-range values.
#[test]
fn parse_hex_brace() {
    // Expected literal for a braced escape that starts at offset 0.
    fn braced(len: usize, kind: ast::HexLiteralKind, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: span(0..len),
            kind: ast::LiteralKind::HexBrace(kind),
            c: c,
        })
    }

    let accepted = vec![
        (r"\u{26c4}", braced(8, ast::HexLiteralKind::UnicodeShort, '⛄')),
        (r"\U{26c4}", braced(8, ast::HexLiteralKind::UnicodeLong, '⛄')),
        (r"\x{26c4}", braced(8, ast::HexLiteralKind::X, '⛄')),
        (r"\x{26C4}", braced(8, ast::HexLiteralKind::X, '⛄')),
        (r"\x{10fFfF}", braced(10, ast::HexLiteralKind::X, '\u{10FFFF}')),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse_escape(), Ok(expected));
    }

    let rejected = vec![
        (r"\x", 2..2, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{", 2..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{FF", 2..5, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{}", 2..4, ast::ErrorKind::EscapeHexEmpty),
        (r"\x{FGF}", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\x{FFFFFF}", 3..9, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{D800}", 3..7, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{FFFFFFFFF}", 3..12, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
4151
/// Exercises `parse_decimal`: plain numbers, a leading zero, and the
/// errors for a leading minus sign, empty input and a ten-digit value
/// expected to be rejected as invalid.
#[test]
fn parse_decimal() {
    let accepted = [("123", 123), ("0", 0), ("01", 1)];
    for &(text, value) in &accepted {
        assert_eq!(parser(text).parse_decimal(), Ok(value));
    }

    let rejected = vec![
        // A leading `-` is expected to yield the same error as empty input.
        ("-1", 0..0, ast::ErrorKind::DecimalEmpty),
        ("", 0..0, ast::ErrorKind::DecimalEmpty),
        ("9999999999", 0..10, ast::ErrorKind::DecimalInvalid),
    ];
    for (text, range, kind) in rejected {
        assert_eq!(
            parser(text).parse_decimal().unwrap_err(),
            TestError { span: span(range), kind: kind });
    }
}
4177
// Exercises bracketed character class parsing through `parse`: ASCII
// classes, nesting, literals and escapes, ranges, the binary set operators
// (`&&`, `--`, `~~`), multi-byte literals, the special handling of a leading
// `]`, and the errors reported for malformed classes.
#[test]
fn parse_set_class() {
    // A literal AST node of the given kind.
    fn literal(span: Span, kind: ast::LiteralKind, c: char) -> ast::Literal {
        ast::Literal {
            span: span,
            kind: kind,
            c: c,
        }
    }

    // A class item for a character written verbatim.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(
            literal(span, ast::LiteralKind::Verbatim, c))
    }

    // A class item for a backslash-escaped punctuation character.
    fn punct(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(
            literal(span, ast::LiteralKind::Punctuation, c))
    }

    // The empty class item.
    fn empty(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Empty(span)
    }

    // A non-negated ASCII class item such as `[:alnum:]`.
    fn ascii(span: Span, kind: ast::ClassAsciiKind) -> ast::ClassSetItem {
        ast::ClassSetItem::Ascii(ast::ClassAscii {
            span: span,
            kind: kind,
            negated: false,
        })
    }

    fn alnum(span: Span) -> ast::ClassSetItem {
        ascii(span, ast::ClassAsciiKind::Alnum)
    }

    fn lower(span: Span) -> ast::ClassSetItem {
        ascii(span, ast::ClassAsciiKind::Lower)
    }

    // A non-negated `\w` class item.
    fn word(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Perl(ast::ClassPerl {
            span: span,
            kind: ast::ClassPerlKind::Word,
            negated: false,
        })
    }

    // A range item `start-end` covering `span`. The spans of both endpoint
    // literals are derived from `span` and the UTF-8 width of each endpoint,
    // each endpoint occupying exactly one column.
    fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
        let after_start = Position {
            offset: span.start.offset + start.len_utf8(),
            column: span.start.column + 1,
            ..span.start
        };
        let before_end = Position {
            offset: span.end.offset - end.len_utf8(),
            column: span.end.column - 1,
            ..span.end
        };
        ast::ClassSetItem::Range(ast::ClassSetRange {
            span: span,
            start: literal(
                Span { end: after_start, ..span },
                ast::LiteralKind::Verbatim,
                start),
            end: literal(
                Span { start: before_end, ..span },
                ast::LiteralKind::Verbatim,
                end),
        })
    }

    // Wraps a single item as a class set.
    fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
        ast::ClassSet::Item(item)
    }

    // A union of several items.
    fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
        ast::ClassSet::union(ast::ClassSetUnion {
            span: span,
            items: items,
        })
    }

    // A binary set operation of the given kind.
    fn binary(
        kind: ast::ClassSetBinaryOpKind,
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
            span: span,
            kind: kind,
            lhs: Box::new(lhs),
            rhs: Box::new(rhs),
        })
    }

    fn intersection(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binary(ast::ClassSetBinaryOpKind::Intersection, span, lhs, rhs)
    }

    fn difference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binary(ast::ClassSetBinaryOpKind::Difference, span, lhs, rhs)
    }

    fn symdifference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binary(ast::ClassSetBinaryOpKind::SymmetricDifference, span, lhs, rhs)
    }

    // A non-negated bracketed class.
    fn class(span: Span, kind: ast::ClassSet) -> ast::ClassBracketed {
        ast::ClassBracketed {
            span: span,
            negated: false,
            kind: kind,
        }
    }

    // A bracketed class nested inside another class.
    fn nested(cls: ast::ClassBracketed) -> ast::ClassSetItem {
        ast::ClassSetItem::Bracketed(Box::new(cls))
    }

    // A non-negated bracketed class as a top-level AST.
    fn top(span: Span, kind: ast::ClassSet) -> Ast {
        Ast::Class(ast::Class::Bracketed(class(span, kind)))
    }

    // ASCII classes, nesting and the three binary operators.
    assert_eq!(
        parser("[[:alnum:]]").parse(),
        Ok(top(span(0..11), itemset(alnum(span(1..10))))));
    assert_eq!(
        parser("[[[:alnum:]]]").parse(),
        Ok(top(
            span(0..13),
            itemset(nested(class(
                span(1..12),
                itemset(alnum(span(2..11)))))))));
    assert_eq!(
        parser("[[:alnum:]&&[:lower:]]").parse(),
        Ok(top(
            span(0..22),
            intersection(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21)))))));
    assert_eq!(
        parser("[[:alnum:]--[:lower:]]").parse(),
        Ok(top(
            span(0..22),
            difference(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21)))))));
    assert_eq!(
        parser("[[:alnum:]~~[:lower:]]").parse(),
        Ok(top(
            span(0..22),
            symdifference(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21)))))));

    // Literals, escapes and dashes that do not form a range.
    assert_eq!(
        parser("[a]").parse(),
        Ok(top(span(0..3), itemset(lit(span(1..2), 'a')))));
    assert_eq!(
        parser(r"[a\]]").parse(),
        Ok(top(
            span(0..5),
            union(span(1..4), vec![
                lit(span(1..2), 'a'),
                punct(span(2..4), ']'),
            ]))));
    assert_eq!(
        parser(r"[a\-z]").parse(),
        Ok(top(
            span(0..6),
            union(span(1..5), vec![
                lit(span(1..2), 'a'),
                punct(span(2..4), '-'),
                lit(span(4..5), 'z'),
            ]))));
    assert_eq!(
        parser("[ab]").parse(),
        Ok(top(
            span(0..4),
            union(span(1..3), vec![
                lit(span(1..2), 'a'),
                lit(span(2..3), 'b'),
            ]))));
    assert_eq!(
        parser("[a-]").parse(),
        Ok(top(
            span(0..4),
            union(span(1..3), vec![
                lit(span(1..2), 'a'),
                lit(span(2..3), '-'),
            ]))));
    assert_eq!(
        parser("[-a]").parse(),
        Ok(top(
            span(0..4),
            union(span(1..3), vec![
                lit(span(1..2), '-'),
                lit(span(2..3), 'a'),
            ]))));

    // Unicode and Perl classes as class items.
    assert_eq!(
        parser(r"[\pL]").parse(),
        Ok(top(
            span(0..5),
            itemset(ast::ClassSetItem::Unicode(ast::ClassUnicode {
                span: span(1..4),
                negated: false,
                kind: ast::ClassUnicodeKind::OneLetter('L'),
            })))));
    assert_eq!(
        parser(r"[\w]").parse(),
        Ok(top(span(0..4), itemset(word(span(1..3))))));
    assert_eq!(
        parser(r"[a\wz]").parse(),
        Ok(top(
            span(0..6),
            union(span(1..5), vec![
                lit(span(1..2), 'a'),
                word(span(2..4)),
                lit(span(4..5), 'z'),
            ]))));

    // Ranges, alone and combined with set operators.
    assert_eq!(
        parser("[a-z]").parse(),
        Ok(top(span(0..5), itemset(range(span(1..4), 'a', 'z')))));
    assert_eq!(
        parser("[a-cx-z]").parse(),
        Ok(top(
            span(0..8),
            union(span(1..7), vec![
                range(span(1..4), 'a', 'c'),
                range(span(4..7), 'x', 'z'),
            ]))));
    assert_eq!(
        parser(r"[\w&&a-cx-z]").parse(),
        Ok(top(
            span(0..12),
            intersection(
                span(1..11),
                itemset(word(span(1..3))),
                union(span(5..11), vec![
                    range(span(5..8), 'a', 'c'),
                    range(span(8..11), 'x', 'z'),
                ])))));
    assert_eq!(
        parser(r"[a-cx-z&&\w]").parse(),
        Ok(top(
            span(0..12),
            intersection(
                span(1..11),
                union(span(1..7), vec![
                    range(span(1..4), 'a', 'c'),
                    range(span(4..7), 'x', 'z'),
                ]),
                itemset(word(span(9..11)))))));

    // Repeated operators nest to the left.
    assert_eq!(
        parser(r"[a--b--c]").parse(),
        Ok(top(
            span(0..9),
            difference(
                span(1..8),
                difference(
                    span(1..5),
                    itemset(lit(span(1..2), 'a')),
                    itemset(lit(span(4..5), 'b'))),
                itemset(lit(span(7..8), 'c'))))));
    assert_eq!(
        parser(r"[a~~b~~c]").parse(),
        Ok(top(
            span(0..9),
            symdifference(
                span(1..8),
                symdifference(
                    span(1..5),
                    itemset(lit(span(1..2), 'a')),
                    itemset(lit(span(4..5), 'b'))),
                itemset(lit(span(7..8), 'c'))))));

    // Operator characters used as operands.
    assert_eq!(
        parser(r"[\^&&^]").parse(),
        Ok(top(
            span(0..7),
            intersection(
                span(1..6),
                itemset(punct(span(1..3), '^')),
                itemset(lit(span(5..6), '^'))))));
    assert_eq!(
        parser(r"[\&&&&]").parse(),
        Ok(top(
            span(0..7),
            intersection(
                span(1..6),
                itemset(punct(span(1..3), '&')),
                itemset(lit(span(5..6), '&'))))));
    assert_eq!(
        parser(r"[&&&&]").parse(),
        Ok(top(
            span(0..6),
            intersection(
                span(1..5),
                intersection(
                    span(1..3),
                    itemset(empty(span(1..1))),
                    itemset(empty(span(3..3)))),
                itemset(empty(span(5..5)))))));

    // Multi-byte endpoints: spans are computed from the pattern itself and
    // spelled out explicitly rather than derived by the `range` builder.
    let pat = "[☃-⛄]";
    assert_eq!(
        parser(pat).parse(),
        Ok(top(
            span_range(pat, 0..9),
            itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
                span: span_range(pat, 1..8),
                start: literal(
                    span_range(pat, 1..4),
                    ast::LiteralKind::Verbatim,
                    '☃'),
                end: literal(
                    span_range(pat, 5..8),
                    ast::LiteralKind::Verbatim,
                    '⛄'),
            })))));

    // A `]` directly after the opening bracket is a literal.
    assert_eq!(
        parser(r"[]]").parse(),
        Ok(top(span(0..3), itemset(lit(span(1..2), ']')))));
    assert_eq!(
        parser(r"[]\[]").parse(),
        Ok(top(
            span(0..5),
            union(span(1..4), vec![
                lit(span(1..2), ']'),
                punct(span(2..4), '['),
            ]))));
    // Here the first `]` closes the class and the second is a plain literal
    // following it.
    assert_eq!(
        parser(r"[\[]]").parse(),
        Ok(concat(0..5, vec![
            top(span(0..4), itemset(punct(span(1..3), '['))),
            Ast::Literal(literal(
                span(4..5),
                ast::LiteralKind::Verbatim,
                ']')),
        ])));

    // Malformed classes, paired with the expected error span and kind.
    let failures = vec![
        ("[", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[", 1..2, ast::ErrorKind::ClassUnclosed),
        ("[[-]", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[[:alnum:]", 1..2, ast::ErrorKind::ClassUnclosed),
        (r"[\b]", 1..3, ast::ErrorKind::ClassEscapeInvalid),
        (r"[\w-a]", 1..3, ast::ErrorKind::ClassEscapeInvalid),
        (r"[a-\w]", 3..5, ast::ErrorKind::ClassEscapeInvalid),
        (r"[z-a]", 1..4, ast::ErrorKind::ClassRangeInvalid),
    ];
    for (pattern, range, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(range),
                kind: kind,
            });
    }

    // Unclosed classes with trailing whitespace in whitespace-insensitive
    // mode still point at the opening bracket.
    for &pattern in &["[a ", "[a- "] {
        assert_eq!(
            parser_ignore_whitespace(pattern).parse().unwrap_err(),
            TestError {
                span: span(0..1),
                kind: ast::ErrorKind::ClassUnclosed,
            });
    }
}
4708
// Checks `parse_set_class_open`, which consumes the opening of a bracketed
// class: the `[`, an optional `^`, and any leading `-` or `]` characters
// that are to be treated as literals. On success it yields the (still
// empty) bracketed class together with the union that has been started.
//
// Two whitespace-mode patterns are written with the exact number of spaces
// that the asserted spans require: `"[   a]"` must put `a` at offset 4, and
// the unclosed `"[    "` must be five bytes long to produce span 0..5.
#[test]
fn parse_set_class_open() {
    // A verbatim literal class item.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal {
            span: span,
            kind: ast::LiteralKind::Verbatim,
            c: c,
        })
    }

    // The expected successful result.
    //
    // `set_span`/`negated` describe the opened class, whose own set is
    // always an empty union located at `inner`. `union_span`/`items`
    // describe the union returned alongside it, which already holds any
    // leading literal `-` or `]`.
    fn opened(
        set_span: Span,
        negated: bool,
        inner: Span,
        union_span: Span,
        items: Vec<ast::ClassSetItem>,
    ) -> (ast::ClassBracketed, ast::ClassSetUnion) {
        let set = ast::ClassBracketed {
            span: set_span,
            negated: negated,
            kind: ast::ClassSet::union(ast::ClassSetUnion {
                span: inner,
                items: vec![],
            }),
        };
        let union = ast::ClassSetUnion {
            span: union_span,
            items: items,
        };
        (set, union)
    }

    // Plain and negated openings with nothing special following.
    assert_eq!(
        parser("[a]").parse_set_class_open(),
        Ok(opened(span(0..1), false, span(1..1), span(1..1), vec![])));
    assert_eq!(
        parser_ignore_whitespace("[   a]").parse_set_class_open(),
        Ok(opened(span(0..4), false, span(4..4), span(4..4), vec![])));
    assert_eq!(
        parser("[^a]").parse_set_class_open(),
        Ok(opened(span(0..2), true, span(2..2), span(2..2), vec![])));
    assert_eq!(
        parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
        Ok(opened(span(0..4), true, span(4..4), span(4..4), vec![])));

    // Leading dashes are literals.
    assert_eq!(
        parser("[-a]").parse_set_class_open(),
        Ok(opened(
            span(0..2), false, span(1..1), span(1..2),
            vec![lit(span(1..2), '-')])));
    assert_eq!(
        parser_ignore_whitespace("[ - a]").parse_set_class_open(),
        Ok(opened(
            span(0..4), false, span(2..2), span(2..3),
            vec![lit(span(2..3), '-')])));
    assert_eq!(
        parser("[^-a]").parse_set_class_open(),
        Ok(opened(
            span(0..3), true, span(2..2), span(2..3),
            vec![lit(span(2..3), '-')])));
    assert_eq!(
        parser("[--a]").parse_set_class_open(),
        Ok(opened(
            span(0..3), false, span(1..1), span(1..3),
            vec![lit(span(1..2), '-'), lit(span(2..3), '-')])));

    // A leading `]` is a literal.
    assert_eq!(
        parser("[]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2), false, span(1..1), span(1..2),
            vec![lit(span(1..2), ']')])));
    assert_eq!(
        parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
        Ok(opened(
            span(0..4), false, span(2..2), span(2..3),
            vec![lit(span(2..3), ']')])));
    assert_eq!(
        parser("[^]a]").parse_set_class_open(),
        Ok(opened(
            span(0..3), true, span(2..2), span(2..3),
            vec![lit(span(2..3), ']')])));
    // After a leading dash, `]` is no longer consumed as part of the
    // opening.
    assert_eq!(
        parser("[-]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2), false, span(1..1), span(1..2),
            vec![lit(span(1..2), '-')])));

    // Patterns that end while the class is still being opened. The error
    // span covers everything consumed so far.
    for &(pattern, end) in &[
        ("[", 1),
        ("[^", 2),
        ("[]", 2),
        ("[-", 2),
        ("[--", 3),
    ] {
        assert_eq!(
            parser(pattern).parse_set_class_open().unwrap_err(),
            TestError {
                span: span(0..end),
                kind: ast::ErrorKind::ClassUnclosed,
            });
    }
    // In whitespace-insensitive mode the trailing spaces are consumed too,
    // so the span runs to the end of the five-byte pattern.
    assert_eq!(
        parser_ignore_whitespace("[    ")
            .parse_set_class_open()
            .unwrap_err(),
        TestError {
            span: span(0..5),
            kind: ast::ErrorKind::ClassUnclosed,
        });
}
4996
// Checks `maybe_parse_ascii_class`: a well-formed `[:name:]` (optionally
// negated with `^`) yields the class, while anything else yields `None` and
// leaves the parser at its starting offset.
#[test]
fn maybe_parse_ascii_class() {
    // Accepted inputs with the end of the expected span and the expected
    // negation. Trailing input after the class is not consumed.
    let accepted = vec![
        (r"[:alnum:]", 9, false),
        (r"[:alnum:]A", 9, false),
        (r"[:^alnum:]", 10, true),
    ];
    for (pattern, end, negated) in accepted {
        assert_eq!(
            parser(pattern).maybe_parse_ascii_class(),
            Some(ast::ClassAscii {
                span: span(0..end),
                kind: ast::ClassAsciiKind::Alnum,
                negated: negated,
            }));
    }

    // Truncated, misspelled or otherwise malformed inputs are rejected
    // without advancing the parser.
    let rejected = [
        r"[:",
        r"[:^",
        r"[^:alnum:]",
        r"[:alnnum:]",
        r"[:alnum]",
        r"[:alnum:",
    ];
    for &pattern in &rejected {
        let p = parser(pattern);
        assert_eq!(p.maybe_parse_ascii_class(), None);
        assert_eq!(p.offset(), 0);
    }
}
5045
// Checks parsing of Unicode class escapes (`\pX`, `\PX`, `\p{..}`,
// `\P{..}`), including the `name:value`, `name=value` and `name!=value`
// forms, the errors for truncated escapes, and that a class followed by
// another character parses as a concatenation.
#[test]
fn parse_unicode_class() {
    // A Unicode class node.
    fn class(
        span: Span,
        negated: bool,
        kind: ast::ClassUnicodeKind,
    ) -> ast::ClassUnicode {
        ast::ClassUnicode {
            span: span,
            negated: negated,
            kind: kind,
        }
    }

    // The `name <op> value` kind.
    fn named_value(
        op: ast::ClassUnicodeOpKind,
        name: &str,
        value: &str,
    ) -> ast::ClassUnicodeKind {
        ast::ClassUnicodeKind::NamedValue {
            op: op,
            name: s(name),
            value: s(value),
        }
    }

    // Each escape with the end of its span, its negation and its kind.
    let escapes = vec![
        (r"\pN", 3, false, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\PN", 3, true, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\p{N}", 5, false, ast::ClassUnicodeKind::Named(s("N"))),
        (r"\P{N}", 5, true, ast::ClassUnicodeKind::Named(s("N"))),
        (r"\p{Greek}", 9, false, ast::ClassUnicodeKind::Named(s("Greek"))),
        (
            r"\p{scx:Katakana}", 16, false,
            named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
        ),
        (
            r"\p{scx=Katakana}", 16, false,
            named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
        ),
        (
            r"\p{scx!=Katakana}", 17, false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "scx", "Katakana"),
        ),
        // An operator with empty name and value still parses.
        (
            r"\p{:}", 5, false,
            named_value(ast::ClassUnicodeOpKind::Colon, "", ""),
        ),
        (
            r"\p{=}", 5, false,
            named_value(ast::ClassUnicodeOpKind::Equal, "", ""),
        ),
        (
            r"\p{!=}", 6, false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "", ""),
        ),
    ];
    for (pattern, end, negated, kind) in escapes {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Unicode(class(span(0..end), negated, kind))));
    }

    // Truncated escapes report an empty span at the end of the input.
    for &(pattern, at) in &[
        (r"\p", 2),
        (r"\p{", 3),
        (r"\p{N", 4),
        (r"\p{Greek", 8),
    ] {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::EscapeUnexpectedEof,
            });
    }

    // A class followed by a literal is a two-element concatenation.
    assert_eq!(
        parser(r"\pNz").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::Class(ast::Class::Unicode(class(
                    span(0..3),
                    false,
                    ast::ClassUnicodeKind::OneLetter('N')))),
                Ast::Literal(ast::Literal {
                    span: span(3..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'z',
                }),
            ],
        })));
    assert_eq!(
        parser(r"\p{Greek}z").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..10),
            asts: vec![
                Ast::Class(ast::Class::Unicode(class(
                    span(0..9),
                    false,
                    ast::ClassUnicodeKind::Named(s("Greek"))))),
                Ast::Literal(ast::Literal {
                    span: span(9..10),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'z',
                }),
            ],
        })));
}
5212
// Checks parsing of the Perl class escapes `\d`, `\s`, `\w` and their
// negated upper-case forms, both as a lone escape and as part of a full
// pattern.
#[test]
fn parse_perl_class() {
    // A Perl class node.
    fn perl(
        span: Span,
        kind: ast::ClassPerlKind,
        negated: bool,
    ) -> ast::ClassPerl {
        ast::ClassPerl {
            span: span,
            kind: kind,
            negated: negated,
        }
    }

    // Every escape is two bytes long; the upper-case letter negates.
    let escapes = vec![
        (r"\d", ast::ClassPerlKind::Digit, false),
        (r"\D", ast::ClassPerlKind::Digit, true),
        (r"\s", ast::ClassPerlKind::Space, false),
        (r"\S", ast::ClassPerlKind::Space, true),
        (r"\w", ast::ClassPerlKind::Word, false),
        (r"\W", ast::ClassPerlKind::Word, true),
    ];
    for (pattern, kind, negated) in escapes {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Perl(perl(span(0..2), kind, negated))));
    }

    // As a whole pattern the escape becomes a class AST.
    assert_eq!(
        parser(r"\d").parse(),
        Ok(Ast::Class(ast::Class::Perl(perl(
            span(0..2),
            ast::ClassPerlKind::Digit,
            false)))));
    // Followed by a literal it becomes a two-element concatenation.
    assert_eq!(
        parser(r"\dz").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..3),
            asts: vec![
                Ast::Class(ast::Class::Perl(perl(
                    span(0..2),
                    ast::ClassPerlKind::Digit,
                    false))),
                Ast::Literal(ast::Literal {
                    span: span(2..3),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'z',
                }),
            ],
        })));
}
5283
// Regression test for a bug in the nest limit checker: it failed to
// decrement its depth during post-traversal, so long (but shallow) regexes
// tripped the default limit far too eagerly. The pattern below must parse
// with a nest limit of 50.
#[test]
fn regression_454_nest_too_big() {
    let long_but_shallow = r#"
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}
"#;
    let outcome = parser_nest_limit(long_but_shallow, 50).parse();
    assert!(outcome.is_ok());
}
5326
// Regression test: a trailing `-` in a character class is a literal `-`
// even when whitespace mode (`(?x)`) is enabled and whitespace, newlines or
// a comment separate the dash from its neighbours. Classes that are never
// closed must still be rejected.
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    // Closed classes ending in a dash: all must parse.
    let closed = [
        "(?x)[ / - ]",
        "(?x)[ a - ]",
        "(?x)[\na\n- ]\n",
        "(?x)[\na # wat\n- ]\n",
    ];
    for &pattern in &closed {
        assert!(parser(pattern).parse().is_ok());
    }

    // The same shapes without the closing bracket: all must fail.
    let unclosed = [
        "(?x)[ / -",
        "(?x)[ / - ",
        "(?x)[\n/ -\n",
        "(?x)[\n/ - # wat\n",
    ];
    for &pattern in &unclosed {
        assert!(parser(pattern).parse().is_err());
    }
}
5352}