]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | // Copyright 2018 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | /*! | |
12 | This module provides a regular expression parser. | |
13 | */ | |
14 | ||
15 | use std::borrow::Borrow; | |
16 | use std::cell::{Cell, RefCell}; | |
17 | use std::mem; | |
18 | use std::result; | |
19 | ||
20 | use ast::{self, Ast, Position, Span}; | |
21 | use either::Either; | |
22 | ||
23 | use is_meta_character; | |
24 | ||
25 | type Result<T> = result::Result<T, ast::Error>; | |
26 | ||
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal. Converted to `Ast::Literal` by `into_ast`.
    Literal(ast::Literal),
    /// An assertion. Converted to `Ast::Assertion` by `into_ast`; rejected
    /// when converted into a class item or class literal.
    Assertion(ast::Assertion),
    /// A dot, represented only by its span. Rejected when converted into a
    /// class item or class literal.
    Dot(Span),
    /// A Perl character class. Converted to `ast::Class::Perl` by `into_ast`.
    Perl(ast::ClassPerl),
    /// A Unicode character class. Converted to `ast::Class::Unicode` by
    /// `into_ast`.
    Unicode(ast::ClassUnicode),
}
41 | ||
42 | impl Primitive { | |
43 | /// Return the span of this primitive. | |
44 | fn span(&self) -> &Span { | |
45 | match *self { | |
46 | Primitive::Literal(ref x) => &x.span, | |
47 | Primitive::Assertion(ref x) => &x.span, | |
48 | Primitive::Dot(ref span) => span, | |
49 | Primitive::Perl(ref x) => &x.span, | |
50 | Primitive::Unicode(ref x) => &x.span, | |
51 | } | |
52 | } | |
53 | ||
54 | /// Convert this primitive into a proper AST. | |
55 | fn into_ast(self) -> Ast { | |
56 | match self { | |
57 | Primitive::Literal(lit) => Ast::Literal(lit), | |
58 | Primitive::Assertion(assert) => Ast::Assertion(assert), | |
59 | Primitive::Dot(span) => Ast::Dot(span), | |
60 | Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), | |
61 | Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), | |
62 | } | |
63 | } | |
64 | ||
65 | /// Convert this primitive into an item in a character class. | |
66 | /// | |
67 | /// If this primitive is not a legal item (i.e., an assertion or a dot), | |
68 | /// then return an error. | |
69 | fn into_class_set_item<P: Borrow<Parser>>( | |
70 | self, | |
71 | p: &ParserI<P>, | |
72 | ) -> Result<ast::ClassSetItem> { | |
73 | use ast::ClassSetItem; | |
74 | use self::Primitive::*; | |
75 | ||
76 | match self { | |
77 | Literal(lit) => Ok(ClassSetItem::Literal(lit)), | |
78 | Perl(cls) => Ok(ClassSetItem::Perl(cls)), | |
79 | Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), | |
80 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), | |
81 | } | |
82 | } | |
83 | ||
84 | /// Convert this primitive into a literal in a character class. In | |
85 | /// particular, literals are the only valid items that can appear in | |
86 | /// ranges. | |
87 | /// | |
88 | /// If this primitive is not a legal item (i.e., a class, assertion or a | |
89 | /// dot), then return an error. | |
90 | fn into_class_literal<P: Borrow<Parser>>( | |
91 | self, | |
92 | p: &ParserI<P>, | |
93 | ) -> Result<ast::Literal> { | |
94 | use self::Primitive::*; | |
95 | ||
96 | match self { | |
97 | Literal(lit) => Ok(lit), | |
98 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), | |
99 | } | |
100 | } | |
101 | } | |
102 | ||
/// Reports whether `c` is an ASCII hexadecimal digit, i.e. one of `0-9`,
/// `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    // Radix 16 accepts exactly the ASCII digits and the letters a-f in
    // either case.
    c.is_digit(16)
}
107 | ||
/// Reports whether `c` may appear in a capture group name.
///
/// Underscores and ASCII letters are always accepted. ASCII digits are
/// accepted everywhere except in the first position, which is signalled by
/// passing `first` as true.
fn is_capture_char(c: char, first: bool) -> bool {
    if c == '_' {
        return true;
    }
    let is_digit = '0' <= c && c <= '9';
    let is_letter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    is_letter || (is_digit && !first)
}
116 | ||
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// The verbose-mode setting that built parsers start with. `new` sets
    /// this to `false`.
    ignore_whitespace: bool,
    /// The nest limit copied into built parsers. `new` sets this to `250`.
    nest_limit: u32,
    /// Whether built parsers support octal syntax. `new` sets this to
    /// `false`.
    octal: bool,
}
126 | ||
127 | impl Default for ParserBuilder { | |
128 | fn default() -> ParserBuilder { | |
129 | ParserBuilder::new() | |
130 | } | |
131 | } | |
132 | ||
133 | impl ParserBuilder { | |
134 | /// Create a new parser builder with a default configuration. | |
135 | pub fn new() -> ParserBuilder { | |
136 | ParserBuilder { | |
137 | ignore_whitespace: false, | |
138 | nest_limit: 250, | |
139 | octal: false, | |
140 | } | |
141 | } | |
142 | ||
143 | /// Build a parser from this configuration with the given pattern. | |
144 | pub fn build(&self) -> Parser { | |
145 | Parser { | |
146 | pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), | |
147 | capture_index: Cell::new(0), | |
148 | nest_limit: self.nest_limit, | |
149 | octal: self.octal, | |
150 | initial_ignore_whitespace: self.ignore_whitespace, | |
151 | ignore_whitespace: Cell::new(self.ignore_whitespace), | |
152 | comments: RefCell::new(vec![]), | |
153 | stack_group: RefCell::new(vec![]), | |
154 | stack_class: RefCell::new(vec![]), | |
155 | capture_names: RefCell::new(vec![]), | |
156 | scratch: RefCell::new(String::new()), | |
157 | } | |
158 | } | |
159 | ||
160 | /// Set the nesting limit for this parser. | |
161 | /// | |
162 | /// The nesting limit controls how deep the abstract syntax tree is allowed | |
163 | /// to be. If the AST exceeds the given limit (e.g., with too many nested | |
164 | /// groups), then an error is returned by the parser. | |
165 | /// | |
166 | /// The purpose of this limit is to act as a heuristic to prevent stack | |
167 | /// overflow for consumers that do structural induction on an `Ast` using | |
168 | /// explicit recursion. While this crate never does this (instead using | |
169 | /// constant stack space and moving the call stack to the heap), other | |
170 | /// crates may. | |
171 | /// | |
172 | /// This limit is not checked until the entire Ast is parsed. Therefore, | |
173 | /// if callers want to put a limit on the amount of heap space used, then | |
174 | /// they should impose a limit on the length, in bytes, of the concrete | |
175 | /// pattern string. In particular, this is viable since this parser | |
176 | /// implementation will limit itself to heap space proportional to the | |
177 | /// lenth of the pattern string. | |
178 | /// | |
179 | /// Note that a nest limit of `0` will return a nest limit error for most | |
180 | /// patterns but not all. For example, a nest limit of `0` permits `a` but | |
181 | /// not `ab`, since `ab` requires a concatenation, which results in a nest | |
182 | /// depth of `1`. In general, a nest limit is not something that manifests | |
183 | /// in an obvious way in the concrete syntax, therefore, it should not be | |
184 | /// used in a granular way. | |
185 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { | |
186 | self.nest_limit = limit; | |
187 | self | |
188 | } | |
189 | ||
190 | /// Whether to support octal syntax or not. | |
191 | /// | |
192 | /// Octal syntax is a little-known way of uttering Unicode codepoints in | |
193 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and | |
194 | /// `\141` are all equivalent regular expressions, where the last example | |
195 | /// shows octal syntax. | |
196 | /// | |
197 | /// While supporting octal syntax isn't in and of itself a problem, it does | |
198 | /// make good error messages harder. That is, in PCRE based regex engines, | |
199 | /// syntax like `\0` invokes a backreference, which is explicitly | |
200 | /// unsupported in Rust's regex engine. However, many users expect it to | |
201 | /// be supported. Therefore, when octal support is disabled, the error | |
202 | /// message will explicitly mention that backreferences aren't supported. | |
203 | /// | |
204 | /// Octal syntax is disabled by default. | |
205 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { | |
206 | self.octal = yes; | |
207 | self | |
208 | } | |
209 | ||
210 | /// Enable verbose mode in the regular expression. | |
211 | /// | |
212 | /// When enabled, verbose mode permits insigificant whitespace in many | |
213 | /// places in the regular expression, as well as comments. Comments are | |
214 | /// started using `#` and continue until the end of the line. | |
215 | /// | |
216 | /// By default, this is disabled. It may be selectively enabled in the | |
217 | /// regular expression by using the `x` flag regardless of this setting. | |
218 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { | |
219 | self.ignore_whitespace = yes; | |
220 | self | |
221 | } | |
222 | } | |
223 | ||
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. This is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sorted sequence of capture names. This is used to detect duplicate
    /// capture names and report an error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
265 | ||
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but a `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
282 | ||
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// The whitespace-insensitivity setting that was in effect just
        /// before this group was opened. It is restored when the group is
        /// popped.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
303 | ||
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the
    /// stored set becomes the left-hand side of the operator.
    Op {
        /// The type of the operation, i.e., &&, -- or ~~.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
330 | ||
331 | impl Parser { | |
332 | /// Create a new parser with a default configuration. | |
333 | /// | |
334 | /// The parser can be run with either the `parse` or `parse_with_comments` | |
335 | /// methods. The parse methods return an abstract syntax tree. | |
336 | /// | |
337 | /// To set configuration options on the parser, use | |
338 | /// [`ParserBuilder`](struct.ParserBuilder.html). | |
339 | pub fn new() -> Parser { | |
340 | ParserBuilder::new().build() | |
341 | } | |
342 | ||
343 | /// Parse the regular expression into an abstract syntax tree. | |
344 | pub fn parse(&mut self, pattern: &str) -> Result<Ast> { | |
345 | ParserI::new(self, pattern).parse() | |
346 | } | |
347 | ||
348 | /// Parse the regular expression and return an abstract syntax tree with | |
349 | /// all of the comments found in the pattern. | |
350 | pub fn parse_with_comments( | |
351 | &mut self, | |
352 | pattern: &str, | |
353 | ) -> Result<ast::WithComments> { | |
354 | ParserI::new(self, pattern).parse_with_comments() | |
355 | } | |
356 | ||
357 | /// Reset the internal state of a parser. | |
358 | /// | |
359 | /// This is called at the beginning of every parse. This prevents the | |
360 | /// parser from running with inconsistent state (say, if a previous | |
361 | /// invocation returned an error and the parser is reused). | |
362 | fn reset(&self) { | |
363 | // These settings should be in line with the construction | |
364 | // in `ParserBuilder::build`. | |
365 | self.pos.set(Position { offset: 0, line: 1, column: 1}); | |
366 | self.ignore_whitespace.set(self.initial_ignore_whitespace); | |
367 | self.comments.borrow_mut().clear(); | |
368 | self.stack_group.borrow_mut().clear(); | |
369 | self.stack_class.borrow_mut().clear(); | |
370 | } | |
371 | } | |
372 | ||
373 | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { | |
374 | /// Build an internal parser from a parser configuration and a pattern. | |
375 | fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { | |
376 | ParserI { parser: parser, pattern: pattern } | |
377 | } | |
378 | ||
379 | /// Return a reference to the parser state. | |
380 | fn parser(&self) -> &Parser { | |
381 | self.parser.borrow() | |
382 | } | |
383 | ||
384 | /// Return a reference to the pattern being parsed. | |
385 | fn pattern(&self) -> &str { | |
386 | self.pattern.borrow() | |
387 | } | |
388 | ||
389 | /// Create a new error with the given span and error type. | |
390 | fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { | |
391 | ast::Error { | |
392 | kind: kind, | |
393 | pattern: self.pattern().to_string(), | |
394 | span: span, | |
395 | } | |
396 | } | |
397 | ||
398 | /// Return the current offset of the parser. | |
399 | /// | |
400 | /// The offset starts at `0` from the beginning of the regular expression | |
401 | /// pattern string. | |
402 | fn offset(&self) -> usize { | |
403 | self.parser().pos.get().offset | |
404 | } | |
405 | ||
406 | /// Return the current line number of the parser. | |
407 | /// | |
408 | /// The line number starts at `1`. | |
409 | fn line(&self) -> usize { | |
410 | self.parser().pos.get().line | |
411 | } | |
412 | ||
413 | /// Return the current column of the parser. | |
414 | /// | |
415 | /// The column number starts at `1` and is reset whenever a `\n` is seen. | |
416 | fn column(&self) -> usize { | |
417 | self.parser().pos.get().column | |
418 | } | |
419 | ||
420 | /// Return the next capturing index. Each subsequent call increments the | |
421 | /// internal index. | |
422 | /// | |
423 | /// The span given should correspond to the location of the opening | |
424 | /// parenthesis. | |
425 | /// | |
426 | /// If the capture limit is exceeded, then an error is returned. | |
427 | fn next_capture_index(&self, span: Span) -> Result<u32> { | |
428 | let current = self.parser().capture_index.get(); | |
429 | let i = try!(current.checked_add(1).ok_or_else(|| { | |
430 | self.error(span, ast::ErrorKind::CaptureLimitExceeded) | |
431 | })); | |
432 | self.parser().capture_index.set(i); | |
433 | Ok(i) | |
434 | } | |
435 | ||
436 | /// Adds the given capture name to this parser. If this capture name has | |
437 | /// already been used, then an error is returned. | |
438 | fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { | |
439 | let mut names = self.parser().capture_names.borrow_mut(); | |
440 | match names.binary_search_by_key( | |
441 | &cap.name.as_str(), | |
442 | |c| c.name.as_str(), | |
443 | ) { | |
444 | Err(i) => { | |
445 | names.insert(i, cap.clone()); | |
446 | Ok(()) | |
447 | } | |
448 | Ok(i) => { | |
449 | Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate { | |
450 | original: names[i].span, | |
451 | })) | |
452 | } | |
453 | } | |
454 | } | |
455 | ||
456 | /// Return whether the parser should ignore whitespace or not. | |
457 | fn ignore_whitespace(&self) -> bool { | |
458 | self.parser().ignore_whitespace.get() | |
459 | } | |
460 | ||
461 | /// Return the character at the current position of the parser. | |
462 | /// | |
463 | /// This panics if the current position does not point to a valid char. | |
464 | fn char(&self) -> char { | |
465 | self.char_at(self.offset()) | |
466 | } | |
467 | ||
468 | /// Return the character at the given position. | |
469 | /// | |
470 | /// This panics if the given position does not point to a valid char. | |
471 | fn char_at(&self, i: usize) -> char { | |
472 | self.pattern()[i..].chars().next() | |
473 | .unwrap_or_else(|| { | |
474 | panic!("expected char at offset {}", i) | |
475 | }) | |
476 | } | |
477 | ||
478 | /// Bump the parser to the next Unicode scalar value. | |
479 | /// | |
480 | /// If the end of the input has been reached, then `false` is returned. | |
481 | fn bump(&self) -> bool { | |
482 | if self.is_eof() { | |
483 | return false; | |
484 | } | |
485 | let Position { mut offset, mut line, mut column } = self.pos(); | |
486 | if self.char() == '\n' { | |
487 | line = line.checked_add(1).unwrap(); | |
488 | column = 1; | |
489 | } else { | |
490 | column = column.checked_add(1).unwrap(); | |
491 | } | |
492 | offset += self.char().len_utf8(); | |
493 | self.parser().pos.set(Position { | |
494 | offset: offset, | |
495 | line: line, | |
496 | column: column, | |
497 | }); | |
498 | self.pattern()[self.offset()..].chars().next().is_some() | |
499 | } | |
500 | ||
501 | /// If the substring starting at the current position of the parser has | |
502 | /// the given prefix, then bump the parser to the character immediately | |
503 | /// following the prefix and return true. Otherwise, don't bump the parser | |
504 | /// and return false. | |
505 | fn bump_if(&self, prefix: &str) -> bool { | |
506 | if self.pattern()[self.offset()..].starts_with(prefix) { | |
507 | for _ in 0..prefix.chars().count() { | |
508 | self.bump(); | |
509 | } | |
510 | true | |
511 | } else { | |
512 | false | |
513 | } | |
514 | } | |
515 | ||
516 | /// Returns true if and only if the parser is positioned at a look-around | |
517 | /// prefix. The conditions under which this returns true must always | |
518 | /// correspond to a regular expression that would otherwise be consider | |
519 | /// invalid. | |
520 | /// | |
521 | /// This should only be called immediately after parsing the opening of | |
522 | /// a group or a set of flags. | |
523 | fn is_lookaround_prefix(&self) -> bool { | |
524 | self.bump_if("?=") | |
525 | || self.bump_if("?!") | |
526 | || self.bump_if("?<=") | |
527 | || self.bump_if("?<!") | |
528 | } | |
529 | ||
530 | /// Bump the parser, and if the `x` flag is enabled, bump through any | |
531 | /// subsequent spaces. Return true if and only if the parser is not at | |
532 | /// EOF. | |
533 | fn bump_and_bump_space(&self) -> bool { | |
534 | if !self.bump() { | |
535 | return false; | |
536 | } | |
537 | self.bump_space(); | |
538 | !self.is_eof() | |
539 | } | |
540 | ||
541 | /// If the `x` flag is enabled (i.e., whitespace insensitivity with | |
542 | /// comments), then this will advance the parser through all whitespace | |
543 | /// and comments to the next non-whitespace non-comment byte. | |
544 | /// | |
545 | /// If the `x` flag is disabled, then this is a no-op. | |
546 | /// | |
547 | /// This should be used selectively throughout the parser where | |
548 | /// arbitrary whitespace is permitted when the `x` flag is enabled. For | |
549 | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. | |
550 | fn bump_space(&self) { | |
551 | if !self.ignore_whitespace() { | |
552 | return; | |
553 | } | |
554 | while !self.is_eof() { | |
555 | if self.char().is_whitespace() { | |
556 | self.bump(); | |
557 | } else if self.char() == '#' { | |
558 | let start = self.pos(); | |
559 | let mut comment_text = String::new(); | |
560 | self.bump(); | |
561 | while !self.is_eof() { | |
562 | let c = self.char(); | |
563 | self.bump(); | |
564 | if c == '\n' { | |
565 | break; | |
566 | } | |
567 | comment_text.push(c); | |
568 | } | |
569 | let comment = ast::Comment { | |
570 | span: Span::new(start, self.pos()), | |
571 | comment: comment_text, | |
572 | }; | |
573 | self.parser().comments.borrow_mut().push(comment); | |
574 | } else { | |
575 | break; | |
576 | } | |
577 | } | |
578 | } | |
579 | ||
580 | /// Peek at the next character in the input without advancing the parser. | |
581 | /// | |
582 | /// If the input has been exhausted, then this returns `None`. | |
583 | fn peek(&self) -> Option<char> { | |
584 | if self.is_eof() { | |
585 | return None; | |
586 | } | |
587 | self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() | |
588 | } | |
589 | ||
590 | /// Like peek, but will ignore spaces when the parser is in whitespace | |
591 | /// insensitive mode. | |
592 | fn peek_space(&self) -> Option<char> { | |
593 | if !self.ignore_whitespace() { | |
594 | return self.peek(); | |
595 | } | |
596 | if self.is_eof() { | |
597 | return None; | |
598 | } | |
599 | let mut start = self.offset() + self.char().len_utf8(); | |
600 | let mut in_comment = false; | |
601 | for (i, c) in self.pattern()[start..].char_indices() { | |
602 | if c.is_whitespace() { | |
603 | continue; | |
604 | } else if !in_comment && c == '#' { | |
605 | in_comment = true; | |
606 | } else if in_comment && c == '\n' { | |
607 | in_comment = false; | |
608 | } else { | |
609 | start += i; | |
610 | break; | |
611 | } | |
612 | } | |
613 | self.pattern()[start..].chars().next() | |
614 | } | |
615 | ||
616 | /// Returns true if the next call to `bump` would return false. | |
617 | fn is_eof(&self) -> bool { | |
618 | self.offset() == self.pattern().len() | |
619 | } | |
620 | ||
621 | /// Return the current position of the parser, which includes the offset, | |
622 | /// line and column. | |
623 | fn pos(&self) -> Position { | |
624 | self.parser().pos.get() | |
625 | } | |
626 | ||
627 | /// Create a span at the current position of the parser. Both the start | |
628 | /// and end of the span are set. | |
629 | fn span(&self) -> Span { | |
630 | Span::splat(self.pos()) | |
631 | } | |
632 | ||
633 | /// Create a span that covers the current character. | |
634 | fn span_char(&self) -> Span { | |
635 | let mut next = Position { | |
636 | offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), | |
637 | line: self.line(), | |
638 | column: self.column().checked_add(1).unwrap(), | |
639 | }; | |
640 | if self.char() == '\n' { | |
641 | next.line += 1; | |
642 | next.column = 1; | |
643 | } | |
644 | Span::new(self.pos(), next) | |
645 | } | |
646 | ||
647 | /// Parse and push a single alternation on to the parser's internal stack. | |
648 | /// If the top of the stack already has an alternation, then add to that | |
649 | /// instead of pushing a new one. | |
650 | /// | |
651 | /// The concatenation given corresponds to a single alternation branch. | |
652 | /// The concatenation returned starts the next branch and is empty. | |
653 | /// | |
654 | /// This assumes the parser is currently positioned at `|` and will advance | |
655 | /// the parser to the character following `|`. | |
656 | fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { | |
657 | assert_eq!(self.char(), '|'); | |
658 | concat.span.end = self.pos(); | |
659 | self.push_or_add_alternation(concat); | |
660 | self.bump(); | |
661 | Ok(ast::Concat { | |
662 | span: self.span(), | |
663 | asts: vec![], | |
664 | }) | |
665 | } | |
666 | ||
667 | /// Pushes or adds the given branch of an alternation to the parser's | |
668 | /// internal stack of state. | |
669 | fn push_or_add_alternation(&self, concat: ast::Concat) { | |
670 | use self::GroupState::*; | |
671 | ||
672 | let mut stack = self.parser().stack_group.borrow_mut(); | |
673 | if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { | |
674 | alts.asts.push(concat.into_ast()); | |
675 | return; | |
676 | } | |
677 | stack.push(Alternation(ast::Alternation { | |
678 | span: Span::new(concat.span.start, self.pos()), | |
679 | asts: vec![concat.into_ast()], | |
680 | })); | |
681 | } | |
682 | ||
683 | /// Parse and push a group AST (and its parent concatenation) on to the | |
684 | /// parser's internal stack. Return a fresh concatenation corresponding | |
685 | /// to the group's sub-AST. | |
686 | /// | |
687 | /// If a set of flags was found (with no group), then the concatenation | |
688 | /// is returned with that set of flags added. | |
689 | /// | |
690 | /// This assumes that the parser is currently positioned on the opening | |
691 | /// parenthesis. It advances the parser to the character at the start | |
692 | /// of the sub-expression (or adjoining expression). | |
693 | /// | |
694 | /// If there was a problem parsing the start of the group, then an error | |
695 | /// is returned. | |
696 | fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { | |
697 | assert_eq!(self.char(), '('); | |
698 | match try!(self.parse_group()) { | |
699 | Either::Left(set) => { | |
700 | let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); | |
701 | if let Some(v) = ignore { | |
702 | self.parser().ignore_whitespace.set(v); | |
703 | } | |
704 | ||
705 | concat.asts.push(Ast::Flags(set)); | |
706 | Ok(concat) | |
707 | } | |
708 | Either::Right(group) => { | |
709 | let old_ignore_whitespace = self.ignore_whitespace(); | |
710 | let new_ignore_whitespace = group | |
711 | .flags() | |
712 | .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) | |
713 | .unwrap_or(old_ignore_whitespace); | |
714 | self.parser().stack_group.borrow_mut().push(GroupState::Group { | |
715 | concat: concat, | |
716 | group: group, | |
717 | ignore_whitespace: old_ignore_whitespace, | |
718 | }); | |
719 | self.parser().ignore_whitespace.set(new_ignore_whitespace); | |
720 | Ok(ast::Concat { | |
721 | span: self.span(), | |
722 | asts: vec![], | |
723 | }) | |
724 | } | |
725 | } | |
726 | } | |
727 | ||
728 | /// Pop a group AST from the parser's internal stack and set the group's | |
729 | /// AST to the given concatenation. Return the concatenation containing | |
730 | /// the group. | |
731 | /// | |
732 | /// This assumes that the parser is currently positioned on the closing | |
733 | /// parenthesis and advances the parser to the character following the `)`. | |
734 | /// | |
735 | /// If no such group could be popped, then an unopened group error is | |
736 | /// returned. | |
737 | fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> { | |
738 | use self::GroupState::*; | |
739 | ||
740 | assert_eq!(self.char(), ')'); | |
741 | let mut stack = self.parser().stack_group.borrow_mut(); | |
742 | let (mut prior_concat, mut group, ignore_whitespace, alt) = | |
743 | match stack.pop() { | |
744 | Some(Group { concat, group, ignore_whitespace }) => { | |
745 | (concat, group, ignore_whitespace, None) | |
746 | } | |
747 | Some(Alternation(alt)) => { | |
748 | match stack.pop() { | |
749 | Some(Group { concat, group, ignore_whitespace }) => { | |
750 | (concat, group, ignore_whitespace, Some(alt)) | |
751 | } | |
752 | None | Some(Alternation(_)) => { | |
753 | return Err(self.error( | |
754 | self.span_char(), | |
755 | ast::ErrorKind::GroupUnopened, | |
756 | )); | |
757 | } | |
758 | } | |
759 | } | |
760 | None => { | |
761 | return Err(self.error( | |
762 | self.span_char(), | |
763 | ast::ErrorKind::GroupUnopened, | |
764 | )); | |
765 | } | |
766 | }; | |
767 | self.parser().ignore_whitespace.set(ignore_whitespace); | |
768 | group_concat.span.end = self.pos(); | |
769 | self.bump(); | |
770 | group.span.end = self.pos(); | |
771 | match alt { | |
772 | Some(mut alt) => { | |
773 | alt.span.end = group_concat.span.end; | |
774 | alt.asts.push(group_concat.into_ast()); | |
775 | group.ast = Box::new(alt.into_ast()); | |
776 | } | |
777 | None => { | |
778 | group.ast = Box::new(group_concat.into_ast()); | |
779 | } | |
780 | } | |
781 | prior_concat.asts.push(Ast::Group(group)); | |
782 | Ok(prior_concat) | |
783 | } | |
784 | ||
785 | /// Pop the last state from the parser's internal stack, if it exists, and | |
786 | /// add the given concatenation to it. There either must be no state or a | |
787 | /// single alternation item on the stack. Any other scenario produces an | |
788 | /// error. | |
789 | /// | |
790 | /// This assumes that the parser has advanced to the end. | |
791 | fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { | |
792 | concat.span.end = self.pos(); | |
793 | let mut stack = self.parser().stack_group.borrow_mut(); | |
794 | let ast = match stack.pop() { | |
795 | None => Ok(concat.into_ast()), | |
796 | Some(GroupState::Alternation(mut alt)) => { | |
797 | alt.span.end = self.pos(); | |
798 | alt.asts.push(concat.into_ast()); | |
799 | Ok(Ast::Alternation(alt)) | |
800 | } | |
801 | Some(GroupState::Group { group, .. }) => { | |
802 | return Err(self.error( | |
803 | group.span, | |
804 | ast::ErrorKind::GroupUnclosed, | |
805 | )); | |
806 | } | |
807 | }; | |
808 | // If we try to pop again, there should be nothing. | |
809 | match stack.pop() { | |
810 | None => ast, | |
811 | Some(GroupState::Alternation(_)) => { | |
812 | // This unreachable is unfortunate. This case can't happen | |
813 | // because the only way we can be here is if there were two | |
814 | // `GroupState::Alternation`s adjacent in the parser's stack, | |
815 | // which we guarantee to never happen because we never push a | |
816 | // `GroupState::Alternation` if one is already at the top of | |
817 | // the stack. | |
818 | unreachable!() | |
819 | } | |
820 | Some(GroupState::Group { group, .. }) => { | |
821 | Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) | |
822 | } | |
823 | } | |
824 | } | |
825 | ||
826 | /// Parse the opening of a character class and push the current class | |
827 | /// parsing context onto the parser's stack. This assumes that the parser | |
828 | /// is positioned at an opening `[`. The given union should correspond to | |
829 | /// the union of set items built up before seeing the `[`. | |
830 | /// | |
831 | /// If there was a problem parsing the opening of the class, then an error | |
832 | /// is returned. Otherwise, a new union of set items for the class is | |
833 | /// returned (which may be populated with either a `]` or a `-`). | |
834 | fn push_class_open( | |
835 | &self, | |
836 | parent_union: ast::ClassSetUnion, | |
837 | ) -> Result<ast::ClassSetUnion> { | |
838 | assert_eq!(self.char(), '['); | |
839 | ||
840 | let (nested_set, nested_union) = try!(self.parse_set_class_open()); | |
841 | self.parser().stack_class.borrow_mut().push(ClassState::Open { | |
842 | union: parent_union, | |
843 | set: nested_set, | |
844 | }); | |
845 | Ok(nested_union) | |
846 | } | |
847 | ||
848 | /// Parse the end of a character class set and pop the character class | |
849 | /// parser stack. The union given corresponds to the last union built | |
850 | /// before seeing the closing `]`. The union returned corresponds to the | |
851 | /// parent character class set with the nested class added to it. | |
852 | /// | |
853 | /// This assumes that the parser is positioned at a `]` and will advance | |
854 | /// the parser to the byte immediately following the `]`. | |
855 | /// | |
856 | /// If the stack is empty after popping, then this returns the final | |
857 | /// "top-level" character class AST (where a "top-level" character class | |
858 | /// is one that is not nested inside any other character class). | |
859 | /// | |
860 | /// If there is no corresponding opening bracket on the parser's stack, | |
861 | /// then an error is returned. | |
862 | fn pop_class( | |
863 | &self, | |
864 | nested_union: ast::ClassSetUnion, | |
865 | ) -> Result<Either<ast::ClassSetUnion, ast::Class>> { | |
866 | assert_eq!(self.char(), ']'); | |
867 | ||
868 | let item = ast::ClassSet::Item(nested_union.into_item()); | |
869 | let prevset = self.pop_class_op(item); | |
870 | let mut stack = self.parser().stack_class.borrow_mut(); | |
871 | match stack.pop() { | |
872 | None => { | |
873 | // We can never observe an empty stack: | |
874 | // | |
875 | // 1) We are guaranteed to start with a non-empty stack since | |
876 | // the character class parser is only initiated when it sees | |
877 | // a `[`. | |
878 | // 2) If we ever observe an empty stack while popping after | |
879 | // seeing a `]`, then we signal the character class parser | |
880 | // to terminate. | |
881 | panic!("unexpected empty character class stack") | |
882 | }, | |
883 | Some(ClassState::Op { .. }) => { | |
884 | // This panic is unfortunate, but this case is impossible | |
885 | // since we already popped the Op state if one exists above. | |
886 | // Namely, every push to the class parser stack is guarded by | |
887 | // whether an existing Op is already on the top of the stack. | |
888 | // If it is, the existing Op is modified. That is, the stack | |
889 | // can never have consecutive Op states. | |
890 | panic!("unexpected ClassState::Op") | |
891 | } | |
892 | Some(ClassState::Open { mut union, mut set }) => { | |
893 | self.bump(); | |
894 | set.span.end = self.pos(); | |
895 | set.kind = prevset; | |
896 | if stack.is_empty() { | |
897 | Ok(Either::Right(ast::Class::Bracketed(set))) | |
898 | } else { | |
899 | union.push(ast::ClassSetItem::Bracketed(Box::new(set))); | |
900 | Ok(Either::Left(union)) | |
901 | } | |
902 | } | |
903 | } | |
904 | } | |
905 | ||
906 | /// Return an "unclosed class" error whose span points to the most | |
907 | /// recently opened class. | |
908 | /// | |
909 | /// This should only be called while parsing a character class. | |
910 | fn unclosed_class_error(&self) -> ast::Error { | |
911 | for state in self.parser().stack_class.borrow().iter().rev() { | |
912 | match *state { | |
913 | ClassState::Open { ref set, .. } => { | |
914 | return self.error(set.span, ast::ErrorKind::ClassUnclosed); | |
915 | } | |
916 | _ => {} | |
917 | } | |
918 | } | |
919 | // We are guaranteed to have a non-empty stack with at least | |
920 | // one open bracket, so we should never get here. | |
921 | panic!("no open character class found") | |
922 | } | |
923 | ||
924 | /// Push the current set of class items on to the class parser's stack as | |
925 | /// the left hand side of the given operator. | |
926 | /// | |
927 | /// A fresh set union is returned, which should be used to build the right | |
928 | /// hand side of this operator. | |
929 | fn push_class_op( | |
930 | &self, | |
931 | next_kind: ast::ClassSetBinaryOpKind, | |
932 | next_union: ast::ClassSetUnion, | |
933 | ) -> ast::ClassSetUnion { | |
934 | ||
935 | let item = ast::ClassSet::Item(next_union.into_item()); | |
936 | let new_lhs = self.pop_class_op(item); | |
937 | self.parser().stack_class.borrow_mut().push(ClassState::Op { | |
938 | kind: next_kind, | |
939 | lhs: new_lhs, | |
940 | }); | |
941 | ast::ClassSetUnion { span: self.span(), items: vec![] } | |
942 | } | |
943 | ||
944 | /// Pop a character class set from the character class parser stack. If the | |
945 | /// top of the stack is just an item (not an operation), then return the | |
946 | /// given set unchanged. If the top of the stack is an operation, then the | |
947 | /// given set will be used as the rhs of the operation on the top of the | |
948 | /// stack. In that case, the binary operation is returned as a set. | |
949 | fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { | |
950 | let mut stack = self.parser().stack_class.borrow_mut(); | |
951 | let (kind, lhs) = match stack.pop() { | |
952 | Some(ClassState::Op { kind, lhs }) => (kind, lhs), | |
953 | Some(state @ ClassState::Open { .. }) => { | |
954 | stack.push(state); | |
955 | return rhs; | |
956 | } | |
957 | None => unreachable!(), | |
958 | }; | |
959 | let span = Span::new(lhs.span().start, rhs.span().end); | |
960 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { | |
961 | span: span, | |
962 | kind: kind, | |
963 | lhs: Box::new(lhs), | |
964 | rhs: Box::new(rhs), | |
965 | }) | |
966 | } | |
967 | } | |
968 | ||
969 | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { | |
970 | /// Parse the regular expression into an abstract syntax tree. | |
971 | fn parse(&self) -> Result<Ast> { | |
972 | self.parse_with_comments().map(|astc| astc.ast) | |
973 | } | |
974 | ||
975 | /// Parse the regular expression and return an abstract syntax tree with | |
976 | /// all of the comments found in the pattern. | |
977 | fn parse_with_comments(&self) -> Result<ast::WithComments> { | |
978 | assert_eq!(self.offset(), 0, "parser can only be used once"); | |
979 | self.parser().reset(); | |
980 | let mut concat = ast::Concat { | |
981 | span: self.span(), | |
982 | asts: vec![], | |
983 | }; | |
984 | loop { | |
985 | self.bump_space(); | |
986 | if self.is_eof() { | |
987 | break; | |
988 | } | |
989 | match self.char() { | |
990 | '(' => concat = try!(self.push_group(concat)), | |
991 | ')' => concat = try!(self.pop_group(concat)), | |
992 | '|' => concat = try!(self.push_alternate(concat)), | |
993 | '[' => { | |
994 | let class = try!(self.parse_set_class()); | |
995 | concat.asts.push(Ast::Class(class)); | |
996 | } | |
997 | '?' => { | |
998 | concat = try!(self.parse_uncounted_repetition( | |
999 | concat, ast::RepetitionKind::ZeroOrOne)); | |
1000 | } | |
1001 | '*' => { | |
1002 | concat = try!(self.parse_uncounted_repetition( | |
1003 | concat, ast::RepetitionKind::ZeroOrMore)); | |
1004 | } | |
1005 | '+' => { | |
1006 | concat = try!(self.parse_uncounted_repetition( | |
1007 | concat, ast::RepetitionKind::OneOrMore)); | |
1008 | } | |
1009 | '{' => { | |
1010 | concat = try!(self.parse_counted_repetition(concat)); | |
1011 | } | |
1012 | _ => concat.asts.push(try!(self.parse_primitive()).into_ast()), | |
1013 | } | |
1014 | } | |
1015 | let ast = try!(self.pop_group_end(concat)); | |
1016 | try!(NestLimiter::new(self).check(&ast)); | |
1017 | Ok(ast::WithComments { | |
1018 | ast: ast, | |
1019 | comments: mem::replace( | |
1020 | &mut *self.parser().comments.borrow_mut(), | |
1021 | vec![], | |
1022 | ), | |
1023 | }) | |
1024 | } | |
1025 | ||
1026 | /// Parses an uncounted repetition operation. An uncounted repetition | |
1027 | /// operator includes ?, * and +, but does not include the {m,n} syntax. | |
1028 | /// The given `kind` should correspond to the operator observed by the | |
1029 | /// caller. | |
1030 | /// | |
1031 | /// This assumes that the paser is currently positioned at the repetition | |
1032 | /// operator and advances the parser to the first character after the | |
1033 | /// operator. (Note that the operator may include a single additional `?`, | |
1034 | /// which makes the operator ungreedy.) | |
1035 | /// | |
1036 | /// The caller should include the concatenation that is being built. The | |
1037 | /// concatenation returned includes the repetition operator applied to the | |
1038 | /// last expression in the given concatenation. | |
1039 | fn parse_uncounted_repetition( | |
1040 | &self, | |
1041 | mut concat: ast::Concat, | |
1042 | kind: ast::RepetitionKind, | |
1043 | ) -> Result<ast::Concat> { | |
1044 | assert!( | |
1045 | self.char() == '?' || self.char() == '*' || self.char() == '+'); | |
1046 | let op_start = self.pos(); | |
1047 | let ast = match concat.asts.pop() { | |
1048 | Some(ast) => ast, | |
1049 | None => return Err(self.error( | |
1050 | self.span(), | |
1051 | ast::ErrorKind::RepetitionMissing, | |
1052 | )), | |
1053 | }; | |
1054 | let mut greedy = true; | |
1055 | if self.bump() && self.char() == '?' { | |
1056 | greedy = false; | |
1057 | self.bump(); | |
1058 | } | |
1059 | concat.asts.push(Ast::Repetition(ast::Repetition { | |
1060 | span: ast.span().with_end(self.pos()), | |
1061 | op: ast::RepetitionOp { | |
1062 | span: Span::new(op_start, self.pos()), | |
1063 | kind: kind, | |
1064 | }, | |
1065 | greedy: greedy, | |
1066 | ast: Box::new(ast), | |
1067 | })); | |
1068 | Ok(concat) | |
1069 | } | |
1070 | ||
1071 | /// Parses a counted repetition operation. A counted repetition operator | |
1072 | /// corresponds to the {m,n} syntax, and does not include the ?, * or + | |
1073 | /// operators. | |
1074 | /// | |
1075 | /// This assumes that the paser is currently positioned at the opening `{` | |
1076 | /// and advances the parser to the first character after the operator. | |
1077 | /// (Note that the operator may include a single additional `?`, which | |
1078 | /// makes the operator ungreedy.) | |
1079 | /// | |
1080 | /// The caller should include the concatenation that is being built. The | |
1081 | /// concatenation returned includes the repetition operator applied to the | |
1082 | /// last expression in the given concatenation. | |
1083 | fn parse_counted_repetition( | |
1084 | &self, | |
1085 | mut concat: ast::Concat, | |
1086 | ) -> Result<ast::Concat> { | |
1087 | assert!(self.char() == '{'); | |
1088 | let start = self.pos(); | |
1089 | let ast = match concat.asts.pop() { | |
1090 | Some(ast) => ast, | |
1091 | None => return Err(self.error( | |
1092 | self.span(), | |
1093 | ast::ErrorKind::RepetitionMissing, | |
1094 | )), | |
1095 | }; | |
1096 | if !self.bump_and_bump_space() { | |
1097 | return Err(self.error( | |
1098 | Span::new(start, self.pos()), | |
1099 | ast::ErrorKind::RepetitionCountUnclosed, | |
1100 | )); | |
1101 | } | |
1102 | let count_start = try!(self.parse_decimal()); | |
1103 | let mut range = ast::RepetitionRange::Exactly(count_start); | |
1104 | if self.is_eof() { | |
1105 | return Err(self.error( | |
1106 | Span::new(start, self.pos()), | |
1107 | ast::ErrorKind::RepetitionCountUnclosed, | |
1108 | )); | |
1109 | } | |
1110 | if self.char() == ',' { | |
1111 | if !self.bump_and_bump_space() { | |
1112 | return Err(self.error( | |
1113 | Span::new(start, self.pos()), | |
1114 | ast::ErrorKind::RepetitionCountUnclosed, | |
1115 | )); | |
1116 | } | |
1117 | if self.char() != '}' { | |
1118 | let count_end = try!(self.parse_decimal()); | |
1119 | range = ast::RepetitionRange::Bounded(count_start, count_end); | |
1120 | } else { | |
1121 | range = ast::RepetitionRange::AtLeast(count_start); | |
1122 | } | |
1123 | } | |
1124 | if self.is_eof() || self.char() != '}' { | |
1125 | return Err(self.error( | |
1126 | Span::new(start, self.pos()), | |
1127 | ast::ErrorKind::RepetitionCountUnclosed, | |
1128 | )); | |
1129 | } | |
1130 | ||
1131 | let mut greedy = true; | |
1132 | if self.bump_and_bump_space() && self.char() == '?' { | |
1133 | greedy = false; | |
1134 | self.bump(); | |
1135 | } | |
1136 | ||
1137 | let op_span = Span::new(start, self.pos()); | |
1138 | if !range.is_valid() { | |
1139 | return Err(self.error( | |
1140 | op_span, | |
1141 | ast::ErrorKind::RepetitionCountInvalid, | |
1142 | )); | |
1143 | } | |
1144 | concat.asts.push(Ast::Repetition(ast::Repetition { | |
1145 | span: ast.span().with_end(self.pos()), | |
1146 | op: ast::RepetitionOp { | |
1147 | span: op_span, | |
1148 | kind: ast::RepetitionKind::Range(range), | |
1149 | }, | |
1150 | greedy: greedy, | |
1151 | ast: Box::new(ast), | |
1152 | })); | |
1153 | Ok(concat) | |
1154 | } | |
1155 | ||
1156 | /// Parse a group (which contains a sub-expression) or a set of flags. | |
1157 | /// | |
1158 | /// If a group was found, then it is returned with an empty AST. If a set | |
1159 | /// of flags is found, then that set is returned. | |
1160 | /// | |
1161 | /// The parser should be positioned at the opening parenthesis. | |
1162 | /// | |
1163 | /// This advances the parser to the character before the start of the | |
1164 | /// sub-expression (in the case of a group) or to the closing parenthesis | |
1165 | /// immediately following the set of flags. | |
1166 | /// | |
1167 | /// # Errors | |
1168 | /// | |
1169 | /// If flags are given and incorrectly specified, then a corresponding | |
1170 | /// error is returned. | |
1171 | /// | |
1172 | /// If a capture name is given and it is incorrectly specified, then a | |
1173 | /// corresponding error is returned. | |
1174 | fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { | |
1175 | assert_eq!(self.char(), '('); | |
1176 | let open_span = self.span_char(); | |
1177 | self.bump(); | |
1178 | self.bump_space(); | |
1179 | if self.is_lookaround_prefix() { | |
1180 | return Err(self.error( | |
1181 | Span::new(open_span.start, self.span().end), | |
1182 | ast::ErrorKind::UnsupportedLookAround, | |
1183 | )); | |
1184 | } | |
1185 | let inner_span = self.span(); | |
1186 | if self.bump_if("?P<") { | |
1187 | let capture_index = try!(self.next_capture_index(open_span)); | |
1188 | let cap = try!(self.parse_capture_name(capture_index)); | |
1189 | Ok(Either::Right(ast::Group { | |
1190 | span: open_span, | |
1191 | kind: ast::GroupKind::CaptureName(cap), | |
1192 | ast: Box::new(Ast::Empty(self.span())), | |
1193 | })) | |
1194 | } else if self.bump_if("?") { | |
1195 | if self.is_eof() { | |
1196 | return Err(self.error( | |
1197 | open_span, | |
1198 | ast::ErrorKind::GroupUnclosed, | |
1199 | )); | |
1200 | } | |
1201 | let flags = try!(self.parse_flags()); | |
1202 | let char_end = self.char(); | |
1203 | self.bump(); | |
1204 | if char_end == ')' { | |
1205 | // We don't allow empty flags, e.g., `(?)`. We instead | |
1206 | // interpret it as a repetition operator missing its argument. | |
1207 | if flags.items.is_empty() { | |
1208 | return Err(self.error( | |
1209 | inner_span, | |
1210 | ast::ErrorKind::RepetitionMissing, | |
1211 | )); | |
1212 | } | |
1213 | Ok(Either::Left(ast::SetFlags { | |
1214 | span: Span { end: self.pos(), ..open_span }, | |
1215 | flags: flags, | |
1216 | })) | |
1217 | } else { | |
1218 | assert_eq!(char_end, ':'); | |
1219 | Ok(Either::Right(ast::Group { | |
1220 | span: open_span, | |
1221 | kind: ast::GroupKind::NonCapturing(flags), | |
1222 | ast: Box::new(Ast::Empty(self.span())), | |
1223 | })) | |
1224 | } | |
1225 | } else { | |
1226 | let capture_index = try!(self.next_capture_index(open_span)); | |
1227 | Ok(Either::Right(ast::Group { | |
1228 | span: open_span, | |
1229 | kind: ast::GroupKind::CaptureIndex(capture_index), | |
1230 | ast: Box::new(Ast::Empty(self.span())), | |
1231 | })) | |
1232 | } | |
1233 | } | |
1234 | ||
1235 | /// Parses a capture group name. Assumes that the parser is positioned at | |
1236 | /// the first character in the name following the opening `<` (and may | |
1237 | /// possibly be EOF). This advances the parser to the first character | |
1238 | /// following the closing `>`. | |
1239 | /// | |
1240 | /// The caller must provide the capture index of the group for this name. | |
1241 | fn parse_capture_name( | |
1242 | &self, | |
1243 | capture_index: u32, | |
1244 | ) -> Result<ast::CaptureName> { | |
1245 | if self.is_eof() { | |
1246 | return Err(self.error( | |
1247 | self.span(), | |
1248 | ast::ErrorKind::GroupNameUnexpectedEof, | |
1249 | )); | |
1250 | } | |
1251 | let start = self.pos(); | |
1252 | loop { | |
1253 | if self.char() == '>' { | |
1254 | break; | |
1255 | } | |
1256 | if !is_capture_char(self.char(), self.pos() == start) { | |
1257 | return Err(self.error( | |
1258 | self.span_char(), | |
1259 | ast::ErrorKind::GroupNameInvalid, | |
1260 | )); | |
1261 | } | |
1262 | if !self.bump() { | |
1263 | break; | |
1264 | } | |
1265 | } | |
1266 | let end = self.pos(); | |
1267 | if self.is_eof() { | |
1268 | return Err(self.error( | |
1269 | self.span(), | |
1270 | ast::ErrorKind::GroupNameUnexpectedEof, | |
1271 | )); | |
1272 | } | |
1273 | assert_eq!(self.char(), '>'); | |
1274 | self.bump(); | |
1275 | let name = &self.pattern()[start.offset..end.offset]; | |
1276 | if name.is_empty() { | |
1277 | return Err(self.error( | |
1278 | Span::new(start, start), | |
1279 | ast::ErrorKind::GroupNameEmpty, | |
1280 | )); | |
1281 | } | |
1282 | let capname = ast::CaptureName { | |
1283 | span: Span::new(start, end), | |
1284 | name: name.to_string(), | |
1285 | index: capture_index, | |
1286 | }; | |
1287 | try!(self.add_capture_name(&capname)); | |
1288 | Ok(capname) | |
1289 | } | |
1290 | ||
1291 | /// Parse a sequence of flags starting at the current character. | |
1292 | /// | |
1293 | /// This advances the parser to the character immediately following the | |
1294 | /// flags, which is guaranteed to be either `:` or `)`. | |
1295 | /// | |
1296 | /// # Errors | |
1297 | /// | |
1298 | /// If any flags are duplicated, then an error is returned. | |
1299 | /// | |
1300 | /// If the negation operator is used more than once, then an error is | |
1301 | /// returned. | |
1302 | /// | |
1303 | /// If no flags could be found or if the negation operation is not followed | |
1304 | /// by any flags, then an error is returned. | |
1305 | fn parse_flags(&self) -> Result<ast::Flags> { | |
1306 | let mut flags = ast::Flags { | |
1307 | span: self.span(), | |
1308 | items: vec![], | |
1309 | }; | |
1310 | let mut last_was_negation = None; | |
1311 | while self.char() != ':' && self.char() != ')' { | |
1312 | if self.char() == '-' { | |
1313 | last_was_negation = Some(self.span_char()); | |
1314 | let item = ast::FlagsItem { | |
1315 | span: self.span_char(), | |
1316 | kind: ast::FlagsItemKind::Negation, | |
1317 | }; | |
1318 | if let Some(i) = flags.add_item(item) { | |
1319 | return Err(self.error( | |
1320 | self.span_char(), | |
1321 | ast::ErrorKind::FlagRepeatedNegation { | |
1322 | original: flags.items[i].span, | |
1323 | }, | |
1324 | )); | |
1325 | } | |
1326 | } else { | |
1327 | last_was_negation = None; | |
1328 | let item = ast::FlagsItem { | |
1329 | span: self.span_char(), | |
1330 | kind: ast::FlagsItemKind::Flag(try!(self.parse_flag())), | |
1331 | }; | |
1332 | if let Some(i) = flags.add_item(item) { | |
1333 | return Err(self.error( | |
1334 | self.span_char(), | |
1335 | ast::ErrorKind::FlagDuplicate { | |
1336 | original: flags.items[i].span, | |
1337 | }, | |
1338 | )); | |
1339 | } | |
1340 | } | |
1341 | if !self.bump() { | |
1342 | return Err(self.error( | |
1343 | self.span(), | |
1344 | ast::ErrorKind::FlagUnexpectedEof, | |
1345 | )); | |
1346 | } | |
1347 | } | |
1348 | if let Some(span) = last_was_negation { | |
1349 | return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); | |
1350 | } | |
1351 | flags.span.end = self.pos(); | |
1352 | Ok(flags) | |
1353 | } | |
1354 | ||
1355 | /// Parse the current character as a flag. Do not advance the parser. | |
1356 | /// | |
1357 | /// # Errors | |
1358 | /// | |
1359 | /// If the flag is not recognized, then an error is returned. | |
1360 | fn parse_flag(&self) -> Result<ast::Flag> { | |
1361 | match self.char() { | |
1362 | 'i' => Ok(ast::Flag::CaseInsensitive), | |
1363 | 'm' => Ok(ast::Flag::MultiLine), | |
1364 | 's' => Ok(ast::Flag::DotMatchesNewLine), | |
1365 | 'U' => Ok(ast::Flag::SwapGreed), | |
1366 | 'u' => Ok(ast::Flag::Unicode), | |
1367 | 'x' => Ok(ast::Flag::IgnoreWhitespace), | |
1368 | _ => Err(self.error( | |
1369 | self.span_char(), | |
1370 | ast::ErrorKind::FlagUnrecognized, | |
1371 | )), | |
1372 | } | |
1373 | } | |
1374 | ||
1375 | /// Parse a primitive AST. e.g., A literal, non-set character class or | |
1376 | /// assertion. | |
1377 | /// | |
1378 | /// This assumes that the parser expects a primitive at the current | |
1379 | /// location. i.e., All other non-primitive cases have been handled. | |
1380 | /// For example, if the parser's position is at `|`, then `|` will be | |
1381 | /// treated as a literal (e.g., inside a character class). | |
1382 | /// | |
1383 | /// This advances the parser to the first character immediately following | |
1384 | /// the primitive. | |
1385 | fn parse_primitive(&self) -> Result<Primitive> { | |
1386 | match self.char() { | |
1387 | '\\' => self.parse_escape(), | |
1388 | '.' => { | |
1389 | let ast = Primitive::Dot(self.span_char()); | |
1390 | self.bump(); | |
1391 | Ok(ast) | |
1392 | } | |
1393 | '^' => { | |
1394 | let ast = Primitive::Assertion(ast::Assertion { | |
1395 | span: self.span_char(), | |
1396 | kind: ast::AssertionKind::StartLine, | |
1397 | }); | |
1398 | self.bump(); | |
1399 | Ok(ast) | |
1400 | } | |
1401 | '$' => { | |
1402 | let ast = Primitive::Assertion(ast::Assertion { | |
1403 | span: self.span_char(), | |
1404 | kind: ast::AssertionKind::EndLine, | |
1405 | }); | |
1406 | self.bump(); | |
1407 | Ok(ast) | |
1408 | } | |
1409 | c => { | |
1410 | let ast = Primitive::Literal(ast::Literal { | |
1411 | span: self.span_char(), | |
1412 | kind: ast::LiteralKind::Verbatim, | |
1413 | c: c, | |
1414 | }); | |
1415 | self.bump(); | |
1416 | Ok(ast) | |
1417 | } | |
1418 | } | |
1419 | } | |
1420 | ||
1421 | /// Parse an escape sequence as a primitive AST. | |
1422 | /// | |
1423 | /// This assumes the parser is positioned at the start of the escape | |
1424 | /// sequence, i.e., `\`. It advances the parser to the first position | |
1425 | /// immediately following the escape sequence. | |
1426 | fn parse_escape(&self) -> Result<Primitive> { | |
1427 | assert_eq!(self.char(), '\\'); | |
1428 | let start = self.pos(); | |
1429 | if !self.bump() { | |
1430 | return Err(self.error( | |
1431 | Span::new(start, self.pos()), | |
1432 | ast::ErrorKind::EscapeUnexpectedEof, | |
1433 | )); | |
1434 | } | |
1435 | let c = self.char(); | |
1436 | // Put some of the more complicated routines into helpers. | |
1437 | match c { | |
1438 | '0'...'7' => { | |
1439 | if !self.parser().octal { | |
1440 | return Err(self.error( | |
1441 | Span::new(start, self.span_char().end), | |
1442 | ast::ErrorKind::UnsupportedBackreference, | |
1443 | )); | |
1444 | } | |
1445 | let mut lit = self.parse_octal(); | |
1446 | lit.span.start = start; | |
1447 | return Ok(Primitive::Literal(lit)); | |
1448 | } | |
1449 | '8'...'9' if !self.parser().octal => { | |
1450 | return Err(self.error( | |
1451 | Span::new(start, self.span_char().end), | |
1452 | ast::ErrorKind::UnsupportedBackreference, | |
1453 | )); | |
1454 | } | |
1455 | 'x' | 'u' | 'U' => { | |
1456 | let mut lit = try!(self.parse_hex()); | |
1457 | lit.span.start = start; | |
1458 | return Ok(Primitive::Literal(lit)); | |
1459 | } | |
1460 | 'p' | 'P' => { | |
1461 | let mut cls = try!(self.parse_unicode_class()); | |
1462 | cls.span.start = start; | |
1463 | return Ok(Primitive::Unicode(cls)); | |
1464 | } | |
1465 | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { | |
1466 | let mut cls = self.parse_perl_class(); | |
1467 | cls.span.start = start; | |
1468 | return Ok(Primitive::Perl(cls)); | |
1469 | } | |
1470 | _ => {} | |
1471 | } | |
1472 | ||
1473 | // Handle all of the one letter sequences inline. | |
1474 | self.bump(); | |
1475 | let span = Span::new(start, self.pos()); | |
1476 | if is_meta_character(c) { | |
1477 | return Ok(Primitive::Literal(ast::Literal { | |
1478 | span: span, | |
1479 | kind: ast::LiteralKind::Punctuation, | |
1480 | c: c, | |
1481 | })); | |
1482 | } | |
1483 | let special = |kind, c| Ok(Primitive::Literal(ast::Literal { | |
1484 | span: span, | |
1485 | kind: ast::LiteralKind::Special(kind), | |
1486 | c: c, | |
1487 | })); | |
1488 | match c { | |
1489 | 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), | |
1490 | 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), | |
1491 | 't' => special(ast::SpecialLiteralKind::Tab, '\t'), | |
1492 | 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), | |
1493 | 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), | |
1494 | 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), | |
1495 | ' ' if self.ignore_whitespace() => { | |
1496 | special(ast::SpecialLiteralKind::Space, ' ') | |
1497 | } | |
1498 | 'A' => Ok(Primitive::Assertion(ast::Assertion { | |
1499 | span: span, | |
1500 | kind: ast::AssertionKind::StartText, | |
1501 | })), | |
1502 | 'z' => Ok(Primitive::Assertion(ast::Assertion { | |
1503 | span: span, | |
1504 | kind: ast::AssertionKind::EndText, | |
1505 | })), | |
1506 | 'b' => Ok(Primitive::Assertion(ast::Assertion { | |
1507 | span: span, | |
1508 | kind: ast::AssertionKind::WordBoundary, | |
1509 | })), | |
1510 | 'B' => Ok(Primitive::Assertion(ast::Assertion { | |
1511 | span: span, | |
1512 | kind: ast::AssertionKind::NotWordBoundary, | |
1513 | })), | |
1514 | _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), | |
1515 | } | |
1516 | } | |
1517 | ||
1518 | /// Parse an octal representation of a Unicode codepoint up to 3 digits | |
1519 | /// long. This expects the parser to be positioned at the first octal | |
1520 | /// digit and advances the parser to the first character immediately | |
1521 | /// following the octal number. This also assumes that parsing octal | |
1522 | /// escapes is enabled. | |
1523 | /// | |
1524 | /// Assuming the preconditions are met, this routine can never fail. | |
1525 | fn parse_octal(&self) -> ast::Literal { | |
1526 | use std::char; | |
1527 | use std::u32; | |
1528 | ||
1529 | assert!(self.parser().octal); | |
1530 | assert!('0' <= self.char() && self.char() <= '7'); | |
1531 | let start = self.pos(); | |
1532 | // Parse up to two more digits. | |
1533 | while | |
1534 | self.bump() && | |
1535 | '0' <= self.char() && self.char() <= '7' && | |
1536 | self.pos().offset - start.offset <= 2 | |
1537 | {} | |
1538 | let end = self.pos(); | |
1539 | let octal = &self.pattern()[start.offset..end.offset]; | |
1540 | // Parsing the octal should never fail since the above guarantees a | |
1541 | // valid number. | |
1542 | let codepoint = | |
1543 | u32::from_str_radix(octal, 8).expect("valid octal number"); | |
1544 | // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no | |
1545 | // invalid Unicode scalar values. | |
1546 | let c = char::from_u32(codepoint).expect("Unicode scalar value"); | |
1547 | ast::Literal { | |
1548 | span: Span::new(start, end), | |
1549 | kind: ast::LiteralKind::Octal, | |
1550 | c: c, | |
1551 | } | |
1552 | } | |
1553 | ||
1554 | /// Parse a hex representation of a Unicode codepoint. This handles both | |
1555 | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to | |
1556 | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to | |
1557 | /// the first character immediately following the hexadecimal literal. | |
1558 | fn parse_hex(&self) -> Result<ast::Literal> { | |
1559 | assert!(self.char() == 'x' | |
1560 | || self.char() == 'u' | |
1561 | || self.char() == 'U'); | |
1562 | ||
1563 | let hex_kind = match self.char() { | |
1564 | 'x' => ast::HexLiteralKind::X, | |
1565 | 'u' => ast::HexLiteralKind::UnicodeShort, | |
1566 | _ => ast::HexLiteralKind::UnicodeLong, | |
1567 | }; | |
1568 | if !self.bump_and_bump_space() { | |
1569 | return Err(self.error( | |
1570 | self.span(), | |
1571 | ast::ErrorKind::EscapeUnexpectedEof, | |
1572 | )); | |
1573 | } | |
1574 | if self.char() == '{' { | |
1575 | self.parse_hex_brace(hex_kind) | |
1576 | } else { | |
1577 | self.parse_hex_digits(hex_kind) | |
1578 | } | |
1579 | } | |
1580 | ||
1581 | /// Parse an N-digit hex representation of a Unicode codepoint. This | |
1582 | /// expects the parser to be positioned at the first digit and will advance | |
1583 | /// the parser to the first character immediately following the escape | |
1584 | /// sequence. | |
1585 | /// | |
1586 | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) | |
1587 | /// or 8 (for `\UNNNNNNNN`). | |
1588 | fn parse_hex_digits( | |
1589 | &self, | |
1590 | kind: ast::HexLiteralKind, | |
1591 | ) -> Result<ast::Literal> { | |
1592 | use std::char; | |
1593 | use std::u32; | |
1594 | ||
1595 | let mut scratch = self.parser().scratch.borrow_mut(); | |
1596 | scratch.clear(); | |
1597 | ||
1598 | let start = self.pos(); | |
1599 | for i in 0..kind.digits() { | |
1600 | if i > 0 && !self.bump_and_bump_space() { | |
1601 | return Err(self.error( | |
1602 | self.span(), | |
1603 | ast::ErrorKind::EscapeUnexpectedEof, | |
1604 | )); | |
1605 | } | |
1606 | if !is_hex(self.char()) { | |
1607 | return Err(self.error( | |
1608 | self.span_char(), | |
1609 | ast::ErrorKind::EscapeHexInvalidDigit, | |
1610 | )); | |
1611 | } | |
1612 | scratch.push(self.char()); | |
1613 | } | |
1614 | // The final bump just moves the parser past the literal, which may | |
1615 | // be EOF. | |
1616 | self.bump_and_bump_space(); | |
1617 | let end = self.pos(); | |
1618 | let hex = scratch.as_str(); | |
1619 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { | |
1620 | None => Err(self.error( | |
1621 | Span::new(start, end), | |
1622 | ast::ErrorKind::EscapeHexInvalid, | |
1623 | )), | |
1624 | Some(c) => Ok(ast::Literal { | |
1625 | span: Span::new(start, end), | |
1626 | kind: ast::LiteralKind::HexFixed(kind), | |
1627 | c: c, | |
1628 | }), | |
1629 | } | |
1630 | } | |
1631 | ||
1632 | /// Parse a hex representation of any Unicode scalar value. This expects | |
1633 | /// the parser to be positioned at the opening brace `{` and will advance | |
1634 | /// the parser to the first character following the closing brace `}`. | |
1635 | fn parse_hex_brace( | |
1636 | &self, | |
1637 | kind: ast::HexLiteralKind, | |
1638 | ) -> Result<ast::Literal> { | |
1639 | use std::char; | |
1640 | use std::u32; | |
1641 | ||
1642 | let mut scratch = self.parser().scratch.borrow_mut(); | |
1643 | scratch.clear(); | |
1644 | ||
1645 | let brace_pos = self.pos(); | |
1646 | let start = self.span_char().end; | |
1647 | while self.bump_and_bump_space() && self.char() != '}' { | |
1648 | if !is_hex(self.char()) { | |
1649 | return Err(self.error( | |
1650 | self.span_char(), | |
1651 | ast::ErrorKind::EscapeHexInvalidDigit, | |
1652 | )); | |
1653 | } | |
1654 | scratch.push(self.char()); | |
1655 | } | |
1656 | if self.is_eof() { | |
1657 | return Err(self.error( | |
1658 | Span::new(brace_pos, self.pos()), | |
1659 | ast::ErrorKind::EscapeUnexpectedEof, | |
1660 | )); | |
1661 | } | |
1662 | let end = self.pos(); | |
1663 | let hex = scratch.as_str(); | |
1664 | assert_eq!(self.char(), '}'); | |
1665 | self.bump_and_bump_space(); | |
1666 | ||
1667 | if hex.is_empty() { | |
1668 | return Err(self.error( | |
1669 | Span::new(brace_pos, self.pos()), | |
1670 | ast::ErrorKind::EscapeHexEmpty, | |
1671 | )); | |
1672 | } | |
1673 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { | |
1674 | None => Err(self.error( | |
1675 | Span::new(start, end), | |
1676 | ast::ErrorKind::EscapeHexInvalid, | |
1677 | )), | |
1678 | Some(c) => Ok(ast::Literal { | |
1679 | span: Span::new(start, self.pos()), | |
1680 | kind: ast::LiteralKind::HexBrace(kind), | |
1681 | c: c, | |
1682 | }), | |
1683 | } | |
1684 | } | |
1685 | ||
1686 | /// Parse a decimal number into a u32 while trimming leading and trailing | |
1687 | /// whitespace. | |
1688 | /// | |
1689 | /// This expects the parser to be positioned at the first position where | |
1690 | /// a decimal digit could occur. This will advance the parser to the byte | |
1691 | /// immediately following the last contiguous decimal digit. | |
1692 | /// | |
1693 | /// If no decimal digit could be found or if there was a problem parsing | |
1694 | /// the complete set of digits into a u32, then an error is returned. | |
1695 | fn parse_decimal(&self) -> Result<u32> { | |
1696 | let mut scratch = self.parser().scratch.borrow_mut(); | |
1697 | scratch.clear(); | |
1698 | ||
1699 | while !self.is_eof() && self.char().is_whitespace() { | |
1700 | self.bump(); | |
1701 | } | |
1702 | let start = self.pos(); | |
1703 | while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { | |
1704 | scratch.push(self.char()); | |
1705 | self.bump_and_bump_space(); | |
1706 | } | |
1707 | let span = Span::new(start, self.pos()); | |
1708 | while !self.is_eof() && self.char().is_whitespace() { | |
1709 | self.bump_and_bump_space(); | |
1710 | } | |
1711 | let digits = scratch.as_str(); | |
1712 | if digits.is_empty() { | |
1713 | return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); | |
1714 | } | |
1715 | match u32::from_str_radix(digits, 10).ok() { | |
1716 | Some(n) => Ok(n), | |
1717 | None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), | |
1718 | } | |
1719 | } | |
1720 | ||
1721 | /// Parse a standard character class consisting primarily of characters or | |
1722 | /// character ranges, but can also contain nested character classes of | |
1723 | /// any type (sans `.`). | |
1724 | /// | |
1725 | /// This assumes the parser is positioned at the opening `[`. If parsing | |
1726 | /// is successful, then the parser is advanced to the position immediately | |
1727 | /// following the closing `]`. | |
1728 | fn parse_set_class(&self) -> Result<ast::Class> { | |
1729 | assert_eq!(self.char(), '['); | |
1730 | ||
1731 | let mut union = ast::ClassSetUnion { | |
1732 | span: self.span(), | |
1733 | items: vec![], | |
1734 | }; | |
1735 | loop { | |
1736 | self.bump_space(); | |
1737 | if self.is_eof() { | |
1738 | return Err(self.unclosed_class_error()); | |
1739 | } | |
1740 | match self.char() { | |
1741 | '[' => { | |
1742 | // If we've already parsed the opening bracket, then | |
1743 | // attempt to treat this as the beginning of an ASCII | |
1744 | // class. If ASCII class parsing fails, then the parser | |
1745 | // backs up to `[`. | |
1746 | if !self.parser().stack_class.borrow().is_empty() { | |
1747 | if let Some(cls) = self.maybe_parse_ascii_class() { | |
1748 | union.push(ast::ClassSetItem::Ascii(cls)); | |
1749 | continue; | |
1750 | } | |
1751 | } | |
1752 | union = try!(self.push_class_open(union)); | |
1753 | } | |
1754 | ']' => { | |
1755 | match try!(self.pop_class(union)) { | |
1756 | Either::Left(nested_union) => { union = nested_union; } | |
1757 | Either::Right(class) => return Ok(class), | |
1758 | } | |
1759 | } | |
1760 | '&' if self.peek() == Some('&') => { | |
1761 | assert!(self.bump_if("&&")); | |
1762 | union = self.push_class_op( | |
1763 | ast::ClassSetBinaryOpKind::Intersection, union); | |
1764 | } | |
1765 | '-' if self.peek() == Some('-') => { | |
1766 | assert!(self.bump_if("--")); | |
1767 | union = self.push_class_op( | |
1768 | ast::ClassSetBinaryOpKind::Difference, union); | |
1769 | } | |
1770 | '~' if self.peek() == Some('~') => { | |
1771 | assert!(self.bump_if("~~")); | |
1772 | union = self.push_class_op( | |
1773 | ast::ClassSetBinaryOpKind::SymmetricDifference, union); | |
1774 | } | |
1775 | _ => { | |
1776 | union.push(try!(self.parse_set_class_range())); | |
1777 | } | |
1778 | } | |
1779 | } | |
1780 | } | |
1781 | ||
1782 | /// Parse a single primitive item in a character class set. The item to | |
1783 | /// be parsed can either be one of a simple literal character, a range | |
1784 | /// between two simple literal characters or a "primitive" character | |
1785 | /// class like \w or \p{Greek}. | |
1786 | /// | |
1787 | /// If an invalid escape is found, or if a character class is found where | |
1788 | /// a simple literal is expected (e.g., in a range), then an error is | |
1789 | /// returned. | |
1790 | fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { | |
1791 | let prim1 = try!(self.parse_set_class_item()); | |
1792 | self.bump_space(); | |
1793 | if self.is_eof() { | |
1794 | return Err(self.unclosed_class_error()); | |
1795 | } | |
1796 | // If the next char isn't a `-`, then we don't have a range. | |
1797 | // There are two exceptions. If the char after a `-` is a `]`, then | |
1798 | // `-` is interpreted as a literal `-`. Alternatively, if the char | |
1799 | // after a `-` is a `-`, then `--` corresponds to a "difference" | |
1800 | // operation. | |
1801 | if self.char() != '-' | |
1802 | || self.peek_space() == Some(']') | |
1803 | || self.peek_space() == Some('-') | |
1804 | { | |
1805 | return prim1.into_class_set_item(self); | |
1806 | } | |
1807 | // OK, now we're parsing a range, so bump past the `-` and parse the | |
1808 | // second half of the range. | |
1809 | if !self.bump_and_bump_space() { | |
1810 | return Err(self.unclosed_class_error()); | |
1811 | } | |
1812 | let prim2 = try!(self.parse_set_class_item()); | |
1813 | let range = ast::ClassSetRange { | |
1814 | span: Span::new(prim1.span().start, prim2.span().end), | |
1815 | start: try!(prim1.into_class_literal(self)), | |
1816 | end: try!(prim2.into_class_literal(self)), | |
1817 | }; | |
1818 | if !range.is_valid() { | |
1819 | return Err(self.error( | |
1820 | range.span, | |
1821 | ast::ErrorKind::ClassRangeInvalid, | |
1822 | )); | |
1823 | } | |
1824 | Ok(ast::ClassSetItem::Range(range)) | |
1825 | } | |
1826 | ||
1827 | /// Parse a single item in a character class as a primitive, where the | |
1828 | /// primitive either consists of a verbatim literal or a single escape | |
1829 | /// sequence. | |
1830 | /// | |
1831 | /// This assumes the parser is positioned at the beginning of a primitive, | |
1832 | /// and advances the parser to the first position after the primitive if | |
1833 | /// successful. | |
1834 | /// | |
1835 | /// Note that it is the caller's responsibility to report an error if an | |
1836 | /// illegal primitive was parsed. | |
1837 | fn parse_set_class_item(&self) -> Result<Primitive> { | |
1838 | if self.char() == '\\' { | |
1839 | self.parse_escape() | |
1840 | } else { | |
1841 | let x = Primitive::Literal(ast::Literal { | |
1842 | span: self.span_char(), | |
1843 | kind: ast::LiteralKind::Verbatim, | |
1844 | c: self.char(), | |
1845 | }); | |
1846 | self.bump(); | |
1847 | Ok(x) | |
1848 | } | |
1849 | } | |
1850 | ||
1851 | /// Parses the opening of a character class set. This includes the opening | |
1852 | /// bracket along with `^` if present to indicate negation. This also | |
1853 | /// starts parsing the opening set of unioned items if applicable, since | |
1854 | /// there are special rules applied to certain characters in the opening | |
1855 | /// of a character class. For example, `[^]]` is the class of all | |
1856 | /// characters not equal to `]`. (`]` would need to be escaped in any other | |
1857 | /// position.) Similarly for `-`. | |
1858 | /// | |
1859 | /// In all cases, the op inside the returned `ast::ClassBracketed` is an | |
1860 | /// empty union. This empty union should be replaced with the actual item | |
1861 | /// when it is popped from the parser's stack. | |
1862 | /// | |
1863 | /// This assumes the parser is positioned at the opening `[` and advances | |
1864 | /// the parser to the first non-special byte of the character class. | |
1865 | /// | |
1866 | /// An error is returned if EOF is found. | |
1867 | fn parse_set_class_open( | |
1868 | &self, | |
1869 | ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { | |
1870 | assert_eq!(self.char(), '['); | |
1871 | let start = self.pos(); | |
1872 | if !self.bump_and_bump_space() { | |
1873 | return Err(self.error( | |
1874 | Span::new(start, self.pos()), | |
1875 | ast::ErrorKind::ClassUnclosed, | |
1876 | )); | |
1877 | } | |
1878 | ||
1879 | let negated = | |
1880 | if self.char() != '^' { | |
1881 | false | |
1882 | } else { | |
1883 | if !self.bump_and_bump_space() { | |
1884 | return Err(self.error( | |
1885 | Span::new(start, self.pos()), | |
1886 | ast::ErrorKind::ClassUnclosed, | |
1887 | )); | |
1888 | } | |
1889 | true | |
1890 | }; | |
1891 | // Accept any number of `-` as literal `-`. | |
1892 | let mut union = ast::ClassSetUnion { | |
1893 | span: self.span(), | |
1894 | items: vec![], | |
1895 | }; | |
1896 | while self.char() == '-' { | |
1897 | union.push(ast::ClassSetItem::Literal(ast::Literal { | |
1898 | span: self.span_char(), | |
1899 | kind: ast::LiteralKind::Verbatim, | |
1900 | c: '-', | |
1901 | })); | |
1902 | if !self.bump_and_bump_space() { | |
1903 | return Err(self.error( | |
1904 | Span::new(start, self.pos()), | |
1905 | ast::ErrorKind::ClassUnclosed, | |
1906 | )); | |
1907 | } | |
1908 | } | |
1909 | // If `]` is the *first* char in a set, then interpret it as a literal | |
1910 | // `]`. That is, an empty class is impossible to write. | |
1911 | if union.items.is_empty() && self.char() == ']' { | |
1912 | union.push(ast::ClassSetItem::Literal(ast::Literal { | |
1913 | span: self.span_char(), | |
1914 | kind: ast::LiteralKind::Verbatim, | |
1915 | c: ']', | |
1916 | })); | |
1917 | if !self.bump_and_bump_space() { | |
1918 | return Err(self.error( | |
1919 | Span::new(start, self.pos()), | |
1920 | ast::ErrorKind::ClassUnclosed, | |
1921 | )); | |
1922 | } | |
1923 | } | |
1924 | let set = ast::ClassBracketed { | |
1925 | span: Span::new(start, self.pos()), | |
1926 | negated: negated, | |
1927 | kind: ast::ClassSet::union(ast::ClassSetUnion { | |
1928 | span: Span::new(union.span.start, union.span.start), | |
1929 | items: vec![], | |
1930 | }), | |
1931 | }; | |
1932 | Ok((set, union)) | |
1933 | } | |
1934 | ||
1935 | /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. | |
1936 | /// | |
1937 | /// This assumes the parser is positioned at the opening `[`. | |
1938 | /// | |
1939 | /// If no valid ASCII character class could be found, then this does not | |
1940 | /// advance the parser and `None` is returned. Otherwise, the parser is | |
1941 | /// advanced to the first byte following the closing `]` and the | |
1942 | /// corresponding ASCII class is returned. | |
1943 | fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { | |
1944 | // ASCII character classes are interesting from a parsing perspective | |
1945 | // because parsing cannot fail with any interesting error. For example, | |
1946 | // in order to use an ASCII character class, it must be enclosed in | |
1947 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think | |
1948 | // of it as "ASCII character characters have the syntax `[:NAME:]` | |
1949 | // which can only appear within character brackets." This means that | |
1950 | // things like `[[:lower:]A]` are legal constructs. | |
1951 | // | |
1952 | // However, if one types an incorrect ASCII character class, e.g., | |
1953 | // `[[:loower:]]`, then we treat that as a normal nested character | |
1954 | // class containing the characters `:elorw`. One might argue that we | |
1955 | // should return an error instead since the repeated colons give away | |
1956 | // the intent to write an ASCII class. But what if the user typed | |
1957 | // `[[:lower]]` instead? How can we tell that was intended to be an | |
1958 | // ASCII class and not just a normal nested class? | |
1959 | // | |
1960 | // Reasonable people can probably disagree over this, but for better | |
1961 | // or worse, we implement semantics that never fails at the expense | |
1962 | // of better failure modes. | |
1963 | assert_eq!(self.char(), '['); | |
1964 | // If parsing fails, then we back up the parser to this starting point. | |
1965 | let start = self.pos(); | |
1966 | let mut negated = false; | |
1967 | if !self.bump() || self.char() != ':' { | |
1968 | self.parser().pos.set(start); | |
1969 | return None; | |
1970 | } | |
1971 | if !self.bump() { | |
1972 | self.parser().pos.set(start); | |
1973 | return None; | |
1974 | } | |
1975 | if self.char() == '^' { | |
1976 | negated = true; | |
1977 | if !self.bump() { | |
1978 | self.parser().pos.set(start); | |
1979 | return None; | |
1980 | } | |
1981 | } | |
1982 | let name_start = self.offset(); | |
1983 | while self.char() != ':' && self.bump() {} | |
1984 | if self.is_eof() { | |
1985 | self.parser().pos.set(start); | |
1986 | return None; | |
1987 | } | |
1988 | let name = &self.pattern()[name_start..self.offset()]; | |
1989 | if !self.bump_if(":]") { | |
1990 | self.parser().pos.set(start); | |
1991 | return None; | |
1992 | } | |
1993 | let kind = match ast::ClassAsciiKind::from_name(name) { | |
1994 | Some(kind) => kind, | |
1995 | None => { | |
1996 | self.parser().pos.set(start); | |
1997 | return None; | |
1998 | } | |
1999 | }; | |
2000 | Some(ast::ClassAscii { | |
2001 | span: Span::new(start, self.pos()), | |
2002 | kind: kind, | |
2003 | negated: negated, | |
2004 | }) | |
2005 | } | |
2006 | ||
2007 | /// Parse a Unicode class in either the single character notation, `\pN` | |
2008 | /// or the multi-character bracketed notation, `\p{Greek}`. This assumes | |
2009 | /// the parser is positioned at the `p` (or `P` for negation) and will | |
2010 | /// advance the parser to the character immediately following the class. | |
2011 | /// | |
2012 | /// Note that this does not check whether the class name is valid or not. | |
2013 | fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { | |
2014 | assert!(self.char() == 'p' || self.char() == 'P'); | |
2015 | ||
2016 | let mut scratch = self.parser().scratch.borrow_mut(); | |
2017 | scratch.clear(); | |
2018 | ||
2019 | let negated = self.char() == 'P'; | |
2020 | if !self.bump_and_bump_space() { | |
2021 | return Err(self.error( | |
2022 | self.span(), | |
2023 | ast::ErrorKind::EscapeUnexpectedEof, | |
2024 | )); | |
2025 | } | |
2026 | let (start, kind) = | |
2027 | if self.char() == '{' { | |
2028 | let start = self.span_char().end; | |
2029 | while self.bump_and_bump_space() && self.char() != '}' { | |
2030 | scratch.push(self.char()); | |
2031 | } | |
2032 | if self.is_eof() { | |
2033 | return Err(self.error( | |
2034 | self.span(), | |
2035 | ast::ErrorKind::EscapeUnexpectedEof, | |
2036 | )); | |
2037 | } | |
2038 | assert_eq!(self.char(), '}'); | |
2039 | self.bump(); | |
2040 | ||
2041 | let name = scratch.as_str(); | |
2042 | if let Some(i) = name.find("!=") { | |
2043 | (start, ast::ClassUnicodeKind::NamedValue { | |
2044 | op: ast::ClassUnicodeOpKind::NotEqual, | |
2045 | name: name[..i].to_string(), | |
2046 | value: name[i+2..].to_string(), | |
2047 | }) | |
2048 | } else if let Some(i) = name.find(':') { | |
2049 | (start, ast::ClassUnicodeKind::NamedValue { | |
2050 | op: ast::ClassUnicodeOpKind::Colon, | |
2051 | name: name[..i].to_string(), | |
2052 | value: name[i+1..].to_string(), | |
2053 | }) | |
2054 | } else if let Some(i) = name.find('=') { | |
2055 | (start, ast::ClassUnicodeKind::NamedValue { | |
2056 | op: ast::ClassUnicodeOpKind::Equal, | |
2057 | name: name[..i].to_string(), | |
2058 | value: name[i+1..].to_string(), | |
2059 | }) | |
2060 | } else { | |
2061 | (start, ast::ClassUnicodeKind::Named(name.to_string())) | |
2062 | } | |
2063 | } else { | |
2064 | let start = self.pos(); | |
2065 | let c = self.char(); | |
2066 | self.bump_and_bump_space(); | |
2067 | let kind = ast::ClassUnicodeKind::OneLetter(c); | |
2068 | (start, kind) | |
2069 | }; | |
2070 | Ok(ast::ClassUnicode { | |
2071 | span: Span::new(start, self.pos()), | |
2072 | negated: negated, | |
2073 | kind: kind, | |
2074 | }) | |
2075 | } | |
2076 | ||
2077 | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the | |
2078 | /// parser is currently at a valid character class name and will be | |
2079 | /// advanced to the character immediately following the class. | |
2080 | fn parse_perl_class(&self) -> ast::ClassPerl { | |
2081 | let c = self.char(); | |
2082 | let span = self.span_char(); | |
2083 | self.bump(); | |
2084 | let (negated, kind) = match c { | |
2085 | 'd' => (false, ast::ClassPerlKind::Digit), | |
2086 | 'D' => (true, ast::ClassPerlKind::Digit), | |
2087 | 's' => (false, ast::ClassPerlKind::Space), | |
2088 | 'S' => (true, ast::ClassPerlKind::Space), | |
2089 | 'w' => (false, ast::ClassPerlKind::Word), | |
2090 | 'W' => (true, ast::ClassPerlKind::Word), | |
2091 | c => panic!("expected valid Perl class but got '{}'", c), | |
2092 | }; | |
2093 | ast::ClassPerl { span: span, kind: kind, negated: negated } | |
2094 | } | |
2095 | } | |
2096 | ||
2097 | /// A type that traverses a fully parsed Ast and checks whether its depth | |
2098 | /// exceeds the specified nesting limit. If it does, then an error is returned. | |
2099 | #[derive(Debug)] | |
2100 | struct NestLimiter<'p, 's: 'p, P: 'p + 's> { | |
2101 | /// The parser that is checking the nest limit. | |
2102 | p: &'p ParserI<'s, P>, | |
2103 | /// The current depth while walking an Ast. | |
2104 | depth: u32, | |
2105 | } | |
2106 | ||
2107 | impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> { | |
2108 | fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { | |
2109 | NestLimiter { p: p, depth: 0 } | |
2110 | } | |
2111 | ||
2112 | fn check(self, ast: &Ast) -> Result<()> { | |
2113 | ast::visit(ast, self) | |
2114 | } | |
2115 | ||
2116 | fn increment_depth(&mut self, span: &Span) -> Result<()> { | |
2117 | let new = try!(self.depth.checked_add(1).ok_or_else(|| self.p.error( | |
2118 | span.clone(), | |
2119 | ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), | |
2120 | ))); | |
2121 | let limit = self.p.parser().nest_limit; | |
2122 | if new > limit { | |
2123 | return Err(self.p.error( | |
2124 | span.clone(), | |
2125 | ast::ErrorKind::NestLimitExceeded(limit), | |
2126 | )); | |
2127 | } | |
2128 | self.depth = new; | |
2129 | Ok(()) | |
2130 | } | |
2131 | ||
2132 | fn decrement_depth(&mut self) { | |
2133 | // Assuming the correctness of the visitor, this should never drop | |
2134 | // below 0. | |
2135 | self.depth = self.depth.checked_sub(1).unwrap(); | |
2136 | } | |
2137 | } | |
2138 | ||
2139 | impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { | |
2140 | type Output = (); | |
2141 | type Err = ast::Error; | |
2142 | ||
2143 | fn finish(self) -> Result<()> { | |
2144 | Ok(()) | |
2145 | } | |
2146 | ||
2147 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { | |
2148 | let span = match *ast { | |
2149 | Ast::Empty(_) | |
2150 | | Ast::Flags(_) | |
2151 | | Ast::Literal(_) | |
2152 | | Ast::Dot(_) | |
2153 | | Ast::Assertion(_) | |
2154 | | Ast::Class(ast::Class::Unicode(_)) | |
2155 | | Ast::Class(ast::Class::Perl(_)) => { | |
2156 | // These are all base cases, so we don't increment depth. | |
2157 | return Ok(()); | |
2158 | } | |
2159 | Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, | |
2160 | Ast::Repetition(ref x) => &x.span, | |
2161 | Ast::Group(ref x) => &x.span, | |
2162 | Ast::Alternation(ref x) => &x.span, | |
2163 | Ast::Concat(ref x) => &x.span, | |
2164 | }; | |
2165 | self.increment_depth(span) | |
2166 | } | |
2167 | ||
2168 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { | |
2169 | match *ast { | |
2170 | Ast::Empty(_) | |
2171 | | Ast::Flags(_) | |
2172 | | Ast::Literal(_) | |
2173 | | Ast::Dot(_) | |
2174 | | Ast::Assertion(_) | |
2175 | | Ast::Class(ast::Class::Unicode(_)) | |
2176 | | Ast::Class(ast::Class::Perl(_)) => { | |
2177 | // These are all base cases, so we don't decrement depth. | |
2178 | Ok(()) | |
2179 | } | |
2180 | Ast::Class(ast::Class::Bracketed(_)) | |
2181 | | Ast::Repetition(_) | |
2182 | | Ast::Group(_) | |
2183 | | Ast::Alternation(_) | |
2184 | | Ast::Concat(_) => { | |
2185 | self.decrement_depth(); | |
2186 | Ok(()) | |
2187 | } | |
2188 | } | |
2189 | } | |
2190 | ||
2191 | fn visit_class_set_item_pre( | |
2192 | &mut self, | |
2193 | ast: &ast::ClassSetItem, | |
2194 | ) -> Result<()> { | |
2195 | let span = match *ast { | |
2196 | ast::ClassSetItem::Empty(_) | |
2197 | | ast::ClassSetItem::Literal(_) | |
2198 | | ast::ClassSetItem::Range(_) | |
2199 | | ast::ClassSetItem::Ascii(_) | |
2200 | | ast::ClassSetItem::Unicode(_) | |
2201 | | ast::ClassSetItem::Perl(_) => { | |
2202 | // These are all base cases, so we don't increment depth. | |
2203 | return Ok(()); | |
2204 | } | |
2205 | ast::ClassSetItem::Bracketed(ref x) => &x.span, | |
2206 | ast::ClassSetItem::Union(ref x) => &x.span, | |
2207 | }; | |
2208 | self.increment_depth(span) | |
2209 | } | |
2210 | ||
2211 | fn visit_class_set_item_post( | |
2212 | &mut self, | |
2213 | ast: &ast::ClassSetItem, | |
2214 | ) -> Result<()> { | |
2215 | match *ast { | |
2216 | ast::ClassSetItem::Empty(_) | |
2217 | | ast::ClassSetItem::Literal(_) | |
2218 | | ast::ClassSetItem::Range(_) | |
2219 | | ast::ClassSetItem::Ascii(_) | |
2220 | | ast::ClassSetItem::Unicode(_) | |
2221 | | ast::ClassSetItem::Perl(_) => { | |
2222 | // These are all base cases, so we don't decrement depth. | |
2223 | Ok(()) | |
2224 | } | |
2225 | ast::ClassSetItem::Bracketed(_) | |
2226 | | ast::ClassSetItem::Union(_) => { | |
2227 | self.decrement_depth(); | |
2228 | Ok(()) | |
2229 | } | |
2230 | } | |
2231 | } | |
2232 | ||
2233 | fn visit_class_set_binary_op_pre( | |
2234 | &mut self, | |
2235 | ast: &ast::ClassSetBinaryOp, | |
2236 | ) -> Result<()> { | |
2237 | self.increment_depth(&ast.span) | |
2238 | } | |
2239 | ||
2240 | fn visit_class_set_binary_op_post( | |
2241 | &mut self, | |
2242 | _ast: &ast::ClassSetBinaryOp, | |
2243 | ) -> Result<()> { | |
2244 | self.decrement_depth(); | |
2245 | Ok(()) | |
2246 | } | |
2247 | } | |
2248 | ||
2249 | #[cfg(test)] | |
2250 | mod tests { | |
2251 | use std::ops::Range; | |
2252 | ||
2253 | use ast::{self, Ast, Position, Span}; | |
2254 | use super::{Parser, ParserI, ParserBuilder, Primitive}; | |
2255 | ||
2256 | // Our own assert_eq, which has slightly better formatting (but honestly | |
2257 | // still kind of crappy). | |
2258 | macro_rules! assert_eq { | |
2259 | ($left:expr, $right:expr) => ({ | |
2260 | match (&$left, &$right) { | |
2261 | (left_val, right_val) => { | |
2262 | if !(*left_val == *right_val) { | |
2263 | panic!("assertion failed: `(left == right)`\n\n\ | |
2264 | left: `{:?}`\nright: `{:?}`\n\n", | |
2265 | left_val, right_val) | |
2266 | } | |
2267 | } | |
2268 | } | |
2269 | }); | |
2270 | } | |
2271 | ||
2272 | // We create these errors to compare with real ast::Errors in the tests. | |
2273 | // We define equality between TestError and ast::Error to disregard the | |
2274 | // pattern string in ast::Error, which is annoying to provide in tests. | |
2275 | #[derive(Clone, Debug)] | |
2276 | struct TestError { | |
2277 | span: Span, | |
2278 | kind: ast::ErrorKind, | |
2279 | } | |
2280 | ||
2281 | impl PartialEq<ast::Error> for TestError { | |
2282 | fn eq(&self, other: &ast::Error) -> bool { | |
2283 | self.span == other.span && self.kind == other.kind | |
2284 | } | |
2285 | } | |
2286 | ||
2287 | impl PartialEq<TestError> for ast::Error { | |
2288 | fn eq(&self, other: &TestError) -> bool { | |
2289 | self.span == other.span && self.kind == other.kind | |
2290 | } | |
2291 | } | |
2292 | ||
2293 | fn s(str: &str) -> String { | |
2294 | str.to_string() | |
2295 | } | |
2296 | ||
2297 | fn parser(pattern: &str) -> ParserI<Parser> { | |
2298 | ParserI::new(Parser::new(), pattern) | |
2299 | } | |
2300 | ||
2301 | fn parser_octal(pattern: &str) -> ParserI<Parser> { | |
2302 | let parser = ParserBuilder::new().octal(true).build(); | |
2303 | ParserI::new(parser, pattern) | |
2304 | } | |
2305 | ||
2306 | fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> { | |
2307 | let p = ParserBuilder::new().nest_limit(nest_limit).build(); | |
2308 | ParserI::new(p, pattern) | |
2309 | } | |
2310 | ||
2311 | fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> { | |
2312 | let p = ParserBuilder::new().ignore_whitespace(true).build(); | |
2313 | ParserI::new(p, pattern) | |
2314 | } | |
2315 | ||
2316 | /// Short alias for creating a new span. | |
2317 | fn nspan(start: Position, end: Position) -> Span { | |
2318 | Span::new(start, end) | |
2319 | } | |
2320 | ||
2321 | /// Short alias for creating a new position. | |
2322 | fn npos(offset: usize, line: usize, column: usize) -> Position { | |
2323 | Position::new(offset, line, column) | |
2324 | } | |
2325 | ||
2326 | /// Create a new span from the given offset range. This assumes a single | |
2327 | /// line and sets the columns based on the offsets. i.e., This only works | |
2328 | /// out of the box for ASCII, which is fine for most tests. | |
2329 | fn span(range: Range<usize>) -> Span { | |
2330 | let start = Position::new(range.start, 1, range.start + 1); | |
2331 | let end = Position::new(range.end, 1, range.end + 1); | |
2332 | Span::new(start, end) | |
2333 | } | |
2334 | ||
2335 | /// Create a new span for the corresponding byte range in the given string. | |
2336 | fn span_range(subject: &str, range: Range<usize>) -> Span { | |
2337 | let start = Position { | |
2338 | offset: range.start, | |
2339 | line: 1 + subject[..range.start].matches('\n').count(), | |
2340 | column: 1 + subject[..range.start] | |
2341 | .chars() | |
2342 | .rev() | |
2343 | .position(|c| c == '\n') | |
2344 | .unwrap_or(subject[..range.start].chars().count()), | |
2345 | }; | |
2346 | let end = Position { | |
2347 | offset: range.end, | |
2348 | line: 1 + subject[..range.end].matches('\n').count(), | |
2349 | column: 1 + subject[..range.end] | |
2350 | .chars() | |
2351 | .rev() | |
2352 | .position(|c| c == '\n') | |
2353 | .unwrap_or(subject[..range.end].chars().count()), | |
2354 | }; | |
2355 | Span::new(start, end) | |
2356 | } | |
2357 | ||
2358 | /// Create a verbatim literal starting at the given position. | |
2359 | fn lit(c: char, start: usize) -> Ast { | |
2360 | lit_with(c, span(start..start + c.len_utf8())) | |
2361 | } | |
2362 | ||
2363 | /// Create a punctuation literal starting at the given position. | |
2364 | fn punct_lit(c: char, span: Span) -> Ast { | |
2365 | Ast::Literal(ast::Literal { | |
2366 | span: span, | |
2367 | kind: ast::LiteralKind::Punctuation, | |
2368 | c: c, | |
2369 | }) | |
2370 | } | |
2371 | ||
2372 | /// Create a verbatim literal with the given span. | |
2373 | fn lit_with(c: char, span: Span) -> Ast { | |
2374 | Ast::Literal(ast::Literal { | |
2375 | span: span, | |
2376 | kind: ast::LiteralKind::Verbatim, | |
2377 | c: c, | |
2378 | }) | |
2379 | } | |
2380 | ||
2381 | /// Create a concatenation with the given range. | |
2382 | fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { | |
2383 | concat_with(span(range), asts) | |
2384 | } | |
2385 | ||
2386 | /// Create a concatenation with the given span. | |
2387 | fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { | |
2388 | Ast::Concat(ast::Concat { span: span, asts: asts }) | |
2389 | } | |
2390 | ||
2391 | /// Create an alternation with the given span. | |
2392 | fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { | |
2393 | Ast::Alternation(ast::Alternation { span: span(range), asts: asts }) | |
2394 | } | |
2395 | ||
2396 | /// Create a capturing group with the given span. | |
2397 | fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { | |
2398 | Ast::Group(ast::Group { | |
2399 | span: span(range), | |
2400 | kind: ast::GroupKind::CaptureIndex(index), | |
2401 | ast: Box::new(ast), | |
2402 | }) | |
2403 | } | |
2404 | ||
2405 | /// Create an ast::SetFlags. | |
2406 | /// | |
2407 | /// The given pattern should be the full pattern string. The range given | |
2408 | /// should correspond to the byte offsets where the flag set occurs. | |
2409 | /// | |
2410 | /// If negated is true, then the set is interpreted as beginning with a | |
2411 | /// negation. | |
2412 | fn flag_set( | |
2413 | pat: &str, | |
2414 | range: Range<usize>, | |
2415 | flag: ast::Flag, | |
2416 | negated: bool, | |
2417 | ) -> Ast { | |
2418 | let mut items = vec![ | |
2419 | ast::FlagsItem { | |
2420 | span: span_range(pat, (range.end - 2)..(range.end - 1)), | |
2421 | kind: ast::FlagsItemKind::Flag(flag), | |
2422 | }, | |
2423 | ]; | |
2424 | if negated { | |
2425 | items.insert(0, ast::FlagsItem { | |
2426 | span: span_range(pat, (range.start + 2)..(range.end - 2)), | |
2427 | kind: ast::FlagsItemKind::Negation, | |
2428 | }); | |
2429 | } | |
2430 | Ast::Flags(ast::SetFlags { | |
2431 | span: span_range(pat, range.clone()), | |
2432 | flags: ast::Flags { | |
2433 | span: span_range(pat, (range.start + 2)..(range.end - 1)), | |
2434 | items: items, | |
2435 | }, | |
2436 | }) | |
2437 | } | |
2438 | ||
/// Exercises the parser's nest limit against repetitions, concatenations,
/// alternations and bracketed character classes.
#[test]
fn parse_nest_limit() {
    // Error expected when a pattern nests deeper than `limit` permits.
    let too_deep = |range: Range<usize>, limit| TestError {
        span: span(range),
        kind: ast::ErrorKind::NestLimitExceeded(limit),
    };
    // A greedy repetition of `inner` with the given spans and operator kind.
    let greedy_rep = |whole: Range<usize>, op: Range<usize>, kind, inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: true,
            ast: Box::new(inner),
        })
    };
    // The AST for `a+` starting at offset 0.
    let a_plus = || {
        greedy_rep(0..2, 1..2, ast::RepetitionKind::OneOrMore, lit('a', 0))
    };

    // Even with a limit of 0, patterns that need no nesting are accepted.
    assert_eq!(
        parser_nest_limit("", 0).parse(),
        Ok(Ast::Empty(span(0..0))));
    assert_eq!(
        parser_nest_limit("a", 0).parse(),
        Ok(lit('a', 0)));

    // Each repetition operator costs one level of nesting.
    assert_eq!(
        parser_nest_limit("a+", 0).parse().unwrap_err(),
        too_deep(0..2, 0));
    assert_eq!(
        parser_nest_limit("a+", 1).parse(),
        Ok(a_plus()));
    assert_eq!(
        parser_nest_limit("(a)+", 1).parse().unwrap_err(),
        too_deep(0..3, 1));
    assert_eq!(
        parser_nest_limit("a+*", 1).parse().unwrap_err(),
        too_deep(0..2, 1));
    assert_eq!(
        parser_nest_limit("a+*", 2).parse(),
        Ok(greedy_rep(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            a_plus(),
        )));

    // A concatenation costs one level of nesting, however long it is.
    assert_eq!(
        parser_nest_limit("ab", 0).parse().unwrap_err(),
        too_deep(0..2, 0));
    assert_eq!(
        parser_nest_limit("ab", 1).parse(),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])));
    assert_eq!(
        parser_nest_limit("abc", 1).parse(),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])));

    // An alternation likewise costs one level of nesting.
    assert_eq!(
        parser_nest_limit("a|b", 0).parse().unwrap_err(),
        too_deep(0..3, 0));
    assert_eq!(
        parser_nest_limit("a|b", 1).parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])));
    assert_eq!(
        parser_nest_limit("a|b|c", 1).parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])));

    // Character classes have their own small recursive syntax, so they are
    // subject to the limit as well.
    assert_eq!(
        parser_nest_limit("[a]", 0).parse().unwrap_err(),
        too_deep(0..3, 0));
    let class_a = Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
        span: span(0..3),
        negated: false,
        kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(ast::Literal {
            span: span(1..2),
            kind: ast::LiteralKind::Verbatim,
            c: 'a',
        })),
    }));
    assert_eq!(
        parser_nest_limit("[a]", 1).parse(),
        Ok(class_a));
    assert_eq!(
        parser_nest_limit("[ab]", 1).parse().unwrap_err(),
        too_deep(1..3, 1));
    assert_eq!(
        parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
        too_deep(3..7, 2));
    assert_eq!(
        parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
        too_deep(4..6, 3));
    assert_eq!(
        parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
        too_deep(1..5, 1));
    assert_eq!(
        parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
        too_deep(4..6, 2));
}
2579 | ||
/// Checks that `parse_with_comments` returns both the AST (with comments
/// and insignificant whitespace skipped) and the list of comments, each
/// with the expected byte span.
#[test]
fn parse_comments() {
    // Every offset asserted below depends on the exact bytes of this
    // pattern, including the two spaces in front of comment 3 and the
    // newline that terminates each of the first three comments. Spelling the
    // pattern with explicit escapes keeps those bytes stable no matter how
    // this file is indented or re-flowed.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "\u{20}\u{20}# This is comment 3.\n",
        "bar\n",
        "# This is comment 4."
    );
    // A one-byte literal located at byte offset `at` of `pat`.
    let lit_at = |c: char, at: usize| {
        lit_with(c, span_range(pat, at..at + 1))
    };
    // A comment occupying `range` of `pat` whose text (without `#`) is `text`.
    let comment = |range: Range<usize>, text| ast::Comment {
        span: span_range(pat, range),
        comment: s(text),
    };

    let astc = parser(pat).parse_with_comments().unwrap();
    assert_eq!(
        astc.ast,
        concat_with(span_range(pat, 0..pat.len()), vec![
            flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
            lit_at('f', 26),
            lit_at('o', 27),
            lit_at('o', 28),
            lit_at('b', 74),
            lit_at('a', 75),
            lit_at('r', 76),
        ]));
    // Comments 1-3 include their trailing newline in the span; comment 4 runs
    // to the end of the pattern.
    assert_eq!(astc.comments, vec![
        comment(5..26, " This is comment 1."),
        comment(30..51, " This is comment 2."),
        comment(53..74, " This is comment 3."),
        comment(78..98, " This is comment 4."),
    ]);
}
2619 | ||
/// Parses a lone `]` as a literal, and a run of escaped punctuation
/// characters as a concatenation of punctuation literals.
#[test]
fn parse_holistic() {
    assert_eq!(
        parser("]").parse(),
        Ok(lit(']', 0)));

    // Each escape in the pattern is two bytes wide (a backslash plus the
    // escaped character), so the i-th escape spans `2*i..2*i + 2`.
    let escaped = r"\.+*?()|[]{}^$#&-~";
    let expected: Vec<Ast> = escaped
        .chars()
        .enumerate()
        .map(|(i, c)| punct_lit(c, span((2 * i)..(2 * i + 2))))
        .collect();
    assert_eq!(
        parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
        Ok(concat(0..36, expected)));
}
2648 | ||
/// Checks whitespace-insensitive (`x`) mode: enabling it, toggling it off
/// again, scoping it to a group, and tolerating whitespace after an opening
/// parenthesis, inside a `\x{..}` escape and after a backslash.
#[test]
fn parse_ignore_whitespace() {
    // The `(?x)` directive that opens most patterns below, at bytes 0..4.
    let x_on = |pat: &str| {
        flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false)
    };
    // A span on line 1 whose columns are the byte offsets plus one.
    let line1 = |start: usize, end: usize| {
        nspan(npos(start, 1, start + 1), npos(end, 1, end + 1))
    };

    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(line1(0, 7), vec![
            x_on(pat),
            lit_with('a', line1(4, 5)),
            lit_with('b', line1(6, 7)),
        ])));

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(line1(0, 15), vec![
            x_on(pat),
            lit_with('a', line1(4, 5)),
            lit_with('b', line1(6, 7)),
            flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
            lit_with('a', line1(12, 13)),
            lit_with(' ', line1(13, 14)),
            lit_with('b', line1(14, 15)),
        ])));

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    let x_item = ast::FlagsItem {
        span: span_range(pat, 4..5),
        kind: ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace),
    };
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..11), vec![
            lit_with('a', span_range(pat, 0..1)),
            lit_with(' ', span_range(pat, 1..2)),
            Ast::Group(ast::Group {
                span: span_range(pat, 2..9),
                kind: ast::GroupKind::NonCapturing(ast::Flags {
                    span: span_range(pat, 4..5),
                    items: vec![x_item],
                }),
                ast: Box::new(lit_with('a', span_range(pat, 6..7))),
            }),
            lit_with('a', span_range(pat, 9..10)),
            lit_with(' ', span_range(pat, 10..11)),
        ])));

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            x_on(pat),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::CaptureName(ast::CaptureName {
                    span: span_range(pat, 9..12),
                    name: s("foo"),
                    index: 1,
                }),
                ast: Box::new(lit_with('a', span_range(pat, 14..15))),
            }),
        ])));
    // The next two patterns rely on *two* spaces in a row: the literal `a`
    // is asserted at 7..8 here, and at 11..12 (with the empty flag set at
    // 8..8) in the pattern after. The spaces are written as `\u{20}` so the
    // count is explicit and cannot be collapsed by reformatting.
    let pat = "(?x)(\u{20}\u{20}a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            x_on(pat),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::CaptureIndex(1),
                ast: Box::new(lit_with('a', span_range(pat, 7..8))),
            }),
        ])));
    let pat = "(?x)(\u{20}\u{20}?:\u{20}\u{20}a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            x_on(pat),
            Ast::Group(ast::Group {
                span: span_range(pat, 4..pat.len()),
                kind: ast::GroupKind::NonCapturing(ast::Flags {
                    span: span_range(pat, 8..8),
                    items: vec![],
                }),
                ast: Box::new(lit_with('a', span_range(pat, 11..12))),
            }),
        ])));
    let pat = r"(?x)\x { 53 }";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            x_on(pat),
            Ast::Literal(ast::Literal {
                span: span(4..13),
                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
                c: 'S',
            }),
        ])));

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), vec![
            x_on(pat),
            Ast::Literal(ast::Literal {
                span: span_range(pat, 4..6),
                kind: ast::LiteralKind::Special(
                    ast::SpecialLiteralKind::Space),
                c: ' ',
            }),
        ])));
    // ... but only when `x` mode is enabled.
    let pat = r"\ ";
    assert_eq!(
        parser(pat).parse().unwrap_err(),
        TestError {
            span: span_range(pat, 0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        });
}
2775 | ||
/// Checks that newlines in a pattern are parsed as literals and that the
/// line/column of every following position is advanced accordingly.
#[test]
fn parse_newlines() {
    let pat = ".\n.";
    let expected = vec![
        Ast::Dot(span_range(pat, 0..1)),
        lit_with('\n', span_range(pat, 1..2)),
        Ast::Dot(span_range(pat, 2..3)),
    ];
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..3), expected)));

    let pat = "foobar\nbaz\nquux\n";
    // One row per character: the character, then the (offset, line, column)
    // of its start and end positions. A newline ends on column 1 of the
    // following line.
    let rows = [
        ('f', (0, 1, 1), (1, 1, 2)),
        ('o', (1, 1, 2), (2, 1, 3)),
        ('o', (2, 1, 3), (3, 1, 4)),
        ('b', (3, 1, 4), (4, 1, 5)),
        ('a', (4, 1, 5), (5, 1, 6)),
        ('r', (5, 1, 6), (6, 1, 7)),
        ('\n', (6, 1, 7), (7, 2, 1)),
        ('b', (7, 2, 1), (8, 2, 2)),
        ('a', (8, 2, 2), (9, 2, 3)),
        ('z', (9, 2, 3), (10, 2, 4)),
        ('\n', (10, 2, 4), (11, 3, 1)),
        ('q', (11, 3, 1), (12, 3, 2)),
        ('u', (12, 3, 2), (13, 3, 3)),
        ('u', (13, 3, 3), (14, 3, 4)),
        ('x', (14, 3, 4), (15, 3, 5)),
        ('\n', (15, 3, 5), (16, 4, 1)),
    ];
    let expected: Vec<Ast> = rows
        .iter()
        .map(|&(c, (so, sl, sc), (eo, el, ec))| {
            lit_with(c, nspan(npos(so, sl, sc), npos(eo, el, ec)))
        })
        .collect();
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), expected)));
}
2809 | ||
/// Checks parsing of the uncounted repetition operators `*`, `+` and `?`
/// (including the lazy `??` form), their spans inside concatenations,
/// groups and alternations, and the `RepetitionMissing` error raised when
/// an operator has nothing to repeat.
#[test]
fn parse_uncounted_repetition() {
    // A repetition of `inner`: `whole` spans the entire expression and `op`
    // spans just the operator (including a trailing `?` when lazy).
    let rep = |whole: Range<usize>,
               op: Range<usize>,
               kind,
               greedy: bool,
               inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: greedy,
            ast: Box::new(inner),
        })
    };
    // Error reported at the empty span `at..at` where an operand was expected.
    let missing = |at: usize| TestError {
        span: span(at..at),
        kind: ast::ErrorKind::RepetitionMissing,
    };

    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(0..2, 1..2, ast::RepetitionKind::ZeroOrMore, true,
               lit('a', 0))));
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(0..2, 1..2, ast::RepetitionKind::OneOrMore, true,
               lit('a', 0))));

    // `?` on its own is greedy; a second `?` makes it lazy and widens the
    // operator span. (The original asserted the plain `a?` case twice; the
    // verbatim duplicate has been dropped.)
    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(0..2, 1..2, ast::RepetitionKind::ZeroOrOne, true,
               lit('a', 0))));
    assert_eq!(
        parser(r"a??").parse(),
        Ok(rep(0..3, 1..3, ast::RepetitionKind::ZeroOrOne, false,
               lit('a', 0))));
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(0..3, vec![
            rep(0..2, 1..2, ast::RepetitionKind::ZeroOrOne, true,
                lit('a', 0)),
            lit('b', 2),
        ])));
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(0..4, vec![
            rep(0..3, 1..3, ast::RepetitionKind::ZeroOrOne, false,
                lit('a', 0)),
            lit('b', 3),
        ])));
    // The operator binds only to the expression immediately before it.
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(0..3, vec![
            lit('a', 0),
            rep(1..3, 2..3, ast::RepetitionKind::ZeroOrOne, true,
                lit('b', 1)),
        ])));
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(rep(
            0..5,
            4..5,
            ast::RepetitionKind::ZeroOrOne,
            true,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)])),
        )));
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(0..3, vec![
            Ast::Empty(span(0..0)),
            rep(1..3, 2..3, ast::RepetitionKind::ZeroOrOne, true,
                lit('a', 1)),
        ])));

    // An operator with no operand is an error located at the (empty)
    // position where the operand should have been.
    let dangling = [
        (r"*", 0),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for &(pat, at) in dangling.iter() {
        assert_eq!(parser(pat).parse().unwrap_err(), missing(at));
    }
}
2994 | ||
/// Checks parsing of counted repetitions (`{n}`, `{n,}`, `{n,m}`), their
/// lazy forms, whitespace inside the braces, and the errors produced by
/// malformed counts.
#[test]
fn parse_counted_repetition() {
    // A counted repetition of `inner`: `whole` spans the entire expression
    // and `op` spans the `{..}` operator (plus a trailing `?` when lazy).
    let counted = |whole: Range<usize>,
                   op: Range<usize>,
                   range,
                   greedy: bool,
                   inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp {
                span: span(op),
                kind: ast::RepetitionKind::Range(range),
            },
            greedy: greedy,
            ast: Box::new(inner),
        })
    };
    // Expected parse error of the given kind at the given span.
    let failure = |range: Range<usize>, kind| TestError {
        span: span(range),
        kind: kind,
    };

    assert_eq!(
        parser(r"a{5}").parse(),
        Ok(counted(0..4, 1..4, ast::RepetitionRange::Exactly(5), true,
                   lit('a', 0))));
    assert_eq!(
        parser(r"a{5,}").parse(),
        Ok(counted(0..5, 1..5, ast::RepetitionRange::AtLeast(5), true,
                   lit('a', 0))));
    assert_eq!(
        parser(r"a{5,9}").parse(),
        Ok(counted(0..6, 1..6, ast::RepetitionRange::Bounded(5, 9), true,
                   lit('a', 0))));
    assert_eq!(
        parser(r"a{5}?").parse(),
        Ok(counted(0..5, 1..5, ast::RepetitionRange::Exactly(5), false,
                   lit('a', 0))));
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(0..5, vec![
            lit('a', 0),
            counted(1..5, 2..5, ast::RepetitionRange::Exactly(5), true,
                    lit('b', 1)),
        ])));
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(0..6, vec![
            lit('a', 0),
            counted(1..5, 2..5, ast::RepetitionRange::Exactly(5), true,
                    lit('b', 1)),
            lit('c', 5),
        ])));

    // Whitespace inside the braces, and (in whitespace-insensitive mode)
    // between the closing brace and a lazy `?`.
    assert_eq!(
        parser(r"a{ 5 }").parse(),
        Ok(counted(0..6, 1..6, ast::RepetitionRange::Exactly(5), true,
                   lit('a', 0))));
    assert_eq!(
        parser(r"a{ 5 , 9 }").parse(),
        Ok(counted(0..10, 1..10, ast::RepetitionRange::Bounded(5, 9), true,
                   lit('a', 0))));
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(0..8, 1..8, ast::RepetitionRange::Bounded(5, 9), false,
                   lit('a', 0))));

    // Malformed counts.
    assert_eq!(
        parser(r"a{").parse().unwrap_err(),
        failure(1..2, ast::ErrorKind::RepetitionCountUnclosed));
    assert_eq!(
        parser(r"a{}").parse().unwrap_err(),
        failure(2..2, ast::ErrorKind::DecimalEmpty));
    assert_eq!(
        parser(r"a{a").parse().unwrap_err(),
        failure(2..2, ast::ErrorKind::DecimalEmpty));
    assert_eq!(
        parser(r"a{9999999999}").parse().unwrap_err(),
        failure(2..12, ast::ErrorKind::DecimalInvalid));
    assert_eq!(
        parser(r"a{9").parse().unwrap_err(),
        failure(1..3, ast::ErrorKind::RepetitionCountUnclosed));
    assert_eq!(
        parser(r"a{9,a").parse().unwrap_err(),
        failure(4..4, ast::ErrorKind::DecimalEmpty));
    assert_eq!(
        parser(r"a{9,9999999999}").parse().unwrap_err(),
        failure(4..14, ast::ErrorKind::DecimalInvalid));
    assert_eq!(
        parser(r"a{9,").parse().unwrap_err(),
        failure(1..4, ast::ErrorKind::RepetitionCountUnclosed));
    assert_eq!(
        parser(r"a{9,11").parse().unwrap_err(),
        failure(1..6, ast::ErrorKind::RepetitionCountUnclosed));
    assert_eq!(
        parser(r"a{2,1}").parse().unwrap_err(),
        failure(1..6, ast::ErrorKind::RepetitionCountInvalid));

    // A counted repetition with nothing to repeat.
    assert_eq!(
        parser(r"{5}").parse().unwrap_err(),
        failure(0..0, ast::ErrorKind::RepetitionMissing));
    assert_eq!(
        parser(r"|{5}").parse().unwrap_err(),
        failure(1..1, ast::ErrorKind::RepetitionMissing));
}
3187 | ||
/// Checks parsing of alternations: at the top level, inside (nested)
/// groups, with empty branches, and with unbalanced parentheses.
#[test]
fn parse_alternate() {
    // An alternation node spelled out directly rather than via `alt`.
    let alternation = |range: Range<usize>, asts: Vec<Ast>| {
        Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
    };
    // An empty expression located at offset `at`.
    let empty = |at: usize| Ast::Empty(span(at..at));
    // Two adjacent literals starting at offset `at`, concatenated.
    let pair = |first: char, second: char, at: usize| {
        concat(at..at + 2, vec![lit(first, at), lit(second, at + 1)])
    };

    assert_eq!(
        parser(r"a|b").parse(),
        Ok(alternation(0..3, vec![lit('a', 0), lit('b', 2)])));
    assert_eq!(
        parser(r"(a|b)").parse(),
        Ok(group(0..5, 1, alternation(1..4, vec![
            lit('a', 1),
            lit('b', 3),
        ]))));

    assert_eq!(
        parser(r"a|b|c").parse(),
        Ok(alternation(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])));
    assert_eq!(
        parser(r"ax|by|cz").parse(),
        Ok(alternation(0..8, vec![
            pair('a', 'x', 0),
            pair('b', 'y', 3),
            pair('c', 'z', 6),
        ])));
    assert_eq!(
        parser(r"(ax|by|cz)").parse(),
        Ok(group(0..10, 1, alternation(1..9, vec![
            pair('a', 'x', 1),
            pair('b', 'y', 4),
            pair('c', 'z', 7),
        ]))));
    // Each nested group takes the next capture index.
    let innermost = group(8..12, 3, pair('c', 'z', 9));
    let middle = group(4..13, 2, alt(5..12, vec![
        pair('b', 'y', 5),
        innermost,
    ]));
    assert_eq!(
        parser(r"(ax|(by|(cz)))").parse(),
        Ok(group(0..14, 1, alt(1..13, vec![pair('a', 'x', 1), middle]))));

    // Empty branches at the top level.
    assert_eq!(
        parser(r"|").parse(),
        Ok(alt(0..1, vec![empty(0), empty(1)])));
    assert_eq!(
        parser(r"||").parse(),
        Ok(alt(0..2, vec![empty(0), empty(1), empty(2)])));
    assert_eq!(
        parser(r"a|").parse(),
        Ok(alt(0..2, vec![lit('a', 0), empty(2)])));
    assert_eq!(
        parser(r"|a").parse(),
        Ok(alt(0..2, vec![empty(0), lit('a', 1)])));

    // Empty branches inside a group.
    assert_eq!(
        parser(r"(|)").parse(),
        Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)]))));
    assert_eq!(
        parser(r"(a|)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)]))));
    assert_eq!(
        parser(r"(|a)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)]))));

    // Unbalanced parentheses around an alternation.
    assert_eq!(
        parser(r"a|b)").parse().unwrap_err(),
        TestError {
            span: span(3..4),
            kind: ast::ErrorKind::GroupUnopened,
        });
    assert_eq!(
        parser(r"(a|b").parse().unwrap_err(),
        TestError {
            span: span(0..1),
            kind: ast::ErrorKind::GroupUnclosed,
        });
}
3287 | ||
/// Checks that every look-around opener is rejected with
/// `UnsupportedLookAround`, with the error span covering just the opener.
#[test]
fn parse_unsupported_lookaround() {
    // Each pattern is paired with the end offset of its look-around opener;
    // the error span always starts at 0.
    let cases = [
        (r"(?=a)", 3),
        (r"(?!a)", 3),
        (r"(?<=a)", 4),
        (r"(?<!a)", 4),
    ];
    for &(pat, end) in cases.iter() {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(0..end),
                kind: ast::ErrorKind::UnsupportedLookAround,
            });
    }
}
3315 | ||
3316 | #[test] | |
3317 | fn parse_group() { | |
3318 | assert_eq!(parser("(?i)").parse(), Ok(Ast::Flags(ast::SetFlags { | |
3319 | span: span(0..4), | |
3320 | flags: ast::Flags { | |
3321 | span: span(2..3), | |
3322 | items: vec![ast::FlagsItem { | |
3323 | span: span(2..3), | |
3324 | kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), | |
3325 | }], | |
3326 | }, | |
3327 | }))); | |
3328 | assert_eq!(parser("(?iU)").parse(), Ok(Ast::Flags(ast::SetFlags { | |
3329 | span: span(0..5), | |
3330 | flags: ast::Flags { | |
3331 | span: span(2..4), | |
3332 | items: vec![ | |
3333 | ast::FlagsItem { | |
3334 | span: span(2..3), | |
3335 | kind: ast::FlagsItemKind::Flag( | |
3336 | ast::Flag::CaseInsensitive), | |
3337 | }, | |
3338 | ast::FlagsItem { | |
3339 | span: span(3..4), | |
3340 | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), | |
3341 | }, | |
3342 | ], | |
3343 | }, | |
3344 | }))); | |
3345 | assert_eq!(parser("(?i-U)").parse(), Ok(Ast::Flags(ast::SetFlags { | |
3346 | span: span(0..6), | |
3347 | flags: ast::Flags { | |
3348 | span: span(2..5), | |
3349 | items: vec![ | |
3350 | ast::FlagsItem { | |
3351 | span: span(2..3), | |
3352 | kind: ast::FlagsItemKind::Flag( | |
3353 | ast::Flag::CaseInsensitive), | |
3354 | }, | |
3355 | ast::FlagsItem { | |
3356 | span: span(3..4), | |
3357 | kind: ast::FlagsItemKind::Negation, | |
3358 | }, | |
3359 | ast::FlagsItem { | |
3360 | span: span(4..5), | |
3361 | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), | |
3362 | }, | |
3363 | ], | |
3364 | }, | |
3365 | }))); | |
3366 | ||
3367 | assert_eq!(parser("()").parse(), Ok(Ast::Group(ast::Group { | |
3368 | span: span(0..2), | |
3369 | kind: ast::GroupKind::CaptureIndex(1), | |
3370 | ast: Box::new(Ast::Empty(span(1..1))), | |
3371 | }))); | |
3372 | assert_eq!(parser("(a)").parse(), Ok(Ast::Group(ast::Group { | |
3373 | span: span(0..3), | |
3374 | kind: ast::GroupKind::CaptureIndex(1), | |
3375 | ast: Box::new(lit('a', 1)), | |
3376 | }))); | |
3377 | assert_eq!(parser("(())").parse(), Ok(Ast::Group(ast::Group { | |
3378 | span: span(0..4), | |
3379 | kind: ast::GroupKind::CaptureIndex(1), | |
3380 | ast: Box::new(Ast::Group(ast::Group { | |
3381 | span: span(1..3), | |
3382 | kind: ast::GroupKind::CaptureIndex(2), | |
3383 | ast: Box::new(Ast::Empty(span(2..2))), | |
3384 | })), | |
3385 | }))); | |
3386 | ||
3387 | assert_eq!(parser("(?:a)").parse(), Ok(Ast::Group(ast::Group { | |
3388 | span: span(0..5), | |
3389 | kind: ast::GroupKind::NonCapturing(ast::Flags { | |
3390 | span: span(2..2), | |
3391 | items: vec![], | |
3392 | }), | |
3393 | ast: Box::new(lit('a', 3)), | |
3394 | }))); | |
3395 | ||
3396 | assert_eq!(parser("(?i:a)").parse(), Ok(Ast::Group(ast::Group { | |
3397 | span: span(0..6), | |
3398 | kind: ast::GroupKind::NonCapturing(ast::Flags { | |
3399 | span: span(2..3), | |
3400 | items: vec![ | |
3401 | ast::FlagsItem { | |
3402 | span: span(2..3), | |
3403 | kind: ast::FlagsItemKind::Flag( | |
3404 | ast::Flag::CaseInsensitive), | |
3405 | }, | |
3406 | ], | |
3407 | }), | |
3408 | ast: Box::new(lit('a', 4)), | |
3409 | }))); | |
3410 | assert_eq!(parser("(?i-U:a)").parse(), Ok(Ast::Group(ast::Group { | |
3411 | span: span(0..8), | |
3412 | kind: ast::GroupKind::NonCapturing(ast::Flags { | |
3413 | span: span(2..5), | |
3414 | items: vec![ | |
3415 | ast::FlagsItem { | |
3416 | span: span(2..3), | |
3417 | kind: ast::FlagsItemKind::Flag( | |
3418 | ast::Flag::CaseInsensitive), | |
3419 | }, | |
3420 | ast::FlagsItem { | |
3421 | span: span(3..4), | |
3422 | kind: ast::FlagsItemKind::Negation, | |
3423 | }, | |
3424 | ast::FlagsItem { | |
3425 | span: span(4..5), | |
3426 | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), | |
3427 | }, | |
3428 | ], | |
3429 | }), | |
3430 | ast: Box::new(lit('a', 6)), | |
3431 | }))); | |
3432 | ||
3433 | assert_eq!( | |
3434 | parser("(").parse().unwrap_err(), | |
3435 | TestError { | |
3436 | span: span(0..1), | |
3437 | kind: ast::ErrorKind::GroupUnclosed, | |
3438 | }); | |
3439 | assert_eq!( | |
3440 | parser("(?").parse().unwrap_err(), | |
3441 | TestError { | |
3442 | span: span(0..1), | |
3443 | kind: ast::ErrorKind::GroupUnclosed, | |
3444 | }); | |
3445 | assert_eq!( | |
3446 | parser("(?P").parse().unwrap_err(), | |
3447 | TestError { | |
3448 | span: span(2..3), | |
3449 | kind: ast::ErrorKind::FlagUnrecognized, | |
3450 | }); | |
3451 | assert_eq!( | |
3452 | parser("(?P<").parse().unwrap_err(), | |
3453 | TestError { | |
3454 | span: span(4..4), | |
3455 | kind: ast::ErrorKind::GroupNameUnexpectedEof, | |
3456 | }); | |
3457 | assert_eq!( | |
3458 | parser("(a").parse().unwrap_err(), | |
3459 | TestError { | |
3460 | span: span(0..1), | |
3461 | kind: ast::ErrorKind::GroupUnclosed, | |
3462 | }); | |
3463 | assert_eq!( | |
3464 | parser("(()").parse().unwrap_err(), | |
3465 | TestError { | |
3466 | span: span(0..1), | |
3467 | kind: ast::ErrorKind::GroupUnclosed, | |
3468 | }); | |
3469 | assert_eq!( | |
3470 | parser(")").parse().unwrap_err(), | |
3471 | TestError { | |
3472 | span: span(0..1), | |
3473 | kind: ast::ErrorKind::GroupUnopened, | |
3474 | }); | |
3475 | assert_eq!( | |
3476 | parser("a)").parse().unwrap_err(), | |
3477 | TestError { | |
3478 | span: span(1..2), | |
3479 | kind: ast::ErrorKind::GroupUnopened, | |
3480 | }); | |
3481 | } | |
3482 | ||
/// Exercises `(?P<name>...)` groups: two well-formed names first, then a
/// series of truncated, invalid or duplicated names together with the error
/// span and kind expected for each.
#[test]
fn parse_capture_name() {
    // Expected AST for a named group with capture index 1 wrapping `inner`.
    fn named_group(
        whole: Span,
        name_at: Span,
        name: &'static str,
        inner: Ast,
    ) -> Ast {
        Ast::Group(ast::Group {
            span: whole,
            kind: ast::GroupKind::CaptureName(ast::CaptureName {
                span: name_at,
                name: s(name),
                index: 1,
            }),
            ast: Box::new(inner),
        })
    }

    assert_eq!(
        parser("(?P<a>z)").parse(),
        Ok(named_group(span(0..8), span(4..5), "a", lit('z', 6))));
    assert_eq!(
        parser("(?P<abc>z)").parse(),
        Ok(named_group(span(0..10), span(4..7), "abc", lit('z', 8))));

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        ("(?P<", span(4..4), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", span(4..4), ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", span(5..5), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", span(6..6), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", span(7..8), ast::ErrorKind::GroupNameInvalid),
        // The duplicate error points at the second name and records where
        // the first one was seen.
        (
            "(?P<a>y)(?P<a>z)",
            span(12..13),
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
3555 | ||
/// Exercises `parse_flags` on flag sequences terminated by `:` or `)`,
/// including negation, followed by the inputs that must be rejected.
#[test]
fn parse_flags() {
    // A single flag item located at `at`.
    fn on(at: Span, flag: ast::Flag) -> ast::FlagsItem {
        ast::FlagsItem {
            span: at,
            kind: ast::FlagsItemKind::Flag(flag),
        }
    }

    // The `-` negation marker located at `at`.
    fn negation(at: Span) -> ast::FlagsItem {
        ast::FlagsItem {
            span: at,
            kind: ast::FlagsItemKind::Negation,
        }
    }

    // A complete flag set covering `at`.
    fn flags(at: Span, items: Vec<ast::FlagsItem>) -> ast::Flags {
        ast::Flags {
            span: at,
            items: items,
        }
    }

    let accepted = vec![
        (
            "i:",
            flags(span(0..1), vec![
                on(span(0..1), ast::Flag::CaseInsensitive),
            ]),
        ),
        (
            "i)",
            flags(span(0..1), vec![
                on(span(0..1), ast::Flag::CaseInsensitive),
            ]),
        ),
        (
            "isU:",
            flags(span(0..3), vec![
                on(span(0..1), ast::Flag::CaseInsensitive),
                on(span(1..2), ast::Flag::DotMatchesNewLine),
                on(span(2..3), ast::Flag::SwapGreed),
            ]),
        ),
        (
            "-isU:",
            flags(span(0..4), vec![
                negation(span(0..1)),
                on(span(1..2), ast::Flag::CaseInsensitive),
                on(span(2..3), ast::Flag::DotMatchesNewLine),
                on(span(3..4), ast::Flag::SwapGreed),
            ]),
        ),
        (
            "i-sU:",
            flags(span(0..4), vec![
                on(span(0..1), ast::Flag::CaseInsensitive),
                negation(span(1..2)),
                on(span(2..3), ast::Flag::DotMatchesNewLine),
                on(span(3..4), ast::Flag::SwapGreed),
            ]),
        ),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse_flags(), Ok(expected));
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        ("isU", span(3..3), ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", span(3..4), ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            span(3..4),
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            span(4..5),
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", span(0..1), ast::ErrorKind::FlagDanglingNegation),
        ("i-)", span(1..2), ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", span(2..3), ast::ErrorKind::FlagDanglingNegation),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_flags().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
3684 | ||
/// Checks that each supported flag letter parses to its `ast::Flag` variant
/// and that other characters are reported as unrecognized flags.
#[test]
fn parse_flag() {
    let known = vec![
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (letter, flag) in known {
        assert_eq!(parser(letter).parse_flag(), Ok(flag));
    }

    // The second case is a multi-byte character, so its span is built from
    // an explicit 0..3 range over the pattern.
    let unknown = vec![
        ("a", span(0..1)),
        ("☃", span_range("☃", 0..3)),
    ];
    for (letter, at) in unknown {
        assert_eq!(
            parser(letter).parse_flag().unwrap_err(),
            TestError {
                span: at,
                kind: ast::ErrorKind::FlagUnrecognized,
            });
    }
}
3707 | ||
/// Checks `parse_primitive` on single, unescaped characters: the dot, the
/// line anchors, and characters that are taken verbatim as literals.
#[test]
fn parse_primitive_non_escape() {
    // A one-character assertion at the start of the pattern.
    fn anchor(kind: ast::AssertionKind) -> Primitive {
        Primitive::Assertion(ast::Assertion {
            span: span(0..1),
            kind: kind,
        })
    }

    // A verbatim literal `c` covering `at`.
    fn verbatim(at: Span, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c: c,
        })
    }

    let cases = vec![
        (r".", Primitive::Dot(span(0..1))),
        (r"^", anchor(ast::AssertionKind::StartLine)),
        (r"$", anchor(ast::AssertionKind::EndLine)),
        (r"a", verbatim(span(0..1), 'a')),
        (r"|", verbatim(span(0..1), '|')),
        (r"☃", verbatim(span_range("☃", 0..3), '☃')),
    ];
    for (pattern, expected) in cases {
        assert_eq!(parser(pattern).parse_primitive(), Ok(expected));
    }
}
3748 | ||
/// Checks two-character escapes: escaped punctuation, the special literals
/// (`\a`, `\f`, `\t`, `\n`, `\r`, `\v`), the escape-style assertions, and
/// the errors for a dangling backslash and an unknown escape.
#[test]
fn parse_escape() {
    // Every successful case here is exactly two characters long.
    fn literal(kind: ast::LiteralKind, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: span(0..2),
            kind: kind,
            c: c,
        })
    }

    fn assertion(kind: ast::AssertionKind) -> Primitive {
        Primitive::Assertion(ast::Assertion {
            span: span(0..2),
            kind: kind,
        })
    }

    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(literal(ast::LiteralKind::Punctuation, '|')));

    let specials = vec![
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pattern, c, kind) in specials {
        assert_eq!(
            parser(pattern).parse_primitive(),
            Ok(literal(ast::LiteralKind::Special(kind), c)));
    }

    let assertions = vec![
        (r"\A", ast::AssertionKind::StartText),
        (r"\z", ast::AssertionKind::EndText),
        (r"\b", ast::AssertionKind::WordBoundary),
        (r"\B", ast::AssertionKind::NotWordBoundary),
    ];
    for (pattern, kind) in assertions {
        assert_eq!(parser(pattern).parse_primitive(), Ok(assertion(kind)));
    }

    // These two go through `parse_escape` directly rather than
    // `parse_primitive`.
    let rejected = vec![
        (r"\", span(0..1), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\y", span(0..2), ast::ErrorKind::EscapeUnrecognized),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
3813 | ||
/// With the default test parser, a backslash followed by a decimal digit is
/// expected to be reported as an unsupported backreference spanning the
/// whole two-character escape.
#[test]
fn parse_unsupported_backreference() {
    for &pattern in [r"\0", r"\9"].iter() {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::UnsupportedBackreference,
            });
    }
}
3829 | ||
/// Exercises octal escapes using the `parser_octal` test parser.
///
/// Covers every value expressible in at most three octal digits, checks that
/// a fourth digit or a non-octal digit ends the escape and is left for the
/// next literal, and checks that `\8` is rejected.
#[test]
fn parse_octal() {
    // An octal literal `c` covering `at`.
    fn octal(at: Span, c: char) -> ast::Literal {
        ast::Literal {
            span: at,
            kind: ast::LiteralKind::Octal,
            c: c,
        }
    }

    // A verbatim literal `c` covering `at`.
    fn verbatim(at: Span, c: char) -> ast::Literal {
        ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c: c,
        }
    }

    // 0o777 == 511 is the largest three-digit octal value. The range's upper
    // bound is exclusive, so it must be 512 for `\777` itself to be checked.
    for code in 0..512 {
        let pattern = format!(r"\{:o}", code);
        assert_eq!(
            parser_octal(&pattern).parse_escape(),
            Ok(Primitive::Literal(octal(
                span(0..pattern.len()),
                ::std::char::from_u32(code).unwrap(),
            ))));
    }

    // `parse_escape` stops at the first character that cannot extend the
    // escape: `8` is not an octal digit, and a fourth digit is one too many.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(octal(span(0..3), '?'))));
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(octal(span(0..4), '\u{01FF}'))));

    // A full parse turns the leftover character into a verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::Literal(octal(span(0..3), '?')),
                Ast::Literal(verbatim(span(3..4), '8')),
            ],
        })));
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::Literal(octal(span(0..4), '\u{01FF}')),
                Ast::Literal(verbatim(span(4..5), '7')),
            ],
        })));

    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        });
}
3898 | ||
/// Checks the fixed two-digit `\xNN` escape for every value 0x00..=0xFF,
/// then the errors for a truncated escape and for non-hex digits.
#[test]
fn parse_hex_two() {
    for code in 0..256 {
        let pattern = format!(r"\x{:02x}", code);
        let expected = Primitive::Literal(ast::Literal {
            span: span(0..pattern.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: ::std::char::from_u32(code).unwrap(),
        });
        assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        (r"\xF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\xG", span(2..3), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\xFG", span(3..4), ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
3931 | ||
/// Checks the fixed four-digit `\uNNNN` escape for every scalar value below
/// 0x10000, then the errors for truncated escapes, non-hex digits and a
/// value that is not a valid `char`.
#[test]
fn parse_hex_four() {
    for code in 0..65536 {
        // Values that `char::from_u32` rejects have no expected literal, so
        // they are skipped here (`\uD800` is checked as an error below).
        if let Some(c) = ::std::char::from_u32(code) {
            let pattern = format!(r"\u{:04x}", code);
            let expected = Primitive::Literal(ast::Literal {
                span: span(0..pattern.len()),
                kind: ast::LiteralKind::HexFixed(
                    ast::HexLiteralKind::UnicodeShort),
                c: c,
            });
            assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
        }
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        (r"\uF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\uG", span(2..3), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFG", span(3..4), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFG", span(4..5), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFFG", span(5..6), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uD800", span(2..6), ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
3987 | ||
/// Checks the fixed eight-digit `\UNNNNNNNN` escape for every scalar value
/// below 0x10000, then the errors for a truncated escape and for a non-hex
/// digit at each of the eight digit positions.
#[test]
fn parse_hex_eight() {
    for code in 0..65536 {
        // Values that `char::from_u32` rejects have no expected literal.
        if let Some(c) = ::std::char::from_u32(code) {
            let pattern = format!(r"\U{:08x}", code);
            let expected = Primitive::Literal(ast::Literal {
                span: span(0..pattern.len()),
                kind: ast::LiteralKind::HexFixed(
                    ast::HexLiteralKind::UnicodeLong),
                c: c,
            });
            assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
        }
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        (r"\UF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\UG", span(2..3), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFG", span(3..4), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFG", span(4..5), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFG", span(5..6), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFG", span(6..7), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFG", span(7..8), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFFG", span(8..9), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFFFG", span(9..10), ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
4061 | ||
/// Checks the brace forms `\x{...}`, `\u{...}` and `\U{...}`: accepted
/// values in lower, upper and mixed case, then truncated, empty, malformed
/// and out-of-range escapes.
#[test]
fn parse_hex_brace() {
    // A brace-delimited hex literal of flavour `hex` spanning 0..`len`.
    fn braced(len: usize, hex: ast::HexLiteralKind, c: char) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: span(0..len),
            kind: ast::LiteralKind::HexBrace(hex),
            c: c,
        })
    }

    let accepted = vec![
        (r"\u{26c4}", braced(8, ast::HexLiteralKind::UnicodeShort, '⛄')),
        (r"\U{26c4}", braced(8, ast::HexLiteralKind::UnicodeLong, '⛄')),
        (r"\x{26c4}", braced(8, ast::HexLiteralKind::X, '⛄')),
        (r"\x{26C4}", braced(8, ast::HexLiteralKind::X, '⛄')),
        (r"\x{10fFfF}", braced(10, ast::HexLiteralKind::X, '\u{10FFFF}')),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse_escape(), Ok(expected));
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        (r"\x", span(2..2), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{", span(2..3), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{FF", span(2..5), ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{}", span(2..4), ast::ErrorKind::EscapeHexEmpty),
        (r"\x{FGF}", span(4..5), ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\x{FFFFFF}", span(3..9), ast::ErrorKind::EscapeHexInvalid),
        (r"\x{D800}", span(3..7), ast::ErrorKind::EscapeHexInvalid),
        (r"\x{FFFFFFFFF}", span(3..12), ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
4151 | ||
/// Checks `parse_decimal` on plain digit strings (including a leading zero)
/// and on inputs that are empty, start with a non-digit, or are too large.
#[test]
fn parse_decimal() {
    for &(digits, value) in &[("123", 123), ("0", 0), ("01", 1)] {
        assert_eq!(parser(digits).parse_decimal(), Ok(value));
    }

    // (pattern, span of the reported error, kind of the reported error)
    let rejected = vec![
        // No digits are consumed before the `-`, so the error is empty-span.
        ("-1", span(0..0), ast::ErrorKind::DecimalEmpty),
        ("", span(0..0), ast::ErrorKind::DecimalEmpty),
        // Presumably exceeds the parser's integer type; the error covers
        // all ten digits.
        ("9999999999", span(0..10), ast::ErrorKind::DecimalInvalid),
    ];
    for (pattern, at, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_decimal().unwrap_err(),
            TestError {
                span: at,
                kind: kind,
            });
    }
}
4177 | ||
4178 | #[test] | |
4179 | fn parse_set_class() { | |
4180 | fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { | |
4181 | ast::ClassSet::union(ast::ClassSetUnion { | |
4182 | span: span, | |
4183 | items: items, | |
4184 | }) | |
4185 | } | |
4186 | ||
4187 | fn intersection( | |
4188 | span: Span, | |
4189 | lhs: ast::ClassSet, | |
4190 | rhs: ast::ClassSet, | |
4191 | ) -> ast::ClassSet { | |
4192 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { | |
4193 | span: span, | |
4194 | kind: ast::ClassSetBinaryOpKind::Intersection, | |
4195 | lhs: Box::new(lhs), | |
4196 | rhs: Box::new(rhs), | |
4197 | }) | |
4198 | } | |
4199 | ||
4200 | fn difference( | |
4201 | span: Span, | |
4202 | lhs: ast::ClassSet, | |
4203 | rhs: ast::ClassSet, | |
4204 | ) -> ast::ClassSet { | |
4205 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { | |
4206 | span: span, | |
4207 | kind: ast::ClassSetBinaryOpKind::Difference, | |
4208 | lhs: Box::new(lhs), | |
4209 | rhs: Box::new(rhs), | |
4210 | }) | |
4211 | } | |
4212 | ||
4213 | fn symdifference( | |
4214 | span: Span, | |
4215 | lhs: ast::ClassSet, | |
4216 | rhs: ast::ClassSet, | |
4217 | ) -> ast::ClassSet { | |
4218 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { | |
4219 | span: span, | |
4220 | kind: ast::ClassSetBinaryOpKind::SymmetricDifference, | |
4221 | lhs: Box::new(lhs), | |
4222 | rhs: Box::new(rhs), | |
4223 | }) | |
4224 | } | |
4225 | ||
4226 | fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { | |
4227 | ast::ClassSet::Item(item) | |
4228 | } | |
4229 | ||
4230 | fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { | |
4231 | ast::ClassSetItem::Ascii(cls) | |
4232 | } | |
4233 | ||
4234 | fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { | |
4235 | ast::ClassSetItem::Unicode(cls) | |
4236 | } | |
4237 | ||
4238 | fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { | |
4239 | ast::ClassSetItem::Perl(cls) | |
4240 | } | |
4241 | ||
4242 | fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { | |
4243 | ast::ClassSetItem::Bracketed(Box::new(cls)) | |
4244 | } | |
4245 | ||
4246 | fn lit(span: Span, c: char) -> ast::ClassSetItem { | |
4247 | ast::ClassSetItem::Literal(ast::Literal { | |
4248 | span: span, | |
4249 | kind: ast::LiteralKind::Verbatim, | |
4250 | c: c, | |
4251 | }) | |
4252 | } | |
4253 | ||
4254 | fn empty(span: Span) -> ast::ClassSetItem { | |
4255 | ast::ClassSetItem::Empty(span) | |
4256 | } | |
4257 | ||
4258 | fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { | |
4259 | let pos1 = Position { | |
4260 | offset: span.start.offset + start.len_utf8(), | |
4261 | column: span.start.column + 1, | |
4262 | ..span.start | |
4263 | }; | |
4264 | let pos2 = Position { | |
4265 | offset: span.end.offset - end.len_utf8(), | |
4266 | column: span.end.column - 1, | |
4267 | ..span.end | |
4268 | }; | |
4269 | ast::ClassSetItem::Range(ast::ClassSetRange { | |
4270 | span: span, | |
4271 | start: ast::Literal { | |
4272 | span: Span { end: pos1, ..span }, | |
4273 | kind: ast::LiteralKind::Verbatim, | |
4274 | c: start, | |
4275 | }, | |
4276 | end: ast::Literal { | |
4277 | span: Span { start: pos2, ..span }, | |
4278 | kind: ast::LiteralKind::Verbatim, | |
4279 | c: end, | |
4280 | }, | |
4281 | }) | |
4282 | } | |
4283 | ||
4284 | fn alnum(span: Span, negated: bool) -> ast::ClassAscii { | |
4285 | ast::ClassAscii { | |
4286 | span: span, | |
4287 | kind: ast::ClassAsciiKind::Alnum, | |
4288 | negated: negated, | |
4289 | } | |
4290 | } | |
4291 | ||
4292 | fn lower(span: Span, negated: bool) -> ast::ClassAscii { | |
4293 | ast::ClassAscii { | |
4294 | span: span, | |
4295 | kind: ast::ClassAsciiKind::Lower, | |
4296 | negated: negated, | |
4297 | } | |
4298 | } | |
4299 | ||
4300 | assert_eq!( | |
4301 | parser("[[:alnum:]]").parse(), | |
4302 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4303 | span: span(0..11), | |
4304 | negated: false, | |
4305 | kind: itemset(item_ascii(alnum(span(1..10), false))), | |
4306 | })))); | |
4307 | assert_eq!( | |
4308 | parser("[[[:alnum:]]]").parse(), | |
4309 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4310 | span: span(0..13), | |
4311 | negated: false, | |
4312 | kind: itemset(item_bracket(ast::ClassBracketed { | |
4313 | span: span(1..12), | |
4314 | negated: false, | |
4315 | kind: itemset(item_ascii(alnum(span(2..11), false))), | |
4316 | })), | |
4317 | })))); | |
4318 | assert_eq!( | |
4319 | parser("[[:alnum:]&&[:lower:]]").parse(), | |
4320 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4321 | span: span(0..22), | |
4322 | negated: false, | |
4323 | kind: intersection( | |
4324 | span(1..21), | |
4325 | itemset(item_ascii(alnum(span(1..10), false))), | |
4326 | itemset(item_ascii(lower(span(12..21), false))), | |
4327 | ), | |
4328 | })))); | |
4329 | assert_eq!( | |
4330 | parser("[[:alnum:]--[:lower:]]").parse(), | |
4331 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4332 | span: span(0..22), | |
4333 | negated: false, | |
4334 | kind: difference( | |
4335 | span(1..21), | |
4336 | itemset(item_ascii(alnum(span(1..10), false))), | |
4337 | itemset(item_ascii(lower(span(12..21), false))), | |
4338 | ), | |
4339 | })))); | |
4340 | assert_eq!( | |
4341 | parser("[[:alnum:]~~[:lower:]]").parse(), | |
4342 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4343 | span: span(0..22), | |
4344 | negated: false, | |
4345 | kind: symdifference( | |
4346 | span(1..21), | |
4347 | itemset(item_ascii(alnum(span(1..10), false))), | |
4348 | itemset(item_ascii(lower(span(12..21), false))), | |
4349 | ), | |
4350 | })))); | |
4351 | ||
4352 | assert_eq!( | |
4353 | parser("[a]").parse(), | |
4354 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4355 | span: span(0..3), | |
4356 | negated: false, | |
4357 | kind: itemset(lit(span(1..2), 'a')), | |
4358 | })))); | |
4359 | assert_eq!( | |
4360 | parser(r"[a\]]").parse(), | |
4361 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4362 | span: span(0..5), | |
4363 | negated: false, | |
4364 | kind: union(span(1..4), vec![ | |
4365 | lit(span(1..2), 'a'), | |
4366 | ast::ClassSetItem::Literal(ast::Literal { | |
4367 | span: span(2..4), | |
4368 | kind: ast::LiteralKind::Punctuation, | |
4369 | c: ']', | |
4370 | }), | |
4371 | ]), | |
4372 | })))); | |
4373 | assert_eq!( | |
4374 | parser(r"[a\-z]").parse(), | |
4375 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4376 | span: span(0..6), | |
4377 | negated: false, | |
4378 | kind: union(span(1..5), vec![ | |
4379 | lit(span(1..2), 'a'), | |
4380 | ast::ClassSetItem::Literal(ast::Literal { | |
4381 | span: span(2..4), | |
4382 | kind: ast::LiteralKind::Punctuation, | |
4383 | c: '-', | |
4384 | }), | |
4385 | lit(span(4..5), 'z'), | |
4386 | ]), | |
4387 | })))); | |
4388 | assert_eq!( | |
4389 | parser("[ab]").parse(), | |
4390 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4391 | span: span(0..4), | |
4392 | negated: false, | |
4393 | kind: union(span(1..3), vec![ | |
4394 | lit(span(1..2), 'a'), | |
4395 | lit(span(2..3), 'b'), | |
4396 | ]), | |
4397 | })))); | |
4398 | assert_eq!( | |
4399 | parser("[a-]").parse(), | |
4400 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4401 | span: span(0..4), | |
4402 | negated: false, | |
4403 | kind: union(span(1..3), vec![ | |
4404 | lit(span(1..2), 'a'), | |
4405 | lit(span(2..3), '-'), | |
4406 | ]), | |
4407 | })))); | |
4408 | assert_eq!( | |
4409 | parser("[-a]").parse(), | |
4410 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4411 | span: span(0..4), | |
4412 | negated: false, | |
4413 | kind: union(span(1..3), vec![ | |
4414 | lit(span(1..2), '-'), | |
4415 | lit(span(2..3), 'a'), | |
4416 | ]), | |
4417 | })))); | |
4418 | assert_eq!( | |
4419 | parser(r"[\pL]").parse(), | |
4420 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4421 | span: span(0..5), | |
4422 | negated: false, | |
4423 | kind: itemset(item_unicode(ast::ClassUnicode { | |
4424 | span: span(1..4), | |
4425 | negated: false, | |
4426 | kind: ast::ClassUnicodeKind::OneLetter('L'), | |
4427 | })), | |
4428 | })))); | |
4429 | assert_eq!( | |
4430 | parser(r"[\w]").parse(), | |
4431 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4432 | span: span(0..4), | |
4433 | negated: false, | |
4434 | kind: itemset(item_perl(ast::ClassPerl { | |
4435 | span: span(1..3), | |
4436 | kind: ast::ClassPerlKind::Word, | |
4437 | negated: false, | |
4438 | })), | |
4439 | })))); | |
4440 | assert_eq!( | |
4441 | parser(r"[a\wz]").parse(), | |
4442 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4443 | span: span(0..6), | |
4444 | negated: false, | |
4445 | kind: union(span(1..5), vec![ | |
4446 | lit(span(1..2), 'a'), | |
4447 | item_perl(ast::ClassPerl { | |
4448 | span: span(2..4), | |
4449 | kind: ast::ClassPerlKind::Word, | |
4450 | negated: false, | |
4451 | }), | |
4452 | lit(span(4..5), 'z'), | |
4453 | ]), | |
4454 | })))); | |
4455 | ||
4456 | assert_eq!( | |
4457 | parser("[a-z]").parse(), | |
4458 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4459 | span: span(0..5), | |
4460 | negated: false, | |
4461 | kind: itemset(range(span(1..4), 'a', 'z')), | |
4462 | })))); | |
4463 | assert_eq!( | |
4464 | parser("[a-cx-z]").parse(), | |
4465 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4466 | span: span(0..8), | |
4467 | negated: false, | |
4468 | kind: union(span(1..7), vec![ | |
4469 | range(span(1..4), 'a', 'c'), | |
4470 | range(span(4..7), 'x', 'z'), | |
4471 | ]), | |
4472 | })))); | |
4473 | assert_eq!( | |
4474 | parser(r"[\w&&a-cx-z]").parse(), | |
4475 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4476 | span: span(0..12), | |
4477 | negated: false, | |
4478 | kind: intersection( | |
4479 | span(1..11), | |
4480 | itemset(item_perl(ast::ClassPerl { | |
4481 | span: span(1..3), | |
4482 | kind: ast::ClassPerlKind::Word, | |
4483 | negated: false, | |
4484 | })), | |
4485 | union(span(5..11), vec![ | |
4486 | range(span(5..8), 'a', 'c'), | |
4487 | range(span(8..11), 'x', 'z'), | |
4488 | ]), | |
4489 | ), | |
4490 | })))); | |
4491 | assert_eq!( | |
4492 | parser(r"[a-cx-z&&\w]").parse(), | |
4493 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4494 | span: span(0..12), | |
4495 | negated: false, | |
4496 | kind: intersection( | |
4497 | span(1..11), | |
4498 | union(span(1..7), vec![ | |
4499 | range(span(1..4), 'a', 'c'), | |
4500 | range(span(4..7), 'x', 'z'), | |
4501 | ]), | |
4502 | itemset(item_perl(ast::ClassPerl { | |
4503 | span: span(9..11), | |
4504 | kind: ast::ClassPerlKind::Word, | |
4505 | negated: false, | |
4506 | })), | |
4507 | ), | |
4508 | })))); | |
4509 | assert_eq!( | |
4510 | parser(r"[a--b--c]").parse(), | |
4511 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4512 | span: span(0..9), | |
4513 | negated: false, | |
4514 | kind: difference( | |
4515 | span(1..8), | |
4516 | difference( | |
4517 | span(1..5), | |
4518 | itemset(lit(span(1..2), 'a')), | |
4519 | itemset(lit(span(4..5), 'b')), | |
4520 | ), | |
4521 | itemset(lit(span(7..8), 'c')), | |
4522 | ), | |
4523 | })))); | |
4524 | assert_eq!( | |
4525 | parser(r"[a~~b~~c]").parse(), | |
4526 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4527 | span: span(0..9), | |
4528 | negated: false, | |
4529 | kind: symdifference( | |
4530 | span(1..8), | |
4531 | symdifference( | |
4532 | span(1..5), | |
4533 | itemset(lit(span(1..2), 'a')), | |
4534 | itemset(lit(span(4..5), 'b')), | |
4535 | ), | |
4536 | itemset(lit(span(7..8), 'c')), | |
4537 | ), | |
4538 | })))); | |
4539 | assert_eq!( | |
4540 | parser(r"[\^&&^]").parse(), | |
4541 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4542 | span: span(0..7), | |
4543 | negated: false, | |
4544 | kind: intersection( | |
4545 | span(1..6), | |
4546 | itemset(ast::ClassSetItem::Literal(ast::Literal { | |
4547 | span: span(1..3), | |
4548 | kind: ast::LiteralKind::Punctuation, | |
4549 | c: '^', | |
4550 | })), | |
4551 | itemset(lit(span(5..6), '^')), | |
4552 | ), | |
4553 | })))); | |
4554 | assert_eq!( | |
4555 | parser(r"[\&&&&]").parse(), | |
4556 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4557 | span: span(0..7), | |
4558 | negated: false, | |
4559 | kind: intersection( | |
4560 | span(1..6), | |
4561 | itemset(ast::ClassSetItem::Literal(ast::Literal { | |
4562 | span: span(1..3), | |
4563 | kind: ast::LiteralKind::Punctuation, | |
4564 | c: '&', | |
4565 | })), | |
4566 | itemset(lit(span(5..6), '&')), | |
4567 | ), | |
4568 | })))); | |
4569 | assert_eq!( | |
4570 | parser(r"[&&&&]").parse(), | |
4571 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4572 | span: span(0..6), | |
4573 | negated: false, | |
4574 | kind: intersection( | |
4575 | span(1..5), | |
4576 | intersection( | |
4577 | span(1..3), | |
4578 | itemset(empty(span(1..1))), | |
4579 | itemset(empty(span(3..3))), | |
4580 | ), | |
4581 | itemset(empty(span(5..5))), | |
4582 | ), | |
4583 | })))); | |
4584 | ||
4585 | let pat = "[☃-⛄]"; | |
4586 | assert_eq!( | |
4587 | parser(pat).parse(), | |
4588 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4589 | span: span_range(pat, 0..9), | |
4590 | negated: false, | |
4591 | kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { | |
4592 | span: span_range(pat, 1..8), | |
4593 | start: ast::Literal { | |
4594 | span: span_range(pat, 1..4), | |
4595 | kind: ast::LiteralKind::Verbatim, | |
4596 | c: '☃', | |
4597 | }, | |
4598 | end: ast::Literal { | |
4599 | span: span_range(pat, 5..8), | |
4600 | kind: ast::LiteralKind::Verbatim, | |
4601 | c: '⛄', | |
4602 | }, | |
4603 | })), | |
4604 | })))); | |
4605 | ||
4606 | assert_eq!( | |
4607 | parser(r"[]]").parse(), | |
4608 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4609 | span: span(0..3), | |
4610 | negated: false, | |
4611 | kind: itemset(lit(span(1..2), ']')), | |
4612 | })))); | |
4613 | assert_eq!( | |
4614 | parser(r"[]\[]").parse(), | |
4615 | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4616 | span: span(0..5), | |
4617 | negated: false, | |
4618 | kind: union(span(1..4), vec![ | |
4619 | lit(span(1..2), ']'), | |
4620 | ast::ClassSetItem::Literal(ast::Literal { | |
4621 | span: span(2..4), | |
4622 | kind: ast::LiteralKind::Punctuation, | |
4623 | c: '[', | |
4624 | }), | |
4625 | ]), | |
4626 | })))); | |
4627 | assert_eq!( | |
4628 | parser(r"[\[]]").parse(), | |
4629 | Ok(concat(0..5, vec![ | |
4630 | Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { | |
4631 | span: span(0..4), | |
4632 | negated: false, | |
4633 | kind: itemset(ast::ClassSetItem::Literal(ast::Literal { | |
4634 | span: span(1..3), | |
4635 | kind: ast::LiteralKind::Punctuation, | |
4636 | c: '[', | |
4637 | })), | |
4638 | })), | |
4639 | Ast::Literal(ast::Literal { | |
4640 | span: span(4..5), | |
4641 | kind: ast::LiteralKind::Verbatim, | |
4642 | c: ']', | |
4643 | }), | |
4644 | ]))); | |
4645 | ||
4646 | assert_eq!( | |
4647 | parser("[").parse().unwrap_err(), | |
4648 | TestError { | |
4649 | span: span(0..1), | |
4650 | kind: ast::ErrorKind::ClassUnclosed, | |
4651 | }); | |
4652 | assert_eq!( | |
4653 | parser("[[").parse().unwrap_err(), | |
4654 | TestError { | |
4655 | span: span(1..2), | |
4656 | kind: ast::ErrorKind::ClassUnclosed, | |
4657 | }); | |
4658 | assert_eq!( | |
4659 | parser("[[-]").parse().unwrap_err(), | |
4660 | TestError { | |
4661 | span: span(0..1), | |
4662 | kind: ast::ErrorKind::ClassUnclosed, | |
4663 | }); | |
4664 | assert_eq!( | |
4665 | parser("[[[:alnum:]").parse().unwrap_err(), | |
4666 | TestError { | |
4667 | span: span(1..2), | |
4668 | kind: ast::ErrorKind::ClassUnclosed, | |
4669 | }); | |
4670 | assert_eq!( | |
4671 | parser(r"[\b]").parse().unwrap_err(), | |
4672 | TestError { | |
4673 | span: span(1..3), | |
4674 | kind: ast::ErrorKind::ClassEscapeInvalid, | |
4675 | }); | |
4676 | assert_eq!( | |
4677 | parser(r"[\w-a]").parse().unwrap_err(), | |
4678 | TestError { | |
4679 | span: span(1..3), | |
4680 | kind: ast::ErrorKind::ClassEscapeInvalid, | |
4681 | }); | |
4682 | assert_eq!( | |
4683 | parser(r"[a-\w]").parse().unwrap_err(), | |
4684 | TestError { | |
4685 | span: span(3..5), | |
4686 | kind: ast::ErrorKind::ClassEscapeInvalid, | |
4687 | }); | |
4688 | assert_eq!( | |
4689 | parser(r"[z-a]").parse().unwrap_err(), | |
4690 | TestError { | |
4691 | span: span(1..4), | |
4692 | kind: ast::ErrorKind::ClassRangeInvalid, | |
4693 | }); | |
4694 | ||
4695 | assert_eq!( | |
4696 | parser_ignore_whitespace("[a ").parse().unwrap_err(), | |
4697 | TestError { | |
4698 | span: span(0..1), | |
4699 | kind: ast::ErrorKind::ClassUnclosed, | |
4700 | }); | |
4701 | assert_eq!( | |
4702 | parser_ignore_whitespace("[a- ").parse().unwrap_err(), | |
4703 | TestError { | |
4704 | span: span(0..1), | |
4705 | kind: ast::ErrorKind::ClassUnclosed, | |
4706 | }); | |
4707 | } | |
4708 | ||
// Checks what `parse_set_class_open` returns right after the opening `[` of a
// bracketed class: a `ClassBracketed` whose `kind` is an empty union, paired
// with the union of items collected so far. The expectations below show a
// leading `^` setting `negated`, and leading `-` characters or a leading `]`
// arriving as verbatim literals in the returned union. A pattern that ends
// before the class can contain anything else must fail with `ClassUnclosed`.
#[test]
fn parse_set_class_open() {
    // Expected bracketed class: spans `0..end` and carries an empty
    // placeholder union positioned at `items_at`.
    let opened = |end: usize, negated: bool, items_at: usize| {
        ast::ClassBracketed {
            span: span(0..end),
            negated: negated,
            kind: ast::ClassSet::union(ast::ClassSetUnion {
                span: span(items_at..items_at),
                items: vec![],
            }),
        }
    };
    // Expected verbatim literal for a one-byte character found at `start`.
    let verbatim = |start: usize, c: char| {
        ast::ClassSetItem::Literal(ast::Literal {
            span: span(start..start + 1),
            kind: ast::LiteralKind::Verbatim,
            c: c,
        })
    };
    // Expected union of the items gathered so far, covering `start..end`.
    let collected =
        |start: usize, end: usize, items: Vec<ast::ClassSetItem>| {
            ast::ClassSetUnion { span: span(start..end), items: items }
        };
    // Expected error when the pattern stops at offset `end`.
    let unclosed = |end: usize| TestError {
        span: span(0..end),
        kind: ast::ErrorKind::ClassUnclosed,
    };

    assert_eq!(
        parser("[a]").parse_set_class_open(),
        Ok((opened(1, false, 1), collected(1, 1, vec![]))));
    // The input needs exactly three spaces after `[` so that `a` sits at
    // offset 4; the expected spans depend on that count.
    assert_eq!(
        parser_ignore_whitespace("[   a]").parse_set_class_open(),
        Ok((opened(4, false, 4), collected(4, 4, vec![]))));
    assert_eq!(
        parser("[^a]").parse_set_class_open(),
        Ok((opened(2, true, 2), collected(2, 2, vec![]))));
    assert_eq!(
        parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
        Ok((opened(4, true, 4), collected(4, 4, vec![]))));

    // Leading dashes are expected as literal `-` items.
    assert_eq!(
        parser("[-a]").parse_set_class_open(),
        Ok((
            opened(2, false, 1),
            collected(1, 2, vec![verbatim(1, '-')]),
        )));
    assert_eq!(
        parser_ignore_whitespace("[ - a]").parse_set_class_open(),
        Ok((
            opened(4, false, 2),
            collected(2, 3, vec![verbatim(2, '-')]),
        )));
    assert_eq!(
        parser("[^-a]").parse_set_class_open(),
        Ok((
            opened(3, true, 2),
            collected(2, 3, vec![verbatim(2, '-')]),
        )));
    assert_eq!(
        parser("[--a]").parse_set_class_open(),
        Ok((
            opened(3, false, 1),
            collected(1, 3, vec![verbatim(1, '-'), verbatim(2, '-')]),
        )));

    // A `]` in first position is expected as a literal `]` item.
    assert_eq!(
        parser("[]a]").parse_set_class_open(),
        Ok((
            opened(2, false, 1),
            collected(1, 2, vec![verbatim(1, ']')]),
        )));
    assert_eq!(
        parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
        Ok((
            opened(4, false, 2),
            collected(2, 3, vec![verbatim(2, ']')]),
        )));
    assert_eq!(
        parser("[^]a]").parse_set_class_open(),
        Ok((
            opened(3, true, 2),
            collected(2, 3, vec![verbatim(2, ']')]),
        )));
    // After a leading `-`, the `]` is not collected by the open step.
    assert_eq!(
        parser("[-]a]").parse_set_class_open(),
        Ok((
            opened(2, false, 1),
            collected(1, 2, vec![verbatim(1, '-')]),
        )));

    assert_eq!(parser("[").parse_set_class_open().unwrap_err(), unclosed(1));
    // `[` followed by four spaces: the expected error span runs to offset 5,
    // i.e. the end of this five-byte pattern.
    assert_eq!(
        parser_ignore_whitespace("[    ")
            .parse_set_class_open()
            .unwrap_err(),
        unclosed(5));
    assert_eq!(parser("[^").parse_set_class_open().unwrap_err(), unclosed(2));
    assert_eq!(parser("[]").parse_set_class_open().unwrap_err(), unclosed(2));
    assert_eq!(parser("[-").parse_set_class_open().unwrap_err(), unclosed(2));
    assert_eq!(
        parser("[--").parse_set_class_open().unwrap_err(),
        unclosed(3));
}
4996 | ||
// Verifies `maybe_parse_ascii_class` on `[:alnum:]`-style input: recognized
// forms yield a `ClassAscii` with the expected span and negation, while
// malformed forms yield `None` and leave the parser offset at 0.
#[test]
fn maybe_parse_ascii_class() {
    let recognized = vec![
        (r"[:alnum:]", 0..9, false),
        // Input following the class is not part of the expected span.
        (r"[:alnum:]A", 0..9, false),
        (r"[:^alnum:]", 0..10, true),
    ];
    for (pattern, range, negated) in recognized {
        assert_eq!(
            parser(pattern).maybe_parse_ascii_class(),
            Some(ast::ClassAscii {
                span: span(range),
                kind: ast::ClassAsciiKind::Alnum,
                negated: negated,
            }));
    }

    // Truncated, misspelled or mis-delimited classes.
    let rejected = [
        r"[:",
        r"[:^",
        r"[^:alnum:]",
        r"[:alnnum:]",
        r"[:alnum]",
        r"[:alnum:",
    ];
    for &pattern in rejected.iter() {
        let p = parser(pattern);
        assert_eq!(p.maybe_parse_ascii_class(), None);
        assert_eq!(p.offset(), 0);
    }
}
5045 | ||
// Covers `\p`/`\P` Unicode class escapes: one-letter, named and
// `name<op>value` forms through `parse_escape`, escapes cut short by the end
// of the pattern, and a class followed by a literal through the full parser.
#[test]
fn parse_unicode_class() {
    // Expected primitive for a class escape covering `0..end`.
    let unicode = |end: usize, negated: bool, kind: ast::ClassUnicodeKind| {
        Primitive::Unicode(ast::ClassUnicode {
            span: span(0..end),
            negated: negated,
            kind: kind,
        })
    };
    // Expected kind for the `name<op>value` forms.
    let named_value = |op: ast::ClassUnicodeOpKind, name: &str, value: &str| {
        ast::ClassUnicodeKind::NamedValue {
            op: op,
            name: s(name),
            value: s(value),
        }
    };

    let escapes = vec![
        (r"\pN", unicode(3, false, ast::ClassUnicodeKind::OneLetter('N'))),
        (r"\PN", unicode(3, true, ast::ClassUnicodeKind::OneLetter('N'))),
        (r"\p{N}", unicode(5, false, ast::ClassUnicodeKind::Named(s("N")))),
        (r"\P{N}", unicode(5, true, ast::ClassUnicodeKind::Named(s("N")))),
        (
            r"\p{Greek}",
            unicode(9, false, ast::ClassUnicodeKind::Named(s("Greek"))),
        ),
        (
            r"\p{scx:Katakana}",
            unicode(
                16,
                false,
                named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
            ),
        ),
        (
            r"\p{scx=Katakana}",
            unicode(
                16,
                false,
                named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
            ),
        ),
        (
            r"\p{scx!=Katakana}",
            unicode(
                17,
                false,
                named_value(
                    ast::ClassUnicodeOpKind::NotEqual,
                    "scx",
                    "Katakana",
                ),
            ),
        ),
        // An operator with empty name and value is still expected to parse.
        (
            r"\p{:}",
            unicode(5, false, named_value(ast::ClassUnicodeOpKind::Colon, "", "")),
        ),
        (
            r"\p{=}",
            unicode(5, false, named_value(ast::ClassUnicodeOpKind::Equal, "", "")),
        ),
        (
            r"\p{!=}",
            unicode(
                6,
                false,
                named_value(ast::ClassUnicodeOpKind::NotEqual, "", ""),
            ),
        ),
    ];
    for (pattern, expected) in escapes {
        assert_eq!(parser(pattern).parse_escape(), Ok(expected));
    }

    // Escapes that stop early: the expected error is an empty span at the
    // given offset.
    let truncated = vec![
        (r"\p", 2),
        (r"\p{", 3),
        (r"\p{N", 4),
        (r"\p{Greek", 8),
    ];
    for (pattern, at) in truncated {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::EscapeUnexpectedEof,
            });
    }

    // A class immediately followed by `z` is expected as a two-element
    // concatenation: the class, then the verbatim literal.
    let followed_by_z = vec![
        (r"\pNz", 3, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\p{Greek}z", 9, ast::ClassUnicodeKind::Named(s("Greek"))),
    ];
    for (pattern, class_end, kind) in followed_by_z {
        assert_eq!(
            parser(pattern).parse(),
            Ok(Ast::Concat(ast::Concat {
                span: span(0..class_end + 1),
                asts: vec![
                    Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
                        span: span(0..class_end),
                        negated: false,
                        kind: kind,
                    })),
                    Ast::Literal(ast::Literal {
                        span: span(class_end..class_end + 1),
                        kind: ast::LiteralKind::Verbatim,
                        c: 'z',
                    }),
                ],
            })));
    }
}
5212 | ||
// Covers the Perl class escapes `\d`, `\s`, `\w` and their upper-case negated
// counterparts through `parse_escape`, then `\d` alone and `\d` followed by a
// literal through the full parser.
#[test]
fn parse_perl_class() {
    let escapes = vec![
        (r"\d", ast::ClassPerlKind::Digit, false),
        (r"\D", ast::ClassPerlKind::Digit, true),
        (r"\s", ast::ClassPerlKind::Space, false),
        (r"\S", ast::ClassPerlKind::Space, true),
        (r"\w", ast::ClassPerlKind::Word, false),
        (r"\W", ast::ClassPerlKind::Word, true),
    ];
    for (pattern, kind, negated) in escapes {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Perl(ast::ClassPerl {
                span: span(0..2),
                kind: kind,
                negated: negated,
            })));
    }

    // Expected AST node for `\d` at the start of a pattern.
    let digit = || {
        Ast::Class(ast::Class::Perl(ast::ClassPerl {
            span: span(0..2),
            kind: ast::ClassPerlKind::Digit,
            negated: false,
        }))
    };

    assert_eq!(parser(r"\d").parse(), Ok(digit()));

    let trailing_z = Ast::Literal(ast::Literal {
        span: span(2..3),
        kind: ast::LiteralKind::Verbatim,
        c: 'z',
    });
    assert_eq!(
        parser(r"\dz").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..3),
            asts: vec![digit(), trailing_z],
        })));
}
5283 | ||
// Regression test: the nest limit checker used to skip decrementing its depth
// during post-traversal, so a long pattern like the one below could trip the
// limit even though it never nests deeply. A nest limit of 50 must be enough
// to parse it.
#[test]
fn regression_454_nest_too_big() {
    let long_alternation = r#"
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}
"#;
    let outcome = parser_nest_limit(long_alternation, 50).parse();
    assert!(outcome.is_ok());
}
5326 | ||
// Regression test: with whitespace mode switched on via `(?x)`, a trailing
// `-` in a character class must still be read as a literal `-`, even when
// whitespace (or a comment) separates it from the closing `]`. The same
// openings without a closing `]` must still be rejected.
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    let accepted = [
        "(?x)[ / - ]",
        "(?x)[ a - ]",
        "(?x)[
a
- ]
",
        "(?x)[
a # wat
- ]
",
    ];
    for &pattern in accepted.iter() {
        assert!(parser(pattern).parse().is_ok());
    }

    let rejected = [
        "(?x)[ / -",
        "(?x)[ / - ",
        "(?x)[
/ -
",
        "(?x)[
/ - # wat
",
    ];
    for &pattern in rejected.iter() {
        assert!(parser(pattern).parse().is_err());
    }
}
5352 | } |