src/vendor/regex-syntax-0.3.9/src/lib.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 /*!
  12 This crate provides a regular expression parser and an abstract syntax for
  13 regular expressions. The abstract syntax is defined by the `Expr` type. The
  14 concrete syntax is enumerated in the
  15 [`regex`](../regex/index.html#syntax)
  16 crate documentation.
  17
  18 Note that since this crate is first and foremost an implementation detail for
  19 the `regex` crate, it may experience more frequent breaking changes. It is
  20 exposed as a separate crate so that others may use it to do analysis on regular
  21 expressions or even build their own matching engine.
  22
  23 # Example: parsing an expression
  24
  25 Parsing a regular expression can be done with the `Expr::parse` function.
  26
  27 ```rust
  28 use regex_syntax::Expr;
  29
  30 assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
  31     Expr::Literal { chars: vec!['a', 'b'], casei: false },
  32     Expr::Literal { chars: vec!['y', 'z'], casei: false },
  33 ]));
  34 ```
  35
  36 # Example: inspecting an error
  37
  38 The parser in this crate provides very detailed error values. For example,
  39 if an invalid character class range is given:
  40
  41 ```rust
  42 use regex_syntax::{Expr, ErrorKind};
  43
  44 let err = Expr::parse(r"[z-a]").unwrap_err();
  45 assert_eq!(err.position(), 4);
  46 assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
  47     start: 'z',
  48     end: 'a',
  49 });
  50 ```
  51
  52 Or unbalanced parentheses:
  53
  54 ```rust
  55 use regex_syntax::{Expr, ErrorKind};
  56
  57 let err = Expr::parse(r"ab(cd").unwrap_err();
  58 assert_eq!(err.position(), 2);
  59 assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
  60 ```
  61 */
  62
  63 #![deny(missing_docs)]
  64 #![cfg_attr(test, deny(warnings))]
  65
  66 #[cfg(test)] extern crate quickcheck;
  67 #[cfg(test)] extern crate rand;
  68
  69 mod literals;
  70 mod parser;
  71 mod unicode;
  72
  73 use std::ascii;
  74 use std::char;
  75 use std::cmp::{Ordering, max, min};
  76 use std::fmt;
  77 use std::iter::IntoIterator;
  78 use std::ops::Deref;
  79 use std::result;
  80 use std::slice;
  81 use std::u8;
  82 use std::vec;
  83
  84 use unicode::case_folding;
  85
  86 use self::Expr::*;
  87 use self::Repeater::*;
  88
  89 use parser::{Flags, Parser};
  90
  91 pub use literals::{Literals, Lit};
  92
  93 /// A regular expression abstract syntax tree.
  94 ///
  95 /// An `Expr` represents the abstract syntax of a regular expression.
  96 #[derive(Clone, Debug, PartialEq, Eq)]
  97 pub enum Expr {
  98     /// An empty regex (which never matches any text).
  99     Empty,
 100     /// A sequence of one or more literal characters to be matched.
 101     Literal {
 102         /// The characters.
 103         chars: Vec<char>,
 104         /// Whether to match case insensitively.
 105         casei: bool,
 106     },
 107     /// A sequence of one or more literal bytes to be matched.
 108     LiteralBytes {
 109         /// The bytes.
 110         bytes: Vec<u8>,
 111         /// Whether to match case insensitively.
 112         ///
 113         /// The interpretation of "case insensitive" in this context is
 114         /// ambiguous since `bytes` can be arbitrary. However, a good heuristic
 115         /// is to assume that the bytes are ASCII-compatible and do simple
 116         /// ASCII case folding.
 117         casei: bool,
 118     },
 119     /// Match any character.
 120     AnyChar,
 121     /// Match any character, excluding new line (`0xA`).
 122     AnyCharNoNL,
 123     /// Match any byte.
 124     AnyByte,
 125     /// Match any byte, excluding new line (`0xA`).
 126     AnyByteNoNL,
 127     /// A character class.
 128     Class(CharClass),
 129     /// A character class with byte ranges only.
 130     ClassBytes(ByteClass),
 131     /// Match the start of a line or beginning of input.
 132     StartLine,
 133     /// Match the end of a line or end of input.
 134     EndLine,
 135     /// Match the beginning of input.
 136     StartText,
 137     /// Match the end of input.
 138     EndText,
 139     /// Match a word boundary (word character on one side and a non-word
 140     /// character on the other).
 141     WordBoundary,
 142     /// Match a position that is not a word boundary (word or non-word
 143     /// characters on both sides).
 144     NotWordBoundary,
 145     /// Match an ASCII word boundary.
 146     WordBoundaryAscii,
 147     /// Match a position that is not an ASCII word boundary.
 148     NotWordBoundaryAscii,
 149     /// A group, possibly non-capturing.
 150     Group {
 151         /// The expression inside the group.
 152         e: Box<Expr>,
 153         /// The capture index (starting at `1`) only for capturing groups.
 154         i: Option<usize>,
 155         /// The capture name, only for capturing named groups.
 156         name: Option<String>,
 157     },
 158     /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
 159     Repeat {
 160         /// The expression to be repeated. Limited to literals, `.`, classes
 161         /// or grouped expressions.
 162         e: Box<Expr>,
 163         /// The type of repeat operator used.
 164         r: Repeater,
 165         /// Whether the repeat is greedy (match the most) or not (match the
 166         /// least).
 167         greedy: bool,
 168     },
 169     /// A concatenation of expressions. Must be matched one after the other.
 170     ///
 171     /// N.B. A concat expression can only appear at the top-level or
 172     /// immediately inside a group expression.
 173     Concat(Vec<Expr>),
 174     /// An alternation of expressions. Only one must match.
 175     ///
 176     /// N.B. An alternate expression can only appear at the top-level or
 177     /// immediately inside a group expression.
 178     Alternate(Vec<Expr>),
 179 }
 180
 181 type CaptureIndex = Option<usize>;
 182
 183 type CaptureName = Option<String>;
 184
 185 /// The type of a repeat operator expression.
 186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 187 pub enum Repeater {
 188     /// Match zero or one (`?`).
 189     ZeroOrOne,
 190     /// Match zero or more (`*`).
 191     ZeroOrMore,
 192     /// Match one or more (`+`).
 193     OneOrMore,
 194     /// Match for at least `min` and at most `max` (`{m,n}`).
 195     ///
 196     /// When `max` is `None`, there is no upper bound on the number of matches.
 197     Range {
 198         /// Lower bound on the number of matches.
 199         min: u32,
 200         /// Optional upper bound on the number of matches.
 201         max: Option<u32>,
 202     },
 203 }
 204
 205 impl Repeater {
 206     /// Returns true if and only if this repetition can match the empty string.
 207     fn matches_empty(&self) -> bool {
 208         use self::Repeater::*;
 209         match *self {
 210             ZeroOrOne => true,
 211             ZeroOrMore => true,
 212             OneOrMore => false,
 213             Range { min, .. } => min == 0,
 214         }
 215     }
 216 }
 217
 218 /// A character class.
 219 ///
 220 /// A character class has a canonical format that the parser guarantees. Its
 221 /// canonical format is defined by the following invariants:
 222 ///
 223 /// 1. Given any Unicode scalar value, it is matched by *at most* one character
 224 ///    range in a canonical character class.
 225 /// 2. Every adjacent character range is separated by at least one Unicode
 226 ///    scalar value.
 227 /// 3. Given any pair of character ranges `r1` and `r2`, if
 228 ///    `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
 229 ///    character class.
 230 ///
 231 /// In sum, any `CharClass` produced by this crate's parser is a sorted
 232 /// sequence of non-overlapping ranges. This makes it possible to test whether
 233 /// a character is matched by a class with a binary search.
 234 ///
 235 /// If the case insensitive flag was set when parsing a character class, then
 236 /// simple case folding is done automatically. For example, `(?i)[a-c]` is
 237 /// automatically translated to `[a-cA-C]`.
 238 #[derive(Clone, Debug, PartialEq, Eq)]
 239 pub struct CharClass {
 240     ranges: Vec<ClassRange>,
 241 }
 242
 243 /// A single inclusive range in a character class.
 244 ///
 245 /// Since range boundaries are defined by Unicode scalar values, the boundaries
 246 /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
 247 /// *cover* codepoints that are not scalar values.
 248 ///
 249 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
 250 /// for testing whether a character is contained inside a given range.
 251 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 252 pub struct ClassRange {
 253     /// The start character of the range.
 254     ///
 255     /// This must be less than or equal to `end`.
 256     pub start: char,
 257
 258     /// The end character of the range.
 259     ///
 260     /// This must be greater than or equal to `start`.
 261     pub end: char,
 262 }
 263
 264 /// A byte class for byte ranges only.
 265 ///
 266 /// A byte class has a canonical format that the parser guarantees. Its
 267 /// canonical format is defined by the following invariants:
 268 ///
 269 /// 1. Given any byte, it is matched by *at most* one byte range in a canonical
 270 ///    character class.
 271 /// 2. Every adjacent byte range is separated by at least one byte.
 272 /// 3. Given any pair of byte ranges `r1` and `r2`, if
 273 ///    `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
 274 ///    character class.
 275 ///
 276 /// In sum, any `ByteClass` produced by this crate's parser is a sorted
 277 /// sequence of non-overlapping ranges. This makes it possible to test whether
 278 /// a byte is matched by a class with a binary search.
 279 ///
 280 /// If the case insensitive flag was set when parsing a character class,
 281 /// then simple ASCII-only case folding is done automatically. For example,
 282 /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`.
 283 #[derive(Clone, Debug, PartialEq, Eq)]
 284 pub struct ByteClass {
 285     ranges: Vec<ByteRange>,
 286 }
 287
 288 /// A single inclusive range in a byte class.
 289 ///
 290 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
 291 /// for testing whether a byte is contained inside a given range.
 292 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 293 pub struct ByteRange {
 294     /// The start byte of the range.
 295     ///
 296     /// This must be less than or equal to `end`.
 297     pub start: u8,
 298
 299     /// The end byte of the range.
 300     ///
 301     /// This must be greater than or equal to `end`.
 302     pub end: u8,
 303 }
 304
 305 /// A builder for configuring regular expression parsing.
 306 ///
 307 /// This allows setting the default values of flags and other options, such
 308 /// as the maximum nesting depth.
 309 #[derive(Clone, Debug)]
 310 pub struct ExprBuilder {
 311     flags: Flags,
 312     nest_limit: usize,
 313 }
 314
 315 impl ExprBuilder {
 316     /// Create a new builder for configuring expression parsing.
 317     ///
 318     /// Note that all flags are disabled by default.
 319     pub fn new() -> ExprBuilder {
 320         ExprBuilder {
 321             flags: Flags::default(),
 322             nest_limit: 200,
 323         }
 324     }
 325
 326     /// Set the default value for the case insensitive (`i`) flag.
 327     pub fn case_insensitive(mut self, yes: bool) -> ExprBuilder {
 328         self.flags.casei = yes;
 329         self
 330     }
 331
 332     /// Set the default value for the multi-line matching (`m`) flag.
 333     pub fn multi_line(mut self, yes: bool) -> ExprBuilder {
 334         self.flags.multi = yes;
 335         self
 336     }
 337
 338     /// Set the default value for the any character (`s`) flag.
 339     pub fn dot_matches_new_line(mut self, yes: bool) -> ExprBuilder {
 340         self.flags.dotnl = yes;
 341         self
 342     }
 343
 344     /// Set the default value for the greedy swap (`U`) flag.
 345     pub fn swap_greed(mut self, yes: bool) -> ExprBuilder {
 346         self.flags.swap_greed = yes;
 347         self
 348     }
 349
 350     /// Set the default value for the ignore whitespace (`x`) flag.
 351     pub fn ignore_whitespace(mut self, yes: bool) -> ExprBuilder {
 352         self.flags.ignore_space = yes;
 353         self
 354     }
 355
 356     /// Set the default value for the Unicode (`u`) flag.
 357     ///
 358     /// If `yes` is false, then `allow_bytes` is set to true.
 359     pub fn unicode(mut self, yes: bool) -> ExprBuilder {
 360         self.flags.unicode = yes;
 361         if !yes {
 362             self.allow_bytes(true)
 363         } else {
 364             self
 365         }
 366     }
 367
 368     /// Whether the parser allows matching arbitrary bytes or not.
 369     ///
 370     /// When the `u` flag is disabled (either with this builder or in the
 371     /// expression itself), the parser switches to interpreting the expression
 372     /// as matching arbitrary bytes instead of Unicode codepoints. For example,
 373     /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which
 374     /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely,
 375     /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8.
 376     ///
 377     /// When `allow_bytes` is disabled (the default), an expression like
 378     /// `(?-u:\xFF)` will cause the parser to return an error, since it would
 379     /// otherwise match invalid UTF-8. When enabled, it will be allowed.
 380     pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder {
 381         self.flags.allow_bytes = yes;
 382         self
 383     }
 384
 385     /// Set the nesting limit for regular expression parsing.
 386     ///
 387     /// Regular expressions that nest more than this limit will result in a
 388     /// `StackExhausted` error.
 389     pub fn nest_limit(mut self, limit: usize) -> ExprBuilder {
 390         self.nest_limit = limit;
 391         self
 392     }
 393
 394     /// Parse a string as a regular expression using the current configuraiton.
 395     pub fn parse(self, s: &str) -> Result<Expr> {
 396         Parser::parse(s, self.flags).and_then(|e| e.simplify(self.nest_limit))
 397     }
 398 }
 399
 400 impl Expr {
 401     /// Parses a string in a regular expression syntax tree.
 402     ///
 403     /// This is a convenience method for parsing an expression using the
 404     /// default configuration. To tweak parsing options (such as which flags
 405     /// are enabled by default), use the `ExprBuilder` type.
 406     pub fn parse(s: &str) -> Result<Expr> {
 407         ExprBuilder::new().parse(s)
 408     }
 409
 410     /// Returns true iff the expression can be repeated by a quantifier.
 411     fn can_repeat(&self) -> bool {
 412         match *self {
 413             Literal{..} | LiteralBytes{..}
 414             | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL
 415             | Class(_) | ClassBytes(_)
 416             | StartLine | EndLine | StartText | EndText
 417             | WordBoundary | NotWordBoundary
 418             | WordBoundaryAscii | NotWordBoundaryAscii
 419             | Group{..}
 420             => true,
 421             _ => false,
 422         }
 423     }
 424
 425     fn simplify(self, nest_limit: usize) -> Result<Expr> {
 426         fn combine_literals(es: &mut Vec<Expr>, e: Expr) {
 427             match (es.pop(), e) {
 428                 (None, e) => es.push(e),
 429                 (Some(Literal { chars: mut chars1, casei: casei1 }),
 430                       Literal { chars: chars2, casei: casei2 }) => {
 431                     if casei1 == casei2 {
 432                         chars1.extend(chars2);
 433                         es.push(Literal { chars: chars1, casei: casei1 });
 434                     } else {
 435                         es.push(Literal { chars: chars1, casei: casei1 });
 436                         es.push(Literal { chars: chars2, casei: casei2 });
 437                     }
 438                 }
 439                 (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }),
 440                       LiteralBytes { bytes: bytes2, casei: casei2 }) => {
 441                     if casei1 == casei2 {
 442                         bytes1.extend(bytes2);
 443                         es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
 444                     } else {
 445                         es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
 446                         es.push(LiteralBytes { bytes: bytes2, casei: casei2 });
 447                     }
 448                 }
 449                 (Some(e1), e2) => {
 450                     es.push(e1);
 451                     es.push(e2);
 452                 }
 453             }
 454         }
 455         fn simp(expr: Expr, recurse: usize, limit: usize) -> Result<Expr> {
 456             if recurse > limit {
 457                 return Err(Error {
 458                     pos: 0,
 459                     surround: "".to_owned(),
 460                     kind: ErrorKind::StackExhausted,
 461                 });
 462             }
 463             let simplify = |e| simp(e, recurse + 1, limit);
 464             Ok(match expr {
 465                 Repeat { e, r, greedy } => Repeat {
 466                     e: Box::new(try!(simplify(*e))),
 467                     r: r,
 468                     greedy: greedy,
 469                 },
 470                 Group { e, i, name } => {
 471                     let e = try!(simplify(*e));
 472                     if i.is_none() && name.is_none() && e.can_repeat() {
 473                         e
 474                     } else {
 475                         Group { e: Box::new(e), i: i, name: name }
 476                     }
 477                 }
 478                 Concat(es) => {
 479                     let mut new_es = Vec::with_capacity(es.len());
 480                     for e in es {
 481                         combine_literals(&mut new_es, try!(simplify(e)));
 482                     }
 483                     if new_es.len() == 1 {
 484                         new_es.pop().unwrap()
 485                     } else {
 486                         Concat(new_es)
 487                     }
 488                 }
 489                 Alternate(es) => {
 490                     let mut new_es = Vec::with_capacity(es.len());
 491                     for e in es {
 492                         new_es.push(try!(simplify(e)));
 493                     }
 494                     Alternate(new_es)
 495                 }
 496                 e => e,
 497             })
 498         }
 499         simp(self, 0, nest_limit)
 500     }
 501
 502     /// Returns a set of literal prefixes extracted from this expression.
 503     pub fn prefixes(&self) -> Literals {
 504         let mut lits = Literals::empty();
 505         lits.union_prefixes(self);
 506         lits
 507     }
 508
 509     /// Returns a set of literal suffixes extracted from this expression.
 510     pub fn suffixes(&self) -> Literals {
 511         let mut lits = Literals::empty();
 512         lits.union_suffixes(self);
 513         lits
 514     }
 515
 516     /// Returns true if and only if the expression is required to match from
 517     /// the beginning of text.
 518     pub fn is_anchored_start(&self) -> bool {
 519         match *self {
 520             Repeat { ref e, r, .. } => {
 521                 !r.matches_empty() && e.is_anchored_start()
 522             }
 523             Group { ref e, .. } => e.is_anchored_start(),
 524             Concat(ref es) => es[0].is_anchored_start(),
 525             Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
 526             StartText => true,
 527             _ => false,
 528         }
 529     }
 530
 531     /// Returns true if and only if the expression has at least one matchable
 532     /// sub-expression that must match the beginning of text.
 533     pub fn has_anchored_start(&self) -> bool {
 534         match *self {
 535             Repeat { ref e, r, .. } => {
 536                 !r.matches_empty() && e.has_anchored_start()
 537             }
 538             Group { ref e, .. } => e.has_anchored_start(),
 539             Concat(ref es) => es[0].has_anchored_start(),
 540             Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
 541             StartText => true,
 542             _ => false,
 543         }
 544     }
 545
 546     /// Returns true if and only if the expression is required to match at the
 547     /// end of the text.
 548     pub fn is_anchored_end(&self) -> bool {
 549         match *self {
 550             Repeat { ref e, r, .. } => {
 551                 !r.matches_empty() && e.is_anchored_end()
 552             }
 553             Group { ref e, .. } => e.is_anchored_end(),
 554             Concat(ref es) => es[es.len() - 1].is_anchored_end(),
 555             Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
 556             EndText => true,
 557             _ => false,
 558         }
 559     }
 560
 561     /// Returns true if and only if the expression has at least one matchable
 562     /// sub-expression that must match the beginning of text.
 563     pub fn has_anchored_end(&self) -> bool {
 564         match *self {
 565             Repeat { ref e, r, .. } => {
 566                 !r.matches_empty() && e.has_anchored_end()
 567             }
 568             Group { ref e, .. } => e.has_anchored_end(),
 569             Concat(ref es) => es[es.len() - 1].has_anchored_end(),
 570             Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
 571             EndText => true,
 572             _ => false,
 573         }
 574     }
 575
 576     /// Returns true if and only if the expression contains sub-expressions
 577     /// that can match arbitrary bytes.
 578     pub fn has_bytes(&self) -> bool {
 579         match *self {
 580             Repeat { ref e, .. } => e.has_bytes(),
 581             Group { ref e, .. } => e.has_bytes(),
 582             Concat(ref es) => es.iter().any(|e| e.has_bytes()),
 583             Alternate(ref es) => es.iter().any(|e| e.has_bytes()),
 584             LiteralBytes{..} => true,
 585             AnyByte | AnyByteNoNL => true,
 586             ClassBytes(_) => true,
 587             WordBoundaryAscii | NotWordBoundaryAscii => true,
 588             _ => false,
 589         }
 590     }
 591 }
 592
 593 impl Deref for CharClass {
 594     type Target = Vec<ClassRange>;
 595     fn deref(&self) -> &Vec<ClassRange> { &self.ranges }
 596 }
 597
 598 impl IntoIterator for CharClass {
 599     type Item = ClassRange;
 600     type IntoIter = vec::IntoIter<ClassRange>;
 601     fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() }
 602 }
 603
 604 impl<'a> IntoIterator for &'a CharClass {
 605     type Item = &'a ClassRange;
 606     type IntoIter = slice::Iter<'a, ClassRange>;
 607     fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() }
 608 }
 609
 610 impl CharClass {
 611     /// Create a new class from an existing set of ranges.
 612     pub fn new(ranges: Vec<ClassRange>) -> CharClass {
 613         CharClass { ranges: ranges }
 614     }
 615
 616     /// Create an empty class.
 617     fn empty() -> CharClass {
 618         CharClass::new(Vec::new())
 619     }
 620
 621     /// Returns true if `c` is matched by this character class.
 622     pub fn matches(&self, c: char) -> bool {
 623         self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok()
 624     }
 625
 626     /// Removes the given character from the class if it exists.
 627     ///
 628     /// Note that this takes `O(n)` time in the number of ranges.
 629     pub fn remove(&mut self, c: char) {
 630         let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) {
 631             Ok(i) => i,
 632             Err(_) => return,
 633         };
 634         let mut r = self.ranges.remove(i);
 635         if r.start == c {
 636             r.start = inc_char(c);
 637             if r.start > r.end || c == char::MAX {
 638                 return;
 639             }
 640             self.ranges.insert(i, r);
 641         } else if r.end == c {
 642             r.end = dec_char(c);
 643             if r.end < r.start || c == '\x00' {
 644                 return;
 645             }
 646             self.ranges.insert(0, r);
 647         } else {
 648             let (mut r1, mut r2) = (r.clone(), r.clone());
 649             r1.end = dec_char(c);
 650             if r1.start <= r1.end {
 651                 self.ranges.insert(i, r1);
 652                 i += 1;
 653             }
 654             r2.start = inc_char(c);
 655             if r2.start <= r2.end {
 656                 self.ranges.insert(i, r2);
 657             }
 658         }
 659     }
 660
 661     /// Create a new empty class from this one.
 662     fn to_empty(&self) -> CharClass {
 663         CharClass { ranges: Vec::with_capacity(self.len()) }
 664     }
 665
 666     /// Create a byte class from this character class.
 667     ///
 668     /// Codepoints above 0xFF are removed.
 669     fn to_byte_class(self) -> ByteClass {
 670         ByteClass::new(
 671             self.ranges.into_iter()
 672                        .filter_map(|r| r.to_byte_range())
 673                        .collect()).canonicalize()
 674     }
 675
 676     /// Merge two classes and canonicalize them.
 677     #[cfg(test)]
 678     fn merge(mut self, other: CharClass) -> CharClass {
 679         self.ranges.extend(other);
 680         self.canonicalize()
 681     }
 682
 683     /// Canonicalze any sequence of ranges.
 684     ///
 685     /// This is responsible for enforcing the canonical format invariants
 686     /// as described on the docs for the `CharClass` type.
 687     fn canonicalize(mut self) -> CharClass {
 688         // TODO: Save some cycles here by checking if already canonicalized.
 689         self.ranges.sort();
 690         let mut ordered = self.to_empty(); // TODO: Do this in place?
 691         for candidate in self {
 692             // If the candidate overlaps with an existing range, then it must
 693             // be the most recent range added because we process the candidates
 694             // in order.
 695             if let Some(or) = ordered.ranges.last_mut() {
 696                 if or.overlapping(candidate) {
 697                     *or = or.merge(candidate);
 698                     continue;
 699                 }
 700             }
 701             ordered.ranges.push(candidate);
 702         }
 703         ordered
 704     }
 705
 706     /// Negates the character class.
 707     ///
 708     /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
 709     /// if and only if `c` does not match `self.negate()`.
 710     pub fn negate(mut self) -> CharClass {
 711         fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) }
 712
 713         if self.is_empty() {
 714             // Inverting an empty range yields all of Unicode.
 715             return CharClass {
 716                 ranges: vec![ClassRange { start: '\x00', end: '\u{10ffff}' }],
 717             };
 718         }
 719         self = self.canonicalize();
 720         let mut inv = self.to_empty();
 721         if self[0].start > '\x00' {
 722             inv.ranges.push(range('\x00', dec_char(self[0].start)));
 723         }
 724         for win in self.windows(2) {
 725             inv.ranges.push(range(inc_char(win[0].end),
 726                                   dec_char(win[1].start)));
 727         }
 728         if self[self.len() - 1].end < char::MAX {
 729             inv.ranges.push(range(inc_char(self[self.len() - 1].end),
 730                                   char::MAX));
 731         }
 732         inv
 733     }
 734
 735     /// Apply case folding to this character class.
 736     ///
 737     /// N.B. Applying case folding to a negated character class probably
 738     /// won't produce the expected result. e.g., `(?i)[^x]` really should
 739     /// match any character sans `x` and `X`, but if `[^x]` is negated
 740     /// before being case folded, you'll end up matching any character.
 741     pub fn case_fold(self) -> CharClass {
 742         let mut folded = self.to_empty();
 743         for r in self {
 744             // Applying case folding to a range is expensive because *every*
 745             // character needs to be examined. Thus, we avoid that drudgery
 746             // if no character in the current range is in our case folding
 747             // table.
 748             if r.needs_case_folding() {
 749                 folded.ranges.extend(r.case_fold());
 750             }
 751             folded.ranges.push(r);
 752         }
 753         folded.canonicalize()
 754     }
 755
 756     /// Returns the number of characters that match this class.
 757     fn num_chars(&self) -> usize {
 758         self.ranges.iter()
 759             .map(|&r| 1 + (r.end as u32) - (r.start as u32))
 760             .fold(0, |acc, len| acc + len)
 761             as usize
 762     }
 763 }
 764
 765 impl ClassRange {
 766     /// Create a new class range.
 767     ///
 768     /// If `end < start`, then the two values are swapped so that
 769     /// the invariant `start <= end` is preserved.
 770     fn new(start: char, end: char) -> ClassRange {
 771         if start <= end {
 772             ClassRange { start: start, end: end }
 773         } else {
 774             ClassRange { start: end, end: start }
 775         }
 776     }
 777
 778     /// Translate this to a byte class.
 779     ///
 780     /// If the start codepoint exceeds 0xFF, then this returns `None`.
 781     ///
 782     /// If the end codepoint exceeds 0xFF, then it is set to 0xFF.
 783     fn to_byte_range(self) -> Option<ByteRange> {
 784         if self.start > '\u{FF}' {
 785             None
 786         } else {
 787             let s = self.start as u8;
 788             let e = min('\u{FF}', self.end) as u8;
 789             Some(ByteRange::new(s, e))
 790         }
 791     }
 792
 793     /// Create a range of one character.
 794     fn one(c: char) -> ClassRange {
 795         ClassRange { start: c, end: c }
 796     }
 797
 798     /// Returns true if and only if the two ranges are overlapping. Note that
 799     /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
 800     fn overlapping(self, other: ClassRange) -> bool {
 801         max(self.start, other.start) <= inc_char(min(self.end, other.end))
 802     }
 803
 804     /// Creates a new range representing the union of `self` and `other.
 805     fn merge(self, other: ClassRange) -> ClassRange {
 806         ClassRange {
 807             start: min(self.start, other.start),
 808             end: max(self.end, other.end),
 809         }
 810     }
 811
 812     /// Returns true if and only if this range contains a character that is
 813     /// in the case folding table.
 814     fn needs_case_folding(self) -> bool {
 815         case_folding::C_plus_S_both_table
 816         .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok()
 817     }
 818
 819     /// Apply case folding to this range.
 820     ///
 821     /// Since case folding might add characters such that the range is no
 822     /// longer contiguous, this returns multiple class ranges. They are in
 823     /// canonical order.
 824     fn case_fold(self) -> Vec<ClassRange> {
 825         let table = &case_folding::C_plus_S_both_table;
 826         let (s, e) = (self.start as u32, self.end as u32 + 1);
 827         let mut start = self.start;
 828         let mut end = start;
 829         let mut next_case_fold = '\x00';
 830         let mut ranges = Vec::with_capacity(10);
 831         for mut c in (s..e).filter_map(char::from_u32) {
 832             if c >= next_case_fold {
 833                 c = match simple_case_fold_both_result(c) {
 834                     Ok(i) => {
 835                         for &(c1, c2) in &table[i..] {
 836                             if c1 != c {
 837                                 break;
 838                             }
 839                             if c2 != inc_char(end) {
 840                                 ranges.push(ClassRange::new(start, end));
 841                                 start = c2;
 842                             }
 843                             end = c2;
 844                         }
 845                         continue;
 846                     }
 847                     Err(i) => {
 848                         if i < table.len() {
 849                             next_case_fold = table[i].0;
 850                         } else {
 851                             next_case_fold = '\u{10FFFF}';
 852                         }
 853                         c
 854                     }
 855                 };
 856             }
 857             // The fast path. We know this character doesn't have an entry
 858             // in the case folding table.
 859             if c != inc_char(end) {
 860                 ranges.push(ClassRange::new(start, end));
 861                 start = c;
 862             }
 863             end = c;
 864         }
 865         ranges.push(ClassRange::new(start, end));
 866         ranges
 867     }
 868 }
 869
 870 impl PartialEq<char> for ClassRange {
 871     #[inline]
 872     fn eq(&self, other: &char) -> bool {
 873         self.start <= *other && *other <= self.end
 874     }
 875 }
 876
 877 impl PartialEq<ClassRange> for char {
 878     #[inline]
 879     fn eq(&self, other: &ClassRange) -> bool {
 880         other.eq(self)
 881     }
 882 }
 883
 884 impl PartialOrd<char> for ClassRange {
 885     #[inline]
 886     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
 887         Some(if self == other {
 888             Ordering::Equal
 889         } else if *other > self.end {
 890             Ordering::Greater
 891         } else {
 892             Ordering::Less
 893         })
 894     }
 895 }
 896
 897 impl PartialOrd<ClassRange> for char {
 898     #[inline]
 899     fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> {
 900         other.partial_cmp(self).map(|o| o.reverse())
 901     }
 902 }
 903
 904 impl ByteClass {
 905     /// Create a new class from an existing set of ranges.
 906     pub fn new(ranges: Vec<ByteRange>) -> ByteClass {
 907         ByteClass { ranges: ranges }
 908     }
 909
 910     /// Returns true if `b` is matched by this byte class.
 911     pub fn matches(&self, b: u8) -> bool {
 912         self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok()
 913     }
 914
 915     /// Removes the given byte from the class if it exists.
 916     ///
 917     /// Note that this takes `O(n)` time in the number of ranges.
 918     pub fn remove(&mut self, b: u8) {
 919         let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) {
 920             Ok(i) => i,
 921             Err(_) => return,
 922         };
 923         let mut r = self.ranges.remove(i);
 924         if r.start == b {
 925             r.start = b.saturating_add(1);
 926             if r.start > r.end || b == u8::MAX {
 927                 return;
 928             }
 929             self.ranges.insert(i, r);
 930         } else if r.end == b {
 931             r.end = b.saturating_sub(1);
 932             if r.end < r.start || b == b'\x00' {
 933                 return;
 934             }
 935             self.ranges.insert(0, r);
 936         } else {
 937             let (mut r1, mut r2) = (r.clone(), r.clone());
 938             r1.end = b.saturating_sub(1);
 939             if r1.start <= r1.end {
 940                 self.ranges.insert(i, r1);
 941                 i += 1;
 942             }
 943             r2.start = b.saturating_add(1);
 944             if r2.start <= r2.end {
 945                 self.ranges.insert(i, r2);
 946             }
 947         }
 948     }
 949
 950     /// Create a new empty class from this one.
 951     fn to_empty(&self) -> ByteClass {
 952         ByteClass { ranges: Vec::with_capacity(self.len()) }
 953     }
 954
 955     /// Canonicalze any sequence of ranges.
 956     ///
 957     /// This is responsible for enforcing the canonical format invariants
 958     /// as described on the docs for the `ByteClass` type.
 959     fn canonicalize(mut self) -> ByteClass {
 960         // TODO: Save some cycles here by checking if already canonicalized.
 961         self.ranges.sort();
 962         let mut ordered = self.to_empty(); // TODO: Do this in place?
 963         for candidate in self {
 964             // If the candidate overlaps with an existing range, then it must
 965             // be the most recent range added because we process the candidates
 966             // in order.
 967             if let Some(or) = ordered.ranges.last_mut() {
 968                 if or.overlapping(candidate) {
 969                     *or = or.merge(candidate);
 970                     continue;
 971                 }
 972             }
 973             ordered.ranges.push(candidate);
 974         }
 975         ordered
 976     }
 977
 978     /// Negates the byte class.
 979     ///
 980     /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b`
 981     /// does not match `self.negate()`.
 982     pub fn negate(mut self) -> ByteClass {
 983         fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) }
 984
 985         if self.is_empty() {
 986             // Inverting an empty range yields all bytes.
 987             return ByteClass {
 988                 ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }],
 989             };
 990         }
 991         self = self.canonicalize();
 992         let mut inv = self.to_empty();
 993         if self[0].start > b'\x00' {
 994             inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1)));
 995         }
 996         for win in self.windows(2) {
 997             inv.ranges.push(range(win[0].end.saturating_add(1),
 998                                   win[1].start.saturating_sub(1)));
 999         }
1000         if self[self.len() - 1].end < u8::MAX {
1001             inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1),
1002                                   u8::MAX));
1003         }
1004         inv
1005     }
1006
1007     /// Apply case folding to this byte class.
1008     ///
1009     /// This assumes that the bytes in the ranges are ASCII compatible.
1010     ///
1011     /// N.B. Applying case folding to a negated character class probably
1012     /// won't produce the expected result. e.g., `(?i)[^x]` really should
1013     /// match any character sans `x` and `X`, but if `[^x]` is negated
1014     /// before being case folded, you'll end up matching any character.
1015     pub fn case_fold(self) -> ByteClass {
1016         let mut folded = self.to_empty();
1017         for r in self {
1018             folded.ranges.extend(r.case_fold());
1019         }
1020         folded.canonicalize()
1021     }
1022
1023     /// Returns the number of bytes that match this class.
1024     fn num_bytes(&self) -> usize {
1025         self.ranges.iter()
1026             .map(|&r| 1 + (r.end as u32) - (r.start as u32))
1027             .fold(0, |acc, len| acc + len)
1028             as usize
1029     }
1030 }
1031
1032 impl ByteRange {
1033     /// Create a new class range.
1034     ///
1035     /// If `end < start`, then the two values are swapped so that
1036     /// the invariant `start <= end` is preserved.
1037     fn new(start: u8, end: u8) -> ByteRange {
1038         if start <= end {
1039             ByteRange { start: start, end: end }
1040         } else {
1041             ByteRange { start: end, end: start }
1042         }
1043     }
1044
1045     /// Returns true if and only if the two ranges are overlapping. Note that
1046     /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
1047     fn overlapping(self, other: ByteRange) -> bool {
1048         max(self.start, other.start)
1049         <= min(self.end, other.end).saturating_add(1)
1050     }
1051
1052     /// Returns true if and only if the intersection of self and other is non
1053     /// empty.
1054     fn is_intersect_empty(self, other: ByteRange) -> bool {
1055         max(self.start, other.start) > min(self.end, other.end)
1056     }
1057
1058     /// Creates a new range representing the union of `self` and `other.
1059     fn merge(self, other: ByteRange) -> ByteRange {
1060         ByteRange {
1061             start: min(self.start, other.start),
1062             end: max(self.end, other.end),
1063         }
1064     }
1065
1066     /// Apply case folding to this range.
1067     ///
1068     /// Since case folding might add bytes such that the range is no
1069     /// longer contiguous, this returns multiple byte ranges.
1070     ///
1071     /// This assumes that the bytes in this range are ASCII compatible.
1072     fn case_fold(self) -> Vec<ByteRange> {
1073         // So much easier than Unicode case folding!
1074         let mut ranges = vec![self];
1075         if !ByteRange::new(b'a', b'z').is_intersect_empty(self) {
1076             let lower = max(self.start, b'a');
1077             let upper = min(self.end, b'z');
1078             ranges.push(ByteRange::new(lower - 32, upper - 32));
1079         }
1080         if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) {
1081             let lower = max(self.start, b'A');
1082             let upper = min(self.end, b'Z');
1083             ranges.push(ByteRange::new(lower + 32, upper + 32));
1084         }
1085         ranges
1086     }
1087 }
1088
1089 impl Deref for ByteClass {
1090     type Target = Vec<ByteRange>;
1091     fn deref(&self) -> &Vec<ByteRange> { &self.ranges }
1092 }
1093
1094 impl IntoIterator for ByteClass {
1095     type Item = ByteRange;
1096     type IntoIter = vec::IntoIter<ByteRange>;
1097     fn into_iter(self) -> vec::IntoIter<ByteRange> { self.ranges.into_iter() }
1098 }
1099
1100 impl<'a> IntoIterator for &'a ByteClass {
1101     type Item = &'a ByteRange;
1102     type IntoIter = slice::Iter<'a, ByteRange>;
1103     fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() }
1104 }
1105
1106 impl PartialEq<u8> for ByteRange {
1107     #[inline]
1108     fn eq(&self, other: &u8) -> bool {
1109         self.start <= *other && *other <= self.end
1110     }
1111 }
1112
1113 impl PartialEq<ByteRange> for u8 {
1114     #[inline]
1115     fn eq(&self, other: &ByteRange) -> bool {
1116         other.eq(self)
1117     }
1118 }
1119
1120 impl PartialOrd<u8> for ByteRange {
1121     #[inline]
1122     fn partial_cmp(&self, other: &u8) -> Option<Ordering> {
1123         Some(if self == other {
1124             Ordering::Equal
1125         } else if *other > self.end {
1126             Ordering::Greater
1127         } else {
1128             Ordering::Less
1129         })
1130     }
1131 }
1132
1133 impl PartialOrd<ByteRange> for u8 {
1134     #[inline]
1135     fn partial_cmp(&self, other: &ByteRange) -> Option<Ordering> {
1136         other.partial_cmp(self).map(|o| o.reverse())
1137     }
1138 }
1139
1140 /// This implementation of `Display` will write a regular expression from the
1141 /// syntax tree. It does not write the original string parsed.
1142 impl fmt::Display for Expr {
1143     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1144         match *self {
1145             Empty => write!(f, ""),
1146             Literal { ref chars, casei } => {
1147                 if casei {
1148                     try!(write!(f, "(?iu:"));
1149                 } else {
1150                     try!(write!(f, "(?u:"));
1151                 }
1152                 for &c in chars {
1153                     try!(write!(f, "{}", quote_char(c)));
1154                 }
1155                 try!(write!(f, ")"));
1156                 Ok(())
1157             }
1158             LiteralBytes { ref bytes, casei } => {
1159                 if casei {
1160                     try!(write!(f, "(?i-u:"));
1161                 } else {
1162                     try!(write!(f, "(?-u:"));
1163                 }
1164                 for &b in bytes {
1165                     try!(write!(f, "{}", quote_byte(b)));
1166                 }
1167                 try!(write!(f, ")"));
1168                 Ok(())
1169             }
1170             AnyChar => write!(f, "(?su:.)"),
1171             AnyCharNoNL => write!(f, "(?u:.)"),
1172             AnyByte => write!(f, "(?s-u:.)"),
1173             AnyByteNoNL => write!(f, "(?-u:.)"),
1174             Class(ref cls) => write!(f, "{}", cls),
1175             ClassBytes(ref cls) => write!(f, "{}", cls),
1176             StartLine => write!(f, "(?m:^)"),
1177             EndLine => write!(f, "(?m:$)"),
1178             StartText => write!(f, r"^"),
1179             EndText => write!(f, r"$"),
1180             WordBoundary => write!(f, r"(?u:\b)"),
1181             NotWordBoundary => write!(f, r"(?u:\B)"),
1182             WordBoundaryAscii => write!(f, r"(?-u:\b)"),
1183             NotWordBoundaryAscii => write!(f, r"(?-u:\B)"),
1184             Group { ref e, i: None, name: None } => write!(f, "(?:{})", e),
1185             Group { ref e, name: None, .. } => write!(f, "({})", e),
1186             Group { ref e, name: Some(ref n), .. } => {
1187                 write!(f, "(?P<{}>{})", n, e)
1188             }
1189             Repeat { ref e, r, greedy } => {
1190                 match &**e {
1191                     &Literal { ref chars, .. } if chars.len() > 1 => {
1192                         try!(write!(f, "(?:{}){}", e, r))
1193                     }
1194                     _ => try!(write!(f, "{}{}", e, r)),
1195                 }
1196                 if !greedy { try!(write!(f, "?")); }
1197                 Ok(())
1198             }
1199             Concat(ref es) => {
1200                 for e in es {
1201                     try!(write!(f, "{}", e));
1202                 }
1203                 Ok(())
1204             }
1205             Alternate(ref es) => {
1206                 for (i, e) in es.iter().enumerate() {
1207                     if i > 0 { try!(write!(f, "|")); }
1208                     try!(write!(f, "{}", e));
1209                 }
1210                 Ok(())
1211             }
1212         }
1213     }
1214 }
1215
1216 impl fmt::Display for Repeater {
1217     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1218         match *self {
1219             ZeroOrOne => write!(f, "?"),
1220             ZeroOrMore => write!(f, "*"),
1221             OneOrMore => write!(f, "+"),
1222             Range { min: s, max: None } => write!(f, "{{{},}}", s),
1223             Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s),
1224             Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e),
1225         }
1226     }
1227 }
1228
1229 impl fmt::Display for CharClass {
1230     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1231         try!(write!(f, "(?u:["));
1232         for range in self.iter() {
1233             if range.start == '-' || range.end == '-' {
1234                 try!(write!(f, "-"));
1235                 break;
1236             }
1237         }
1238         for range in self.iter() {
1239             let mut range = *range;
1240             if range.start == '-' {
1241                 range.start = ((range.start as u8) + 1) as char;
1242             }
1243             if range.end == '-' {
1244                 range.end = ((range.end as u8) - 1) as char;
1245             }
1246             if range.start > range.end {
1247                 continue;
1248             }
1249             try!(write!(f, "{}", range));
1250         }
1251         try!(write!(f, "])"));
1252         Ok(())
1253     }
1254 }
1255
1256 impl fmt::Display for ClassRange {
1257     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1258         write!(f, "{}-{}", quote_char(self.start), quote_char(self.end))
1259     }
1260 }
1261
1262 impl fmt::Display for ByteClass {
1263     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1264         try!(write!(f, "(?-u:["));
1265         for range in self.iter() {
1266             if range.start == b'-' || range.end == b'-' {
1267                 try!(write!(f, "-"));
1268                 break;
1269             }
1270         }
1271         for range in self.iter() {
1272             let mut range = *range;
1273             if range.start == b'-' {
1274                 range.start += 1;
1275             }
1276             if range.end == b'-' {
1277                 range.start -= 1;
1278             }
1279             if range.start > range.end {
1280                 continue;
1281             }
1282             try!(write!(f, "{}", range));
1283         }
1284         try!(write!(f, "])"));
1285         Ok(())
1286     }
1287 }
1288
1289 impl fmt::Display for ByteRange {
1290     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1291         write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end))
1292     }
1293 }
1294
1295 /// An alias for computations that can return a `Error`.
1296 pub type Result<T> = ::std::result::Result<T, Error>;
1297
1298 /// A parse error.
1299 ///
1300 /// This includes details about the specific type of error and a rough
1301 /// approximation of where it occurred.
1302 #[derive(Clone, Debug, PartialEq)]
1303 pub struct Error {
1304     pos: usize,
1305     surround: String,
1306     kind: ErrorKind,
1307 }
1308
1309 /// The specific type of parse error that can occur.
1310 #[derive(Clone, Debug, PartialEq)]
1311 pub enum ErrorKind {
1312     /// A negation symbol is used twice in flag settings.
1313     /// e.g., `(?-i-s)`.
1314     DoubleFlagNegation,
1315     /// The same capture name was used more than once.
1316     /// e.g., `(?P<a>.)(?P<a>.)`.
1317     DuplicateCaptureName(String),
1318     /// An alternate is empty. e.g., `(|a)`.
1319     EmptyAlternate,
1320     /// A capture group name is empty. e.g., `(?P<>a)`.
1321     EmptyCaptureName,
1322     /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
1323     EmptyFlagNegation,
1324     /// A group is empty. e.g., `()`.
1325     EmptyGroup,
1326     /// An invalid number was used in a counted repetition. e.g., `a{b}`.
1327     InvalidBase10(String),
1328     /// An invalid hexadecimal number was used in an escape sequence.
1329     /// e.g., `\xAG`.
1330     InvalidBase16(String),
1331     /// An invalid capture name was used. e.g., `(?P<0a>b)`.
1332     InvalidCaptureName(String),
1333     /// An invalid class range was givien. Specifically, when the start of the
1334     /// range is greater than the end. e.g., `[z-a]`.
1335     InvalidClassRange {
1336         /// The first character specified in the range.
1337         start: char,
1338         /// The second character specified in the range.
1339         end: char,
1340     },
1341     /// An escape sequence was used in a character class where it is not
1342     /// allowed. e.g., `[a-\pN]` or `[\A]`.
1343     InvalidClassEscape(Expr),
1344     /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
1345     InvalidRepeatRange {
1346         /// The first number specified in the repetition.
1347         min: u32,
1348         /// The second number specified in the repetition.
1349         max: u32,
1350     },
1351     /// An invalid Unicode scalar value was used in a long hexadecimal
1352     /// sequence. e.g., `\x{D800}`.
1353     InvalidScalarValue(u32),
1354     /// An empty counted repetition operator. e.g., `a{}`.
1355     MissingBase10,
1356     /// A repetition operator was not applied to an expression. e.g., `*`.
1357     RepeaterExpectsExpr,
1358     /// A repetition operator was applied to an expression that cannot be
1359     /// repeated. e.g., `a+*` or `a|*`.
1360     RepeaterUnexpectedExpr(Expr),
1361     /// A capture group name that is never closed. e.g., `(?P<a`.
1362     UnclosedCaptureName(String),
1363     /// An unclosed hexadecimal literal. e.g., `\x{a`.
1364     UnclosedHex,
1365     /// An unclosed parenthesis. e.g., `(a`.
1366     UnclosedParen,
1367     /// An unclosed counted repetition operator. e.g., `a{2`.
1368     UnclosedRepeat,
1369     /// An unclosed named Unicode class. e.g., `\p{Yi`.
1370     UnclosedUnicodeName,
1371     /// Saw end of regex before class was closed. e.g., `[a`.
1372     UnexpectedClassEof,
1373     /// Saw end of regex before escape sequence was closed. e.g., `\`.
1374     UnexpectedEscapeEof,
1375     /// Saw end of regex before flags were closed. e.g., `(?i`.
1376     UnexpectedFlagEof,
1377     /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
1378     UnexpectedTwoDigitHexEof,
1379     /// Unopened parenthesis. e.g., `)`.
1380     UnopenedParen,
1381     /// Unrecognized escape sequence. e.g., `\q`.
1382     UnrecognizedEscape(char),
1383     /// Unrecognized flag. e.g., `(?a)`.
1384     UnrecognizedFlag(char),
1385     /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
1386     UnrecognizedUnicodeClass(String),
1387     /// Indicates that the regex uses too much nesting.
1388     ///
1389     /// (N.B. This error exists because traversing the Expr is recursive and
1390     /// an explicit heap allocated stack is not (yet?) used. Regardless, some
1391     /// sort of limit must be applied to avoid unbounded memory growth.
1392     StackExhausted,
1393     /// A disallowed flag was found (e.g., `u`).
1394     FlagNotAllowed(char),
1395     /// A Unicode class was used when the Unicode (`u`) flag was disabled.
1396     UnicodeNotAllowed,
1397     /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes.
1398     /// This never returned if the parser is permitted to allow expressions
1399     /// that match arbitrary bytes.
1400     InvalidUtf8,
1401     /// A character class was constructed such that it is empty.
1402     /// e.g., `[^\d\D]`.
1403     EmptyClass,
1404     /// Hints that destructuring should not be exhaustive.
1405     ///
1406     /// This enum may grow additional variants, so this makes sure clients
1407     /// don't count on exhaustive matching. (Otherwise, adding a new variant
1408     /// could break existing code.)
1409     #[doc(hidden)]
1410     __Nonexhaustive,
1411 }
1412
1413 impl Error {
1414     /// Returns an approximate *character* offset at which the error occurred.
1415     ///
1416     /// The character offset may be equal to the number of characters in the
1417     /// string, in which case it should be interpreted as pointing to the end
1418     /// of the regex.
1419     pub fn position(&self) -> usize {
1420         self.pos
1421     }
1422
1423     /// Returns the type of the regex parse error.
1424     pub fn kind(&self) -> &ErrorKind {
1425         &self.kind
1426     }
1427 }
1428
1429 impl ErrorKind {
1430     fn description(&self) -> &str {
1431         use ErrorKind::*;
1432         match *self {
1433             DoubleFlagNegation => "double flag negation",
1434             DuplicateCaptureName(_) => "duplicate capture name",
1435             EmptyAlternate => "empty alternate",
1436             EmptyCaptureName => "empty capture name",
1437             EmptyFlagNegation => "flag negation without any flags",
1438             EmptyGroup => "empty group (e.g., '()')",
1439             InvalidBase10(_) => "invalid base 10 number",
1440             InvalidBase16(_) => "invalid base 16 number",
1441             InvalidCaptureName(_) => "invalid capture name",
1442             InvalidClassRange{..} => "invalid character class range",
1443             InvalidClassEscape(_) => "invalid escape sequence in class",
1444             InvalidRepeatRange{..} => "invalid counted repetition range",
1445             InvalidScalarValue(_) => "invalid Unicode scalar value",
1446             MissingBase10 => "missing count in repetition operator",
1447             RepeaterExpectsExpr => "repetition operator missing expression",
1448             RepeaterUnexpectedExpr(_) => "expression cannot be repeated",
1449             UnclosedCaptureName(_) => "unclosed capture group name",
1450             UnclosedHex => "unclosed hexadecimal literal",
1451             UnclosedParen => "unclosed parenthesis",
1452             UnclosedRepeat => "unclosed counted repetition operator",
1453             UnclosedUnicodeName => "unclosed Unicode class literal",
1454             UnexpectedClassEof => "unexpected EOF in character class",
1455             UnexpectedEscapeEof => "unexpected EOF in escape sequence",
1456             UnexpectedFlagEof => "unexpected EOF in flags",
1457             UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal",
1458             UnopenedParen => "unopened parenthesis",
1459             UnrecognizedEscape(_) => "unrecognized escape sequence",
1460             UnrecognizedFlag(_) => "unrecognized flag",
1461             UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name",
1462             StackExhausted => "stack exhausted, too much nesting",
1463             FlagNotAllowed(_) => "flag not allowed",
1464             UnicodeNotAllowed => "Unicode features not allowed",
1465             InvalidUtf8 => "matching arbitrary bytes is not allowed",
1466             EmptyClass => "empty character class",
1467             __Nonexhaustive => unreachable!(),
1468         }
1469     }
1470 }
1471
1472 impl ::std::error::Error for Error {
1473     fn description(&self) -> &str {
1474         self.kind.description()
1475     }
1476 }
1477
1478 impl fmt::Display for Error {
1479     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1480         if let ErrorKind::StackExhausted = self.kind {
1481             write!(f, "Error parsing regex: {}", self.kind)
1482         } else {
1483             write!(
1484                 f, "Error parsing regex near '{}' at character offset {}: {}",
1485                 self.surround, self.pos, self.kind)
1486         }
1487     }
1488 }
1489
1490 impl fmt::Display for ErrorKind {
1491     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1492         use ErrorKind::*;
1493         match *self {
1494             DoubleFlagNegation =>
1495                 write!(f, "Only one negation symbol is allowed in flags."),
1496             DuplicateCaptureName(ref s) =>
1497                 write!(f, "Capture name '{}' is used more than once.", s),
1498             EmptyAlternate =>
1499                 write!(f, "Alternations cannot be empty."),
1500             EmptyCaptureName =>
1501                 write!(f, "Capture names cannot be empty."),
1502             EmptyFlagNegation =>
1503                 write!(f, "Flag negation requires setting at least one flag."),
1504             EmptyGroup =>
1505                 write!(f, "Empty regex groups (e.g., '()') are not allowed."),
1506             InvalidBase10(ref s) =>
1507                 write!(f, "Not a valid base 10 number: '{}'", s),
1508             InvalidBase16(ref s) =>
1509                 write!(f, "Not a valid base 16 number: '{}'", s),
1510             InvalidCaptureName(ref s) =>
1511                 write!(f, "Invalid capture name: '{}'. Capture names must \
1512                            consist of [_a-zA-Z0-9] and are not allowed to \
1513                            start with with a number.", s),
1514             InvalidClassRange { start, end } =>
1515                 write!(f, "Invalid character class range '{}-{}'. \
1516                            Character class ranges must start with the smaller \
1517                            character, but {} > {}", start, end, start, end),
1518             InvalidClassEscape(ref e) =>
1519                 write!(f, "Invalid escape sequence in character \
1520                            class: '{}'.", e),
1521             InvalidRepeatRange { min, max } =>
1522                 write!(f, "Invalid counted repetition range: {{{}, {}}}. \
1523                            Counted repetition ranges must start with the \
1524                            minimum, but {} > {}", min, max, min, max),
1525             InvalidScalarValue(c) =>
1526                 write!(f, "Number does not correspond to a Unicode scalar \
1527                            value: '{}'.", c),
1528             MissingBase10 =>
1529                 write!(f, "Missing maximum in counted reptition operator."),
1530             RepeaterExpectsExpr =>
1531                 write!(f, "Missing expression for reptition operator."),
1532             RepeaterUnexpectedExpr(ref e) =>
1533                 write!(f, "Invalid application of reptition operator to: \
1534                           '{}'.", e),
1535             UnclosedCaptureName(ref s) =>
1536                 write!(f, "Capture name group for '{}' is not closed. \
1537                            (Missing a '>'.)", s),
1538             UnclosedHex =>
1539                 write!(f, "Unclosed hexadecimal literal (missing a '}}')."),
1540             UnclosedParen =>
1541                 write!(f, "Unclosed parenthesis."),
1542             UnclosedRepeat =>
1543                 write!(f, "Unclosed counted repetition (missing a '}}')."),
1544             UnclosedUnicodeName =>
1545                 write!(f, "Unclosed Unicode literal (missing a '}}')."),
1546             UnexpectedClassEof =>
1547                 write!(f, "Character class was not closed before the end of \
1548                            the regex (missing a ']')."),
1549             UnexpectedEscapeEof =>
1550                 write!(f, "Started an escape sequence that didn't finish \
1551                            before the end of the regex."),
1552             UnexpectedFlagEof =>
1553                 write!(f, "Inline flag settings was not closed before the end \
1554                            of the regex (missing a ')' or ':')."),
1555             UnexpectedTwoDigitHexEof =>
1556                 write!(f, "Unexpected end of two digit hexadecimal literal."),
1557             UnopenedParen =>
1558                 write!(f, "Unopened parenthesis."),
1559             UnrecognizedEscape(c) =>
1560                 write!(f, "Unrecognized escape sequence: '\\{}'.", c),
1561             UnrecognizedFlag(c) =>
1562                 write!(f, "Unrecognized flag: '{}'. \
1563                            (Allowed flags: i, m, s, U, u, x.)", c),
1564             UnrecognizedUnicodeClass(ref s) =>
1565                 write!(f, "Unrecognized Unicode class name: '{}'.", s),
1566             StackExhausted =>
1567                 write!(f, "Exhausted space required to parse regex with too \
1568                            much nesting."),
1569             FlagNotAllowed(flag) =>
1570                 write!(f, "Use of the flag '{}' is not allowed.", flag),
1571             UnicodeNotAllowed =>
1572                 write!(f, "Unicode features are not allowed when the Unicode \
1573                            (u) flag is not set."),
1574             InvalidUtf8 =>
1575                 write!(f, "Matching arbitrary bytes is not allowed."),
1576             EmptyClass =>
1577                 write!(f, "Empty character classes are not allowed."),
1578             __Nonexhaustive => unreachable!(),
1579         }
1580     }
1581 }
1582
1583 /// The result of binary search on the simple case folding table.
1584 ///
1585 /// Note that this binary search is done on the "both" table, such that
1586 /// the index returned corresponds to the *first* location of `c1` in the
1587 /// table. The table can then be scanned linearly starting from the position
1588 /// returned to find other case mappings for `c1`.
1589 fn simple_case_fold_both_result(c1: char) -> result::Result<usize, usize> {
1590     let table = &case_folding::C_plus_S_both_table;
1591     let i = binary_search(table, |&(c2, _)| c1 <= c2);
1592     if i >= table.len() || table[i].0 != c1 {
1593         Err(i)
1594     } else {
1595         Ok(i)
1596     }
1597 }
1598
1599 /// Binary search to find first element such that `pred(T) == true`.
1600 ///
1601 /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`.
1602 ///
1603 /// If all elements yield `pred(T) == false`, then `xs.len()` is returned.
1604 fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
1605         where F: FnMut(&T) -> bool {
1606     let (mut left, mut right) = (0, xs.len());
1607     while left < right {
1608         let mid = (left + right) / 2;
1609         if pred(&xs[mid]) {
1610             right = mid;
1611         } else {
1612             left = mid + 1;
1613         }
1614     }
1615     left
1616 }
1617
1618 /// Escapes all regular expression meta characters in `text`.
1619 ///
1620 /// The string returned may be safely used as a literal in a regular
1621 /// expression.
1622 pub fn quote(text: &str) -> String {
1623     let mut quoted = String::with_capacity(text.len());
1624     for c in text.chars() {
1625         if parser::is_punct(c) {
1626             quoted.push('\\');
1627         }
1628         quoted.push(c);
1629     }
1630     quoted
1631 }
1632
1633 fn quote_char(c: char) -> String {
1634     let mut s = String::new();
1635     if parser::is_punct(c) {
1636         s.push('\\');
1637     }
1638     s.push(c);
1639     s
1640 }
1641
1642 fn quote_byte(b: u8) -> String {
1643     if parser::is_punct(b as char) || b == b'\'' || b == b'"' {
1644         quote_char(b as char)
1645     } else {
1646         let escaped: Vec<u8> = ascii::escape_default(b).collect();
1647         String::from_utf8(escaped).unwrap()
1648     }
1649 }
1650
1651 fn inc_char(c: char) -> char {
1652     match c {
1653         char::MAX => char::MAX,
1654         '\u{D7FF}' => '\u{E000}',
1655         c => char::from_u32(c as u32 + 1).unwrap(),
1656     }
1657 }
1658
1659 fn dec_char(c: char) -> char {
1660     match c {
1661         '\x00' => '\x00',
1662         '\u{E000}' => '\u{D7FF}',
1663         c => char::from_u32(c as u32 - 1).unwrap(),
1664     }
1665 }
1666
1667 /// Returns true if and only if `c` is a word character.
1668 #[doc(hidden)]
1669 pub fn is_word_char(c: char) -> bool {
1670     match c {
1671         '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z'  => true,
1672         _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
1673             if c >= start && c <= end {
1674                 Ordering::Equal
1675             } else if start > c {
1676                 Ordering::Greater
1677             } else {
1678                 Ordering::Less
1679             }
1680         }).is_ok(),
1681     }
1682 }
1683
1684 /// Returns true if and only if `c` is an ASCII word byte.
1685 #[doc(hidden)]
1686 pub fn is_word_byte(b: u8) -> bool {
1687     match b {
1688         b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z'  => true,
1689         _ => false,
1690     }
1691 }
1692
1693 #[cfg(test)]
1694 mod properties;
1695
1696 #[cfg(test)]
1697 mod tests {
1698     use {CharClass, ClassRange, ByteClass, ByteRange, Expr};
1699
1700     fn class(ranges: &[(char, char)]) -> CharClass {
1701         let ranges = ranges.iter().cloned()
1702                            .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1703         CharClass::new(ranges)
1704     }
1705
1706     fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1707         let ranges = ranges.iter().cloned()
1708                            .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1709         ByteClass::new(ranges)
1710     }
1711
1712     fn e(re: &str) -> Expr { Expr::parse(re).unwrap() }
1713
1714     #[test]
1715     fn stack_exhaustion() {
1716         use std::iter::repeat;
1717
1718         let open: String = repeat('(').take(200).collect();
1719         let close: String = repeat(')').take(200).collect();
1720         assert!(Expr::parse(&format!("{}a{}", open, close)).is_ok());
1721
1722         let open: String = repeat('(').take(200 + 1).collect();
1723         let close: String = repeat(')').take(200 + 1).collect();
1724         assert!(Expr::parse(&format!("{}a{}", open, close)).is_err());
1725     }
1726
1727     #[test]
1728     fn anchored_start() {
1729         assert!(e("^a").is_anchored_start());
1730         assert!(e("(^a)").is_anchored_start());
1731         assert!(e("^a|^b").is_anchored_start());
1732         assert!(e("(^a)|(^b)").is_anchored_start());
1733         assert!(e("(^(a|b))").is_anchored_start());
1734
1735         assert!(!e("^a|b").is_anchored_start());
1736         assert!(!e("a|^b").is_anchored_start());
1737     }
1738
1739     #[test]
1740     fn anchored_end() {
1741         assert!(e("a$").is_anchored_end());
1742         assert!(e("(a$)").is_anchored_end());
1743         assert!(e("a$|b$").is_anchored_end());
1744         assert!(e("(a$)|(b$)").is_anchored_end());
1745         assert!(e("((a|b)$)").is_anchored_end());
1746
1747         assert!(!e("a$|b").is_anchored_end());
1748         assert!(!e("a|b$").is_anchored_end());
1749     }
1750
1751     #[test]
1752     fn class_canon_no_change() {
1753         let cls = class(&[('a', 'c'), ('x', 'z')]);
1754         assert_eq!(cls.clone().canonicalize(), cls);
1755     }
1756
1757     #[test]
1758     fn class_canon_unordered() {
1759         let cls = class(&[('x', 'z'), ('a', 'c')]);
1760         assert_eq!(cls.canonicalize(), class(&[
1761             ('a', 'c'), ('x', 'z'),
1762         ]));
1763     }
1764
1765     #[test]
1766     fn class_canon_overlap() {
1767         let cls = class(&[('x', 'z'), ('w', 'y')]);
1768         assert_eq!(cls.canonicalize(), class(&[
1769             ('w', 'z'),
1770         ]));
1771     }
1772
1773     #[test]
1774     fn class_canon_overlap_many() {
1775         let cls = class(&[
1776             ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1777             ('m', 'p'), ('l', 's'),
1778         ]);
1779         assert_eq!(cls.clone().canonicalize(), class(&[
1780             ('a', 'j'), ('l', 's'),
1781         ]));
1782     }
1783
1784     #[test]
1785     fn class_canon_overlap_boundary() {
1786         let cls = class(&[('x', 'z'), ('u', 'w')]);
1787         assert_eq!(cls.canonicalize(), class(&[
1788             ('u', 'z'),
1789         ]));
1790     }
1791
1792     #[test]
1793     fn class_canon_extreme_edge_case() {
1794         let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1795         assert_eq!(cls.canonicalize(), class(&[
1796             ('\x00', '\u{10FFFF}'),
1797         ]));
1798     }
1799
1800     #[test]
1801     fn class_canon_singles() {
1802         let cls = class(&[('a', 'a'), ('b', 'b')]);
1803         assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
1804     }
1805
1806     #[test]
1807     fn class_negate_single() {
1808         let cls = class(&[('a', 'a')]);
1809         assert_eq!(cls.negate(), class(&[
1810             ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
1811         ]));
1812     }
1813
1814     #[test]
1815     fn class_negate_singles() {
1816         let cls = class(&[('a', 'a'), ('b', 'b')]);
1817         assert_eq!(cls.negate(), class(&[
1818             ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
1819         ]));
1820     }
1821
1822     #[test]
1823     fn class_negate_multiples() {
1824         let cls = class(&[('a', 'c'), ('x', 'z')]);
1825         assert_eq!(cls.negate(), class(&[
1826             ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
1827         ]));
1828     }
1829
1830     #[test]
1831     fn class_negate_min_scalar() {
1832         let cls = class(&[('\x00', 'a')]);
1833         assert_eq!(cls.negate(), class(&[
1834             ('\x62', '\u{10FFFF}'),
1835         ]));
1836     }
1837
1838     #[test]
1839     fn class_negate_max_scalar() {
1840         let cls = class(&[('a', '\u{10FFFF}')]);
1841         assert_eq!(cls.negate(), class(&[
1842             ('\x00', '\x60'),
1843         ]));
1844     }
1845
1846     #[test]
1847     fn class_negate_everything() {
1848         let cls = class(&[('\x00', '\u{10FFFF}')]);
1849         assert_eq!(cls.negate(), class(&[]));
1850     }
1851
1852     #[test]
1853     fn class_negate_everything_sans_one() {
1854         let cls = class(&[
1855             ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
1856         ]);
1857         assert_eq!(cls.negate(), class(&[
1858             ('\u{10FFFE}', '\u{10FFFE}'),
1859         ]));
1860     }
1861
1862     #[test]
1863     fn class_negate_surrogates_min() {
1864         let cls = class(&[('\x00', '\u{D7FF}')]);
1865         assert_eq!(cls.negate(), class(&[
1866             ('\u{E000}', '\u{10FFFF}'),
1867         ]));
1868     }
1869
1870     #[test]
1871     fn class_negate_surrogates_min_edge() {
1872         let cls = class(&[('\x00', '\u{D7FE}')]);
1873         assert_eq!(cls.negate(), class(&[
1874             ('\u{D7FF}', '\u{10FFFF}'),
1875         ]));
1876     }
1877
1878     #[test]
1879     fn class_negate_surrogates_max() {
1880         let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
1881         assert_eq!(cls.negate(), class(&[
1882             ('\x00', '\u{D7FF}'),
1883         ]));
1884     }
1885
1886     #[test]
1887     fn class_negate_surrogates_max_edge() {
1888         let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
1889         assert_eq!(cls.negate(), class(&[
1890             ('\x00', '\u{E000}'),
1891         ]));
1892     }
1893
1894     #[test]
1895     fn class_canon_overlap_many_case_fold() {
1896         let cls = class(&[
1897             ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
1898             ('M', 'P'), ('L', 'S'), ('c', 'f'),
1899         ]);
1900         assert_eq!(cls.case_fold(), class(&[
1901             ('A', 'J'), ('L', 'S'),
1902             ('a', 'j'), ('l', 's'),
1903             ('\u{17F}', '\u{17F}'),
1904         ]));
1905
1906         let cls = bclass(&[
1907             (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
1908             (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
1909         ]);
1910         assert_eq!(cls.case_fold(), bclass(&[
1911             (b'A', b'J'), (b'L', b'S'),
1912             (b'a', b'j'), (b'l', b's'),
1913         ]));
1914     }
1915
1916     #[test]
1917     fn class_fold_az() {
1918         let cls = class(&[('A', 'Z')]);
1919         assert_eq!(cls.case_fold(), class(&[
1920             ('A', 'Z'), ('a', 'z'),
1921             ('\u{17F}', '\u{17F}'),
1922             ('\u{212A}', '\u{212A}'),
1923         ]));
1924         let cls = class(&[('a', 'z')]);
1925         assert_eq!(cls.case_fold(), class(&[
1926             ('A', 'Z'), ('a', 'z'),
1927             ('\u{17F}', '\u{17F}'),
1928             ('\u{212A}', '\u{212A}'),
1929         ]));
1930
1931         let cls = bclass(&[(b'A', b'Z')]);
1932         assert_eq!(cls.case_fold(), bclass(&[
1933             (b'A', b'Z'), (b'a', b'z'),
1934         ]));
1935         let cls = bclass(&[(b'a', b'z')]);
1936         assert_eq!(cls.case_fold(), bclass(&[
1937             (b'A', b'Z'), (b'a', b'z'),
1938         ]));
1939     }
1940
1941     #[test]
1942     fn class_fold_a_underscore() {
1943         let cls = class(&[('A', 'A'), ('_', '_')]);
1944         assert_eq!(cls.clone().canonicalize(), class(&[
1945             ('A', 'A'), ('_', '_'),
1946         ]));
1947         assert_eq!(cls.case_fold(), class(&[
1948             ('A', 'A'), ('_', '_'), ('a', 'a'),
1949         ]));
1950
1951         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1952         assert_eq!(cls.clone().canonicalize(), bclass(&[
1953             (b'A', b'A'), (b'_', b'_'),
1954         ]));
1955         assert_eq!(cls.case_fold(), bclass(&[
1956             (b'A', b'A'), (b'_', b'_'), (b'a', b'a'),
1957         ]));
1958     }
1959
1960     #[test]
1961     fn class_fold_a_equals() {
1962         let cls = class(&[('A', 'A'), ('=', '=')]);
1963         assert_eq!(cls.clone().canonicalize(), class(&[
1964             ('=', '='), ('A', 'A'),
1965         ]));
1966         assert_eq!(cls.case_fold(), class(&[
1967             ('=', '='), ('A', 'A'), ('a', 'a'),
1968         ]));
1969
1970         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1971         assert_eq!(cls.clone().canonicalize(), bclass(&[
1972             (b'=', b'='), (b'A', b'A'),
1973         ]));
1974         assert_eq!(cls.case_fold(), bclass(&[
1975             (b'=', b'='), (b'A', b'A'), (b'a', b'a'),
1976         ]));
1977     }
1978
1979     #[test]
1980     fn class_fold_no_folding_needed() {
1981         let cls = class(&[('\x00', '\x10')]);
1982         assert_eq!(cls.case_fold(), class(&[
1983             ('\x00', '\x10'),
1984         ]));
1985
1986         let cls = bclass(&[(b'\x00', b'\x10')]);
1987         assert_eq!(cls.case_fold(), bclass(&[
1988             (b'\x00', b'\x10'),
1989         ]));
1990     }
1991
1992     #[test]
1993     fn class_fold_negated() {
1994         let cls = class(&[('x', 'x')]);
1995         assert_eq!(cls.clone().case_fold(), class(&[
1996             ('X', 'X'), ('x', 'x'),
1997         ]));
1998         assert_eq!(cls.case_fold().negate(), class(&[
1999             ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'),
2000         ]));
2001
2002         let cls = bclass(&[(b'x', b'x')]);
2003         assert_eq!(cls.clone().case_fold(), bclass(&[
2004             (b'X', b'X'), (b'x', b'x'),
2005         ]));
2006         assert_eq!(cls.case_fold().negate(), bclass(&[
2007             (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'),
2008         ]));
2009     }
2010
2011     #[test]
2012     fn class_fold_single_to_multiple() {
2013         let cls = class(&[('k', 'k')]);
2014         assert_eq!(cls.case_fold(), class(&[
2015             ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2016         ]));
2017
2018         let cls = bclass(&[(b'k', b'k')]);
2019         assert_eq!(cls.case_fold(), bclass(&[
2020             (b'K', b'K'), (b'k', b'k'),
2021         ]));
2022     }
2023
2024     #[test]
2025     fn class_fold_at() {
2026         let cls = class(&[('@', '@')]);
2027         assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')]));
2028         assert_eq!(cls.case_fold(), class(&[('@', '@')]));
2029
2030         let cls = bclass(&[(b'@', b'@')]);
2031         assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')]));
2032         assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')]));
2033     }
2034
2035     #[test]
2036     fn roundtrip_class_hypen() {
2037         let expr = e("[-./]");
2038         assert_eq!("(?u:[-\\.-/])", expr.to_string());
2039
2040         let expr = e("(?-u)[-./]");
2041         assert_eq!("(?-u:[-\\.-/])", expr.to_string());
2042     }
2043 }