src/vendor/regex-syntax-0.5.5/src/hir/mod.rs

   1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 /*!
  12 Defines a high-level intermediate representation for regular expressions.
  13 */
  14 use std::char;
  15 use std::cmp;
  16 use std::error;
  17 use std::fmt;
  18 use std::u8;
  19
  20 use ast::Span;
  21 use hir::interval::{Interval, IntervalSet, IntervalSetIter};
  22 use unicode;
  23
  24 pub use hir::visitor::{Visitor, visit};
  25
  26 mod interval;
  27 pub mod literal;
  28 pub mod print;
  29 pub mod translate;
  30 mod visitor;
  31
  32 /// An error that can occur while translating an `Ast` to a `Hir`.
  33 #[derive(Clone, Debug, Eq, PartialEq)]
  34 pub struct Error {
  35     /// The kind of error.
  36     kind: ErrorKind,
  37     /// The original pattern that the translator's Ast was parsed from. Every
  38     /// span in an error is a valid range into this string.
  39     pattern: String,
  40     /// The span of this error, derived from the Ast given to the translator.
  41     span: Span,
  42 }
  43
  44 impl Error {
  45     /// Return the type of this error.
  46     pub fn kind(&self) -> &ErrorKind {
  47         &self.kind
  48     }
  49
  50     /// The original pattern string in which this error occurred.
  51     ///
  52     /// Every span reported by this error is reported in terms of this string.
  53     pub fn pattern(&self) -> &str {
  54         &self.pattern
  55     }
  56
  57     /// Return the span at which this error occurred.
  58     pub fn span(&self) -> &Span {
  59         &self.span
  60     }
  61 }
  62
  63 /// The type of an error that occurred while building an `Hir`.
  64 #[derive(Clone, Debug, Eq, PartialEq)]
  65 pub enum ErrorKind {
  66     /// This error occurs when a Unicode feature is used when Unicode
  67     /// support is disabled. For example `(?-u:\pL)` would trigger this error.
  68     UnicodeNotAllowed,
  69     /// This error occurs when translating a pattern that could match a byte
  70     /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
  71     InvalidUtf8,
  72     /// This occurs when an unrecognized Unicode property name could not
  73     /// be found.
  74     UnicodePropertyNotFound,
  75     /// This occurs when an unrecognized Unicode property value could not
  76     /// be found.
  77     UnicodePropertyValueNotFound,
  78     /// This occurs when the translator attempts to construct a character class
  79     /// that is empty.
  80     ///
  81     /// Note that this restriction in the translator may be removed in the
  82     /// future.
  83     EmptyClassNotAllowed,
  84     /// Hints that destructuring should not be exhaustive.
  85     ///
  86     /// This enum may grow additional variants, so this makes sure clients
  87     /// don't count on exhaustive matching. (Otherwise, adding a new variant
  88     /// could break existing code.)
  89     #[doc(hidden)]
  90     __Nonexhaustive,
  91 }
  92
  93 impl ErrorKind {
  94     fn description(&self) -> &str {
  95         use self::ErrorKind::*;
  96         match *self {
  97             UnicodeNotAllowed => "Unicode not allowed here",
  98             InvalidUtf8 => "pattern can match invalid UTF-8",
  99             UnicodePropertyNotFound => "Unicode property not found",
 100             UnicodePropertyValueNotFound => "Unicode property value not found",
 101             EmptyClassNotAllowed => "empty character classes are not allowed",
 102             _ => unreachable!(),
 103         }
 104     }
 105 }
 106
 107 impl error::Error for Error {
 108     fn description(&self) -> &str {
 109         self.kind.description()
 110     }
 111 }
 112
 113 impl fmt::Display for Error {
 114     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 115         ::error::Formatter::from(self).fmt(f)
 116     }
 117 }
 118
 119 impl fmt::Display for ErrorKind {
 120     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 121         f.write_str(self.description())
 122     }
 123 }
 124
 125 /// A high-level intermediate representation (HIR) for a regular expression.
 126 ///
 127 /// The HIR of a regular expression represents an intermediate step between its
 128 /// abstract syntax (a structured description of the concrete syntax) and
 129 /// compiled byte codes. The purpose of HIR is to make regular expressions
 130 /// easier to analyze. In particular, the AST is much more complex than the
 131 /// HIR. For example, while an AST supports arbitrarily nested character
 132 /// classes, the HIR will flatten all nested classes into a single set. The HIR
 133 /// will also "compile away" every flag present in the concrete syntax. For
 134 /// example, users of HIR expressions never need to worry about case folding;
 135 /// it is handled automatically by the translator (e.g., by translating `(?i)A`
 136 /// to `[aA]`).
 137 ///
 138 /// If the HIR was produced by a translator that disallows invalid UTF-8, then
 139 /// the HIR is guaranteed to match UTF-8 exclusively.
 140 ///
 141 /// This type defines its own destructor that uses constant stack space and
 142 /// heap space proportional to the size of the HIR.
 143 ///
 144 /// The specific type of an HIR expression can be accessed via its `kind`
 145 /// or `into_kind` methods. This extra level of indirection exists for two
 146 /// reasons:
 147 ///
 148 /// 1. Construction of an HIR expression *must* use the constructor methods
 149 ///    on this `Hir` type instead of building the `HirKind` values directly.
 150 ///    This permits construction to enforce invariants like "concatenations
 151 ///    always consist of two or more sub-expressions."
 152 /// 2. Every HIR expression contains attributes that are defined inductively,
 153 ///    and can be computed cheaply during the construction process. For
 154 ///    example, one such attribute is whether the expression must match at the
 155 ///    beginning of the text.
 156 ///
 157 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
 158 /// expression pattern string, and uses constant stack space and heap space
 159 /// proportional to the size of the `Hir`.
 160 #[derive(Clone, Debug, Eq, PartialEq)]
 161 pub struct Hir {
 162     /// The underlying HIR kind.
 163     kind: HirKind,
 164     /// Analysis info about this HIR, computed during construction.
 165     info: HirInfo,
 166 }
 167
 168 /// The kind of an arbitrary `Hir` expression.
 169 #[derive(Clone, Debug, Eq, PartialEq)]
 170 pub enum HirKind {
 171     /// The empty regular expression, which matches everything, including the
 172     /// empty string.
 173     Empty,
 174     /// A single literal character that matches exactly this character.
 175     Literal(Literal),
 176     /// A single character class that matches any of the characters in the
 177     /// class. A class can either consist of Unicode scalar values as
 178     /// characters, or it can use bytes.
 179     Class(Class),
 180     /// An anchor assertion. An anchor assertion match always has zero length.
 181     Anchor(Anchor),
 182     /// A word boundary assertion, which may or may not be Unicode aware. A
 183     /// word boundary assertion match always has zero length.
 184     WordBoundary(WordBoundary),
 185     /// A repetition operation applied to a child expression.
 186     Repetition(Repetition),
 187     /// A possibly capturing group, which contains a child expression.
 188     Group(Group),
 189     /// A concatenation of expressions. A concatenation always has at least two
 190     /// child expressions.
 191     ///
 192     /// A concatenation matches only if each of its child expression matches
 193     /// one after the other.
 194     Concat(Vec<Hir>),
 195     /// An alternation of expressions. An alternation always has at least two
 196     /// child expressions.
 197     ///
 198     /// An alternation matches only if at least one of its child expression
 199     /// matches. If multiple expressions match, then the leftmost is preferred.
 200     Alternation(Vec<Hir>),
 201 }
 202
 203 impl Hir {
 204     /// Returns a reference to the underlying HIR kind.
 205     pub fn kind(&self) -> &HirKind {
 206         &self.kind
 207     }
 208
 209     /// Consumes ownership of this HIR expression and returns its underlying
 210     /// `HirKind`.
 211     pub fn into_kind(mut self) -> HirKind {
 212         use std::mem;
 213         mem::replace(&mut self.kind, HirKind::Empty)
 214     }
 215
 216     /// Returns an empty HIR expression.
 217     ///
 218     /// An empty HIR expression always matches, including the empty string.
 219     pub fn empty() -> Hir {
 220         let mut info = HirInfo::new();
 221         info.set_always_utf8(true);
 222         info.set_all_assertions(true);
 223         info.set_anchored_start(false);
 224         info.set_anchored_end(false);
 225         info.set_any_anchored_start(false);
 226         info.set_any_anchored_end(false);
 227         info.set_match_empty(true);
 228         Hir {
 229             kind: HirKind::Empty,
 230             info: info,
 231         }
 232     }
 233
 234     /// Creates a literal HIR expression.
 235     ///
 236     /// If the given literal has a `Byte` variant with an ASCII byte, then this
 237     /// method panics. This enforces the invariant that `Byte` variants are
 238     /// only used to express matching of invalid UTF-8.
 239     pub fn literal(lit: Literal) -> Hir {
 240         if let Literal::Byte(b) = lit {
 241             assert!(b > 0x7F);
 242         }
 243
 244         let mut info = HirInfo::new();
 245         info.set_always_utf8(lit.is_unicode());
 246         info.set_all_assertions(false);
 247         info.set_anchored_start(false);
 248         info.set_anchored_end(false);
 249         info.set_any_anchored_start(false);
 250         info.set_any_anchored_end(false);
 251         info.set_match_empty(false);
 252         Hir {
 253             kind: HirKind::Literal(lit),
 254             info: info,
 255         }
 256     }
 257
 258     /// Creates a class HIR expression.
 259     pub fn class(class: Class) -> Hir {
 260         let mut info = HirInfo::new();
 261         info.set_always_utf8(class.is_always_utf8());
 262         info.set_all_assertions(false);
 263         info.set_anchored_start(false);
 264         info.set_anchored_end(false);
 265         info.set_any_anchored_start(false);
 266         info.set_any_anchored_end(false);
 267         info.set_match_empty(false);
 268         Hir {
 269             kind: HirKind::Class(class),
 270             info: info,
 271         }
 272     }
 273
 274     /// Creates an anchor assertion HIR expression.
 275     pub fn anchor(anchor: Anchor) -> Hir {
 276         let mut info = HirInfo::new();
 277         info.set_always_utf8(true);
 278         info.set_all_assertions(true);
 279         info.set_anchored_start(false);
 280         info.set_anchored_end(false);
 281         info.set_any_anchored_start(false);
 282         info.set_any_anchored_end(false);
 283         info.set_match_empty(true);
 284         if let Anchor::StartText = anchor {
 285             info.set_anchored_start(true);
 286             info.set_any_anchored_start(true);
 287         }
 288         if let Anchor::EndText = anchor {
 289             info.set_anchored_end(true);
 290             info.set_any_anchored_end(true);
 291         }
 292         Hir {
 293             kind: HirKind::Anchor(anchor),
 294             info: info,
 295         }
 296     }
 297
 298     /// Creates a word boundary assertion HIR expression.
 299     pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
 300         let mut info = HirInfo::new();
 301         info.set_always_utf8(true);
 302         info.set_all_assertions(true);
 303         info.set_anchored_start(false);
 304         info.set_anchored_end(false);
 305         info.set_any_anchored_start(false);
 306         info.set_any_anchored_end(false);
 307         // A negated word boundary matches the empty string, but a normal
 308         // word boundary does not!
 309         info.set_match_empty(word_boundary.is_negated());
 310         // Negated ASCII word boundaries can match invalid UTF-8.
 311         if let WordBoundary::AsciiNegate = word_boundary {
 312             info.set_always_utf8(false);
 313         }
 314         Hir {
 315             kind: HirKind::WordBoundary(word_boundary),
 316             info: info,
 317         }
 318     }
 319
 320     /// Creates a repetition HIR expression.
 321     pub fn repetition(rep: Repetition) -> Hir {
 322         let mut info = HirInfo::new();
 323         info.set_always_utf8(rep.hir.is_always_utf8());
 324         info.set_all_assertions(rep.hir.is_all_assertions());
 325         // If this operator can match the empty string, then it can never
 326         // be anchored.
 327         info.set_anchored_start(
 328             !rep.is_match_empty() && rep.hir.is_anchored_start()
 329         );
 330         info.set_anchored_end(
 331             !rep.is_match_empty() && rep.hir.is_anchored_end()
 332         );
 333         info.set_any_anchored_start(rep.hir.is_any_anchored_start());
 334         info.set_any_anchored_end(rep.hir.is_any_anchored_end());
 335         info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
 336         Hir {
 337             kind: HirKind::Repetition(rep),
 338             info: info,
 339         }
 340     }
 341
 342     /// Creates a group HIR expression.
 343     pub fn group(group: Group) -> Hir {
 344         let mut info = HirInfo::new();
 345         info.set_always_utf8(group.hir.is_always_utf8());
 346         info.set_all_assertions(group.hir.is_all_assertions());
 347         info.set_anchored_start(group.hir.is_anchored_start());
 348         info.set_anchored_end(group.hir.is_anchored_end());
 349         info.set_any_anchored_start(group.hir.is_any_anchored_start());
 350         info.set_any_anchored_end(group.hir.is_any_anchored_end());
 351         info.set_match_empty(group.hir.is_match_empty());
 352         Hir {
 353             kind: HirKind::Group(group),
 354             info: info,
 355         }
 356     }
 357
 358     /// Returns the concatenation of the given expressions.
 359     ///
 360     /// This flattens the concatenation as appropriate.
 361     pub fn concat(mut exprs: Vec<Hir>) -> Hir {
 362         match exprs.len() {
 363             0 => Hir::empty(),
 364             1 => exprs.pop().unwrap(),
 365             _ => {
 366                 let mut info = HirInfo::new();
 367                 info.set_always_utf8(true);
 368                 info.set_all_assertions(true);
 369                 info.set_any_anchored_start(false);
 370                 info.set_any_anchored_end(false);
 371                 info.set_match_empty(true);
 372
 373                 // Some attributes require analyzing all sub-expressions.
 374                 for e in &exprs {
 375                     let x = info.is_always_utf8() && e.is_always_utf8();
 376                     info.set_always_utf8(x);
 377
 378                     let x = info.is_all_assertions() && e.is_all_assertions();
 379                     info.set_all_assertions(x);
 380
 381                     let x =
 382                         info.is_any_anchored_start()
 383                         || e.is_any_anchored_start();
 384                     info.set_any_anchored_start(x);
 385
 386                     let x =
 387                         info.is_any_anchored_end()
 388                         || e.is_any_anchored_end();
 389                     info.set_any_anchored_end(x);
 390
 391                     let x = info.is_match_empty() && e.is_match_empty();
 392                     info.set_match_empty(x);
 393                 }
 394                 // Anchored attributes require something slightly more
 395                 // sophisticated. Normally, WLOG, to determine whether an
 396                 // expression is anchored to the start, we'd only need to check
 397                 // the first expression of a concatenation. However,
 398                 // expressions like `$\b^` are still anchored to the start,
 399                 // but the first expression in the concatenation *isn't*
 400                 // anchored to the start. So the "first" expression to look at
 401                 // is actually one that is either not an assertion or is
 402                 // specifically the StartText assertion.
 403                 info.set_anchored_start(
 404                     exprs.iter()
 405                         .take_while(|e| {
 406                             e.is_anchored_start() || e.is_all_assertions()
 407                         })
 408                         .any(|e| {
 409                             e.is_anchored_start()
 410                         }));
 411                 // Similarly for the end anchor, but in reverse.
 412                 info.set_anchored_end(
 413                     exprs.iter()
 414                         .rev()
 415                         .take_while(|e| {
 416                             e.is_anchored_end() || e.is_all_assertions()
 417                         })
 418                         .any(|e| {
 419                             e.is_anchored_end()
 420                         }));
 421                 Hir {
 422                     kind: HirKind::Concat(exprs),
 423                     info: info,
 424                 }
 425             }
 426         }
 427     }
 428
 429     /// Returns the alternation of the given expressions.
 430     ///
 431     /// This flattens the alternation as appropriate.
 432     pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
 433         match exprs.len() {
 434             0 => Hir::empty(),
 435             1 => exprs.pop().unwrap(),
 436             _ => {
 437                 let mut info = HirInfo::new();
 438                 info.set_always_utf8(true);
 439                 info.set_all_assertions(true);
 440                 info.set_anchored_start(true);
 441                 info.set_anchored_end(true);
 442                 info.set_any_anchored_start(false);
 443                 info.set_any_anchored_end(false);
 444                 info.set_match_empty(false);
 445
 446                 // Some attributes require analyzing all sub-expressions.
 447                 for e in &exprs {
 448                     let x = info.is_always_utf8() && e.is_always_utf8();
 449                     info.set_always_utf8(x);
 450
 451                     let x = info.is_all_assertions() && e.is_all_assertions();
 452                     info.set_all_assertions(x);
 453
 454                     let x = info.is_anchored_start() && e.is_anchored_start();
 455                     info.set_anchored_start(x);
 456
 457                     let x = info.is_anchored_end() && e.is_anchored_end();
 458                     info.set_anchored_end(x);
 459
 460                     let x =
 461                         info.is_any_anchored_start()
 462                         || e.is_any_anchored_start();
 463                     info.set_any_anchored_start(x);
 464
 465                     let x =
 466                         info.is_any_anchored_end()
 467                         || e.is_any_anchored_end();
 468                     info.set_any_anchored_end(x);
 469
 470                     let x = info.is_match_empty() || e.is_match_empty();
 471                     info.set_match_empty(x);
 472                 }
 473                 Hir {
 474                     kind: HirKind::Alternation(exprs),
 475                     info: info,
 476                 }
 477             }
 478         }
 479     }
 480
 481     /// Build an HIR expression for `.`.
 482     ///
 483     /// A `.` expression matches any character except for `\n`. To build an
 484     /// expression that matches any character, including `\n`, use the `any`
 485     /// method.
 486     ///
 487     /// If `bytes` is `true`, then this assumes characters are limited to a
 488     /// single byte.
 489     pub fn dot(bytes: bool) -> Hir {
 490         if bytes {
 491             let mut cls = ClassBytes::empty();
 492             cls.push(ClassBytesRange::new(b'\0', b'\x09'));
 493             cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
 494             Hir::class(Class::Bytes(cls))
 495         } else {
 496             let mut cls = ClassUnicode::empty();
 497             cls.push(ClassUnicodeRange::new('\0', '\x09'));
 498             cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
 499             Hir::class(Class::Unicode(cls))
 500         }
 501     }
 502
 503     /// Build an HIR expression for `(?s).`.
 504     ///
 505     /// A `(?s).` expression matches any character, including `\n`. To build an
 506     /// expression that matches any character except for `\n`, then use the
 507     /// `dot` method.
 508     ///
 509     /// If `bytes` is `true`, then this assumes characters are limited to a
 510     /// single byte.
 511     pub fn any(bytes: bool) -> Hir {
 512         if bytes {
 513             let mut cls = ClassBytes::empty();
 514             cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
 515             Hir::class(Class::Bytes(cls))
 516         } else {
 517             let mut cls = ClassUnicode::empty();
 518             cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
 519             Hir::class(Class::Unicode(cls))
 520         }
 521     }
 522
 523     /// Return true if and only if this HIR will always match valid UTF-8.
 524     ///
 525     /// When this returns false, then it is possible for this HIR expression
 526     /// to match invalid UTF-8.
 527     pub fn is_always_utf8(&self) -> bool {
 528         self.info.is_always_utf8()
 529     }
 530
 531     /// Returns true if and only if this entire HIR expression is made up of
 532     /// zero-width assertions.
 533     ///
 534     /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
 535     /// not `^a`.
 536     pub fn is_all_assertions(&self) -> bool {
 537         self.info.is_all_assertions()
 538     }
 539
 540     /// Return true if and only if this HIR is required to match from the
 541     /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
 542     /// `^foo|^bar` but not `^foo|bar`.
 543     pub fn is_anchored_start(&self) -> bool {
 544         self.info.is_anchored_start()
 545     }
 546
 547     /// Return true if and only if this HIR is required to match at the end
 548     /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
 549     /// `foo$|bar$` but not `foo$|bar`.
 550     pub fn is_anchored_end(&self) -> bool {
 551         self.info.is_anchored_end()
 552     }
 553
 554     /// Return true if and only if this HIR contains any sub-expression that
 555     /// is required to match at the beginning of text. Specifically, this
 556     /// returns true if the `^` symbol (when multiline mode is disabled) or the
 557     /// `\A` escape appear anywhere in the regex.
 558     pub fn is_any_anchored_start(&self) -> bool {
 559         self.info.is_any_anchored_start()
 560     }
 561
 562     /// Return true if and only if this HIR contains any sub-expression that is
 563     /// required to match at the end of text. Specifically, this returns true
 564     /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
 565     /// appear anywhere in the regex.
 566     pub fn is_any_anchored_end(&self) -> bool {
 567         self.info.is_any_anchored_end()
 568     }
 569
 570     /// Return true if and only if the empty string is part of the language
 571     /// matched by this regular expression.
 572     ///
 573     /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
 574     /// but not `a`, `a+` or `\b`.
 575     pub fn is_match_empty(&self) -> bool {
 576         self.info.is_match_empty()
 577     }
 578 }
 579
 580 impl HirKind {
 581     /// Return true if and only if this HIR is the empty regular expression.
 582     ///
 583     /// Note that this is not defined inductively. That is, it only tests if
 584     /// this kind is the `Empty` variant. To get the inductive definition,
 585     /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
 586     pub fn is_empty(&self) -> bool {
 587         match *self {
 588             HirKind::Empty => true,
 589             _ => false,
 590         }
 591     }
 592
 593     /// Returns true if and only if this kind has any (including possibly
 594     /// empty) subexpressions.
 595     pub fn has_subexprs(&self) -> bool {
 596         match *self {
 597             HirKind::Empty
 598             | HirKind::Literal(_)
 599             | HirKind::Class(_)
 600             | HirKind::Anchor(_)
 601             | HirKind::WordBoundary(_) => false,
 602             HirKind::Group(_)
 603             | HirKind::Repetition(_)
 604             | HirKind::Concat(_)
 605             | HirKind::Alternation(_) => true,
 606         }
 607     }
 608 }
 609
 610 /// Print a display representation of this Hir.
 611 ///
 612 /// The result of this is a valid regular expression pattern string.
 613 ///
 614 /// This implementation uses constant stack space and heap space proportional
 615 /// to the size of the `Hir`.
 616 impl fmt::Display for Hir {
 617     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 618         use hir::print::Printer;
 619         Printer::new().print(self, f)
 620     }
 621 }
 622
 623 /// The high-level intermediate representation of a literal.
 624 ///
 625 /// A literal corresponds to a single character, where a character is either
 626 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
 627 /// are preferred whenever possible. In particular, a `Byte` variant is only
 628 /// ever produced when it could match invalid UTF-8.
 629 #[derive(Clone, Debug, Eq, PartialEq)]
 630 pub enum Literal {
 631     /// A single character represented by a Unicode scalar value.
 632     Unicode(char),
 633     /// A single character represented by an arbitrary byte.
 634     Byte(u8),
 635 }
 636
 637 impl Literal {
 638     /// Returns true if and only if this literal corresponds to a Unicode
 639     /// scalar value.
 640     pub fn is_unicode(&self) -> bool {
 641         match *self {
 642             Literal::Unicode(_) => true,
 643             Literal::Byte(b) if b <= 0x7F => true,
 644             Literal::Byte(_) => false,
 645         }
 646     }
 647 }
 648
 649 /// The high-level intermediate representation of a character class.
 650 ///
 651 /// A character class corresponds to a set of characters. A character is either
 652 /// defined by a Unicode scalar value or a byte. Unicode characters are used
 653 /// by default, while bytes are used when Unicode mode (via the `u` flag) is
 654 /// disabled.
 655 ///
 656 /// A character class, regardless of its character type, is represented by a
 657 /// sequence of non-overlapping non-adjacent ranges of characters.
 658 ///
 659 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
 660 /// be produced even when it exclusively matches valid UTF-8. This is because
 661 /// a `Bytes` variant represents an intention by the author of the regular
 662 /// expression to disable Unicode mode, which in turn impacts the semantics of
 663 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
 664 /// match the same set of strings.
 665 #[derive(Clone, Debug, Eq, PartialEq)]
 666 pub enum Class {
 667     /// A set of characters represented by Unicode scalar values.
 668     Unicode(ClassUnicode),
 669     /// A set of characters represented by arbitrary bytes (one byte per
 670     /// character).
 671     Bytes(ClassBytes),
 672 }
 673
 674 impl Class {
 675     /// Apply Unicode simple case folding to this character class, in place.
 676     /// The character class will be expanded to include all simple case folded
 677     /// character variants.
 678     ///
 679     /// If this is a byte oriented character class, then this will be limited
 680     /// to the ASCII ranges `A-Z` and `a-z`.
 681     pub fn case_fold_simple(&mut self) {
 682         match *self {
 683             Class::Unicode(ref mut x) => x.case_fold_simple(),
 684             Class::Bytes(ref mut x) => x.case_fold_simple(),
 685         }
 686     }
 687
 688     /// Negate this character class in place.
 689     ///
 690     /// After completion, this character class will contain precisely the
 691     /// characters that weren't previously in the class.
 692     pub fn negate(&mut self) {
 693         match *self {
 694             Class::Unicode(ref mut x) => x.negate(),
 695             Class::Bytes(ref mut x) => x.negate(),
 696         }
 697     }
 698
 699     /// Returns true if and only if this character class will only ever match
 700     /// valid UTF-8.
 701     ///
 702     /// A character class can match invalid UTF-8 only when the following
 703     /// conditions are met:
 704     ///
 705     /// 1. The translator was configured to permit generating an expression
 706     ///    that can match invalid UTF-8. (By default, this is disabled.)
 707     /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
 708     ///    syntax or in the parser builder. By default, Unicode mode is
 709     ///    enabled.
 710     pub fn is_always_utf8(&self) -> bool {
 711         match *self {
 712             Class::Unicode(_) => true,
 713             Class::Bytes(ref x) => x.is_all_ascii(),
 714         }
 715     }
 716 }
 717
 718 /// A set of characters represented by Unicode scalar values.
 719 #[derive(Clone, Debug, Eq, PartialEq)]
 720 pub struct ClassUnicode {
 721     set: IntervalSet<ClassUnicodeRange>,
 722 }
 723
 724 impl ClassUnicode {
 725     /// Create a new class from a sequence of ranges.
 726     ///
 727     /// The given ranges do not need to be in any specific order, and ranges
 728     /// may overlap.
 729     pub fn new<I>(ranges: I) -> ClassUnicode
 730     where I: IntoIterator<Item=ClassUnicodeRange>
 731     {
 732         ClassUnicode { set: IntervalSet::new(ranges) }
 733     }
 734
 735     /// Create a new class with no ranges.
 736     pub fn empty() -> ClassUnicode {
 737         ClassUnicode::new(vec![])
 738     }
 739
 740     /// Add a new range to this set.
 741     pub fn push(&mut self, range: ClassUnicodeRange) {
 742         self.set.push(range);
 743     }
 744
 745     /// Return an iterator over all ranges in this class.
 746     ///
 747     /// The iterator yields ranges in ascending order.
 748     pub fn iter(&self) -> ClassUnicodeIter {
 749         ClassUnicodeIter(self.set.iter())
 750     }
 751
 752     /// Return the underlying ranges as a slice.
 753     pub fn ranges(&self) -> &[ClassUnicodeRange] {
 754         self.set.intervals()
 755     }
 756
 757     /// Expand this character class such that it contains all case folded
 758     /// characters, according to Unicode's "simple" mapping. For example, if
 759     /// this class consists of the range `a-z`, then applying case folding will
 760     /// result in the class containing both the ranges `a-z` and `A-Z`.
 761     pub fn case_fold_simple(&mut self) {
 762         self.set.case_fold_simple();
 763     }
 764
 765     /// Negate this character class.
 766     ///
 767     /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
 768     /// set, then it will not be in this set after negation.
 769     pub fn negate(&mut self) {
 770         self.set.negate();
 771     }
 772
 773     /// Union this character class with the given character class, in place.
 774     pub fn union(&mut self, other: &ClassUnicode) {
 775         self.set.union(&other.set);
 776     }
 777
 778     /// Intersect this character class with the given character class, in
 779     /// place.
 780     pub fn intersect(&mut self, other: &ClassUnicode) {
 781         self.set.intersect(&other.set);
 782     }
 783
 784     /// Subtract the given character class from this character class, in place.
 785     pub fn difference(&mut self, other: &ClassUnicode) {
 786         self.set.difference(&other.set);
 787     }
 788
 789     /// Compute the symmetric difference of the given character classes, in
 790     /// place.
 791     ///
 792     /// This computes the symmetric difference of two character classes. This
 793     /// removes all elements in this class that are also in the given class,
 794     /// but all adds all elements from the given class that aren't in this
 795     /// class. That is, the class will contain all elements in either class,
 796     /// but will not contain any elements that are in both classes.
 797     pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
 798         self.set.symmetric_difference(&other.set);
 799     }
 800 }
 801
 802 /// An iterator over all ranges in a Unicode character class.
 803 ///
 804 /// The lifetime `'a` refers to the lifetime of the underlying class.
 805 #[derive(Debug)]
 806 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
 807
 808 impl<'a> Iterator for ClassUnicodeIter<'a> {
 809     type Item = &'a ClassUnicodeRange;
 810
 811     fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
 812         self.0.next()
 813     }
 814 }
 815
 816 /// A single range of characters represented by Unicode scalar values.
 817 ///
 818 /// The range is closed. That is, the start and end of the range are included
 819 /// in the range.
 820 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
 821 pub struct ClassUnicodeRange {
 822     start: char,
 823     end: char,
 824 }
 825
 826 impl fmt::Debug for ClassUnicodeRange {
 827     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 828         let start =
 829             if !self.start.is_whitespace() && !self.start.is_control() {
 830                 self.start.to_string()
 831             } else {
 832                 format!("0x{:X}", self.start as u32)
 833             };
 834         let end =
 835             if !self.end.is_whitespace() && !self.end.is_control() {
 836                 self.end.to_string()
 837             } else {
 838                 format!("0x{:X}", self.end as u32)
 839             };
 840         f.debug_struct("ClassUnicodeRange")
 841          .field("start", &start)
 842          .field("end", &end)
 843          .finish()
 844     }
 845 }
 846
 847 impl Interval for ClassUnicodeRange {
 848     type Bound = char;
 849
 850     #[inline] fn lower(&self) -> char { self.start }
 851     #[inline] fn upper(&self) -> char { self.end }
 852     #[inline] fn set_lower(&mut self, bound: char) { self.start = bound; }
 853     #[inline] fn set_upper(&mut self, bound: char) { self.end = bound; }
 854
 855     /// Apply simple case folding to this Unicode scalar value range.
 856     ///
 857     /// Additional ranges are appended to the given vector. Canonical ordering
 858     /// is *not* maintained in the given vector.
 859     fn case_fold_simple(&self, ranges: &mut Vec<ClassUnicodeRange>) {
 860         if !unicode::contains_simple_case_mapping(self.start, self.end) {
 861             return;
 862         }
 863         let start = self.start as u32;
 864         let end = (self.end as u32).saturating_add(1);
 865         let mut next_simple_cp = None;
 866         for cp in (start..end).filter_map(char::from_u32) {
 867             if next_simple_cp.map_or(false, |next| cp < next) {
 868                 continue;
 869             }
 870             let it = match unicode::simple_fold(cp) {
 871                 Ok(it) => it,
 872                 Err(next) => {
 873                     next_simple_cp = next;
 874                     continue;
 875                 }
 876             };
 877             for cp_folded in it {
 878                 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
 879             }
 880         }
 881     }
 882 }
 883
 884 impl ClassUnicodeRange {
 885     /// Create a new Unicode scalar value range for a character class.
 886     ///
 887     /// The returned range is always in a canonical form. That is, the range
 888     /// returned always satisfies the invariant that `start <= end`.
 889     pub fn new(start: char, end: char) -> ClassUnicodeRange {
 890         ClassUnicodeRange::create(start, end)
 891     }
 892
 893     /// Return the start of this range.
 894     ///
 895     /// The start of a range is always less than or equal to the end of the
 896     /// range.
 897     pub fn start(&self) -> char {
 898         self.start
 899     }
 900
 901     /// Return the end of this range.
 902     ///
 903     /// The end of a range is always greater than or equal to the start of the
 904     /// range.
 905     pub fn end(&self) -> char {
 906         self.end
 907     }
 908 }
 909
 910 /// A set of characters represented by arbitrary bytes (where one byte
 911 /// corresponds to one character).
 912 #[derive(Clone, Debug, Eq, PartialEq)]
 913 pub struct ClassBytes {
 914     set: IntervalSet<ClassBytesRange>,
 915 }
 916
 917 impl ClassBytes {
 918     /// Create a new class from a sequence of ranges.
 919     ///
 920     /// The given ranges do not need to be in any specific order, and ranges
 921     /// may overlap.
 922     pub fn new<I>(ranges: I) -> ClassBytes
 923     where I: IntoIterator<Item=ClassBytesRange>
 924     {
 925         ClassBytes { set: IntervalSet::new(ranges) }
 926     }
 927
 928     /// Create a new class with no ranges.
 929     pub fn empty() -> ClassBytes {
 930         ClassBytes::new(vec![])
 931     }
 932
 933     /// Add a new range to this set.
 934     pub fn push(&mut self, range: ClassBytesRange) {
 935         self.set.push(range);
 936     }
 937
 938     /// Return an iterator over all ranges in this class.
 939     ///
 940     /// The iterator yields ranges in ascending order.
 941     pub fn iter(&self) -> ClassBytesIter {
 942         ClassBytesIter(self.set.iter())
 943     }
 944
 945     /// Return the underlying ranges as a slice.
 946     pub fn ranges(&self) -> &[ClassBytesRange] {
 947         self.set.intervals()
 948     }
 949
 950     /// Expand this character class such that it contains all case folded
 951     /// characters. For example, if this class consists of the range `a-z`,
 952     /// then applying case folding will result in the class containing both the
 953     /// ranges `a-z` and `A-Z`.
 954     ///
 955     /// Note that this only applies ASCII case folding, which is limited to the
 956     /// characters `a-z` and `A-Z`.
 957     pub fn case_fold_simple(&mut self) {
 958         self.set.case_fold_simple();
 959     }
 960
 961     /// Negate this byte class.
 962     ///
 963     /// For all `b` where `b` is a any byte, if `b` was in this set, then it
 964     /// will not be in this set after negation.
 965     pub fn negate(&mut self) {
 966         self.set.negate();
 967     }
 968
 969     /// Union this byte class with the given byte class, in place.
 970     pub fn union(&mut self, other: &ClassBytes) {
 971         self.set.union(&other.set);
 972     }
 973
 974     /// Intersect this byte class with the given byte class, in place.
 975     pub fn intersect(&mut self, other: &ClassBytes) {
 976         self.set.intersect(&other.set);
 977     }
 978
 979     /// Subtract the given byte class from this byte class, in place.
 980     pub fn difference(&mut self, other: &ClassBytes) {
 981         self.set.difference(&other.set);
 982     }
 983
 984     /// Compute the symmetric difference of the given byte classes, in place.
 985     ///
 986     /// This computes the symmetric difference of two byte classes. This
 987     /// removes all elements in this class that are also in the given class,
 988     /// but all adds all elements from the given class that aren't in this
 989     /// class. That is, the class will contain all elements in either class,
 990     /// but will not contain any elements that are in both classes.
 991     pub fn symmetric_difference(&mut self, other: &ClassBytes) {
 992         self.set.symmetric_difference(&other.set);
 993     }
 994
 995     /// Returns true if and only if this character class will either match
 996     /// nothing or only ASCII bytes. Stated differently, this returns false
 997     /// if and only if this class contains a non-ASCII byte.
 998     pub fn is_all_ascii(&self) -> bool {
 999         self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1000     }
1001 }
1002
1003 /// An iterator over all ranges in a byte character class.
1004 ///
1005 /// The lifetime `'a` refers to the lifetime of the underlying class.
1006 #[derive(Debug)]
1007 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1008
1009 impl<'a> Iterator for ClassBytesIter<'a> {
1010     type Item = &'a ClassBytesRange;
1011
1012     fn next(&mut self) -> Option<&'a ClassBytesRange> {
1013         self.0.next()
1014     }
1015 }
1016
1017 /// A single range of characters represented by arbitrary bytes.
1018 ///
1019 /// The range is closed. That is, the start and end of the range are included
1020 /// in the range.
1021 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1022 pub struct ClassBytesRange {
1023     start: u8,
1024     end: u8,
1025 }
1026
1027 impl Interval for ClassBytesRange {
1028     type Bound = u8;
1029
1030     #[inline] fn lower(&self) -> u8 { self.start }
1031     #[inline] fn upper(&self) -> u8 { self.end }
1032     #[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; }
1033     #[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; }
1034
1035     /// Apply simple case folding to this byte range. Only ASCII case mappings
1036     /// (for a-z) are applied.
1037     ///
1038     /// Additional ranges are appended to the given vector. Canonical ordering
1039     /// is *not* maintained in the given vector.
1040     fn case_fold_simple(&self, ranges: &mut Vec<ClassBytesRange>) {
1041         if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1042             let lower = cmp::max(self.start, b'a');
1043             let upper = cmp::min(self.end, b'z');
1044             ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1045         }
1046         if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1047             let lower = cmp::max(self.start, b'A');
1048             let upper = cmp::min(self.end, b'Z');
1049             ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1050         }
1051     }
1052 }
1053
1054 impl ClassBytesRange {
1055     /// Create a new byte range for a character class.
1056     ///
1057     /// The returned range is always in a canonical form. That is, the range
1058     /// returned always satisfies the invariant that `start <= end`.
1059     pub fn new(start: u8, end: u8) -> ClassBytesRange {
1060         ClassBytesRange::create(start, end)
1061     }
1062
1063     /// Return the start of this range.
1064     ///
1065     /// The start of a range is always less than or equal to the end of the
1066     /// range.
1067     pub fn start(&self) -> u8 {
1068         self.start
1069     }
1070
1071     /// Return the end of this range.
1072     ///
1073     /// The end of a range is always greater than or equal to the start of the
1074     /// range.
1075     pub fn end(&self) -> u8 {
1076         self.end
1077     }
1078 }
1079
1080 impl fmt::Debug for ClassBytesRange {
1081     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1082         let mut debug = f.debug_struct("ClassBytesRange");
1083         if self.start <= 0x7F {
1084             debug.field("start", &(self.start as char));
1085         } else {
1086             debug.field("start", &self.start);
1087         }
1088         if self.end <= 0x7F {
1089             debug.field("end", &(self.end as char));
1090         } else {
1091             debug.field("end", &self.end);
1092         }
1093         debug.finish()
1094     }
1095 }
1096
1097 /// The high-level intermediate representation for an anchor assertion.
1098 ///
1099 /// A matching anchor assertion is always zero-length.
1100 #[derive(Clone, Debug, Eq, PartialEq)]
1101 pub enum Anchor {
1102     /// Match the beginning of a line or the beginning of text. Specifically,
1103     /// this matches at the starting position of the input, or at the position
1104     /// immediately following a `\n` character.
1105     StartLine,
1106     /// Match the end of a line or the end of text. Specifically,
1107     /// this matches at the end position of the input, or at the position
1108     /// immediately preceding a `\n` character.
1109     EndLine,
1110     /// Match the beginning of text. Specifically, this matches at the starting
1111     /// position of the input.
1112     StartText,
1113     /// Match the end of text. Specifically, this matches at the ending
1114     /// position of the input.
1115     EndText,
1116 }
1117
1118 /// The high-level intermediate representation for a word-boundary assertion.
1119 ///
1120 /// A matching word boundary assertion is always zero-length.
1121 #[derive(Clone, Debug, Eq, PartialEq)]
1122 pub enum WordBoundary {
1123     /// Match a Unicode-aware word boundary. That is, this matches a position
1124     /// where the left adjacent character and right adjacent character
1125     /// correspond to a word and non-word or a non-word and word character.
1126     Unicode,
1127     /// Match a Unicode-aware negation of a word boundary.
1128     UnicodeNegate,
1129     /// Match an ASCII-only word boundary. That is, this matches a position
1130     /// where the left adjacent character and right adjacent character
1131     /// correspond to a word and non-word or a non-word and word character.
1132     Ascii,
1133     /// Match an ASCII-only negation of a word boundary.
1134     AsciiNegate,
1135 }
1136
1137 impl WordBoundary {
1138     /// Returns true if and only if this word boundary assertion is negated.
1139     pub fn is_negated(&self) -> bool {
1140         match *self {
1141             WordBoundary::Unicode |  WordBoundary::Ascii => false,
1142             WordBoundary::UnicodeNegate |  WordBoundary::AsciiNegate => true,
1143         }
1144     }
1145 }
1146
1147 /// The high-level intermediate representation for a group.
1148 ///
1149 /// This represents one of three possible group types:
1150 ///
1151 /// 1. A non-capturing group (e.g., `(?:expr)`).
1152 /// 2. A capturing group (e.g., `(expr)`).
1153 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
1154 #[derive(Clone, Debug, Eq, PartialEq)]
1155 pub struct Group {
1156     /// The kind of this group. If it is a capturing group, then the kind
1157     /// contains the capture group index (and the name, if it is a named
1158     /// group).
1159     pub kind: GroupKind,
1160     /// The expression inside the capturing group, which may be empty.
1161     pub hir: Box<Hir>,
1162 }
1163
1164 /// The kind of group.
1165 #[derive(Clone, Debug, Eq, PartialEq)]
1166 pub enum GroupKind {
1167     /// A normal unnamed capturing group.
1168     ///
1169     /// The value is the capture index of the group.
1170     CaptureIndex(u32),
1171     /// A named capturing group.
1172     CaptureName {
1173         /// The name of the group.
1174         name: String,
1175         /// The capture index of the group.
1176         index: u32,
1177     },
1178     /// A non-capturing group.
1179     NonCapturing,
1180 }
1181
1182 /// The high-level intermediate representation of a repetition operator.
1183 ///
1184 /// A repetition operator permits the repetition of an arbitrary
1185 /// sub-expression.
1186 #[derive(Clone, Debug, Eq, PartialEq)]
1187 pub struct Repetition {
1188     /// The kind of this repetition operator.
1189     pub kind: RepetitionKind,
1190     /// Whether this repetition operator is greedy or not. A greedy operator
1191     /// will match as much as it can. A non-greedy operator will match as
1192     /// little as it can.
1193     ///
1194     /// Typically, operators are greedy by default and are only non-greedy when
1195     /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1196     /// not. However, this can be inverted via the `U` "ungreedy" flag.
1197     pub greedy: bool,
1198     /// The expression being repeated.
1199     pub hir: Box<Hir>,
1200 }
1201
1202 impl Repetition {
1203     /// Returns true if and only if this repetition operator makes it possible
1204     /// to match the empty string.
1205     ///
1206     /// Note that this is not defined inductively. For example, while `a*`
1207     /// will report `true`, `()+` will not, even though `()` matches the empty
1208     /// string and one or more occurrences of something that matches the empty
1209     /// string will always match the empty string. In order to get the
1210     /// inductive definition, see the corresponding method on
1211     /// [`Hir`](struct.Hir.html).
1212     pub fn is_match_empty(&self) -> bool {
1213         match self.kind {
1214             RepetitionKind::ZeroOrOne => true,
1215             RepetitionKind::ZeroOrMore => true,
1216             RepetitionKind::OneOrMore => false,
1217             RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1218             RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1219             RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1220         }
1221     }
1222 }
1223
1224 /// The kind of a repetition operator.
1225 #[derive(Clone, Debug, Eq, PartialEq)]
1226 pub enum RepetitionKind {
1227     /// Matches a sub-expression zero or one times.
1228     ZeroOrOne,
1229     /// Matches a sub-expression zero or more times.
1230     ZeroOrMore,
1231     /// Matches a sub-expression one or more times.
1232     OneOrMore,
1233     /// Matches a sub-expression within a bounded range of times.
1234     Range(RepetitionRange),
1235 }
1236
1237 /// The kind of a counted repetition operator.
1238 #[derive(Clone, Debug, Eq, PartialEq)]
1239 pub enum RepetitionRange {
1240     /// Matches a sub-expression exactly this many times.
1241     Exactly(u32),
1242     /// Matches a sub-expression at least this many times.
1243     AtLeast(u32),
1244     /// Matches a sub-expression at least `m` times and at most `n` times.
1245     Bounded(u32, u32),
1246 }
1247
1248 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1249 /// space but heap space proportional to the depth of the total `Hir`.
1250 impl Drop for Hir {
1251     fn drop(&mut self) {
1252         use std::mem;
1253
1254         match *self.kind() {
1255             HirKind::Empty
1256             | HirKind::Literal(_)
1257             | HirKind::Class(_)
1258             | HirKind::Anchor(_)
1259             | HirKind::WordBoundary(_) => return,
1260             HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1261             HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1262             HirKind::Concat(ref x) if x.is_empty() => return,
1263             HirKind::Alternation(ref x) if x.is_empty() => return,
1264             _ => {}
1265         }
1266
1267         let mut stack = vec![mem::replace(self, Hir::empty())];
1268         while let Some(mut expr) = stack.pop() {
1269             match expr.kind {
1270                 HirKind::Empty
1271                 | HirKind::Literal(_)
1272                 | HirKind::Class(_)
1273                 | HirKind::Anchor(_)
1274                 | HirKind::WordBoundary(_) => {}
1275                 HirKind::Group(ref mut x) => {
1276                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1277                 }
1278                 HirKind::Repetition(ref mut x) => {
1279                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1280                 }
1281                 HirKind::Concat(ref mut x) => {
1282                     stack.extend(x.drain(..));
1283                 }
1284                 HirKind::Alternation(ref mut x) => {
1285                     stack.extend(x.drain(..));
1286                 }
1287             }
1288         }
1289     }
1290 }
1291
1292 /// A type that documents various attributes of an HIR expression.
1293 ///
1294 /// These attributes are typically defined inductively on the HIR.
1295 #[derive(Clone, Debug, Eq, PartialEq)]
1296 struct HirInfo {
1297     /// Represent yes/no questions by a bitfield to conserve space, since
1298     /// this is included in every HIR expression.
1299     ///
1300     /// If more attributes need to be added, it is OK to increase the size of
1301     /// this as appropriate.
1302     bools: u8,
1303 }
1304
1305 // A simple macro for defining bitfield accessors/mutators.
1306 macro_rules! define_bool {
1307     ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1308         fn $is_fn_name(&self) -> bool {
1309             self.bools & (0b1 << $bit) > 0
1310         }
1311
1312         fn $set_fn_name(&mut self, yes: bool) {
1313             if yes {
1314                 self.bools |= 1 << $bit;
1315             } else {
1316                 self.bools &= !(1 << $bit);
1317             }
1318         }
1319     }
1320 }
1321
1322 impl HirInfo {
1323     fn new() -> HirInfo {
1324         HirInfo {
1325             bools: 0,
1326         }
1327     }
1328
1329     define_bool!(0, is_always_utf8, set_always_utf8);
1330     define_bool!(1, is_all_assertions, set_all_assertions);
1331     define_bool!(2, is_anchored_start, set_anchored_start);
1332     define_bool!(3, is_anchored_end, set_anchored_end);
1333     define_bool!(4, is_any_anchored_start, set_any_anchored_start);
1334     define_bool!(5, is_any_anchored_end, set_any_anchored_end);
1335     define_bool!(6, is_match_empty, set_match_empty);
1336 }
1337
1338 #[cfg(test)]
1339 mod tests {
1340     use super::*;
1341
1342     fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1343         let ranges: Vec<ClassUnicodeRange> = ranges
1344             .iter()
1345             .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1346             .collect();
1347         ClassUnicode::new(ranges)
1348     }
1349
1350     fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1351         let ranges: Vec<ClassBytesRange> = ranges
1352             .iter()
1353             .map(|&(s, e)| ClassBytesRange::new(s, e))
1354             .collect();
1355         ClassBytes::new(ranges)
1356     }
1357
1358     fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1359         cls.iter().map(|x| (x.start(), x.end())).collect()
1360     }
1361
1362     fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1363         let mut cls_ = cls.clone();
1364         cls_.case_fold_simple();
1365         cls_
1366     }
1367
1368     fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1369         let mut cls_ = cls1.clone();
1370         cls_.union(cls2);
1371         cls_
1372     }
1373
1374     fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1375         let mut cls_ = cls1.clone();
1376         cls_.intersect(cls2);
1377         cls_
1378     }
1379
1380     fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1381         let mut cls_ = cls1.clone();
1382         cls_.difference(cls2);
1383         cls_
1384     }
1385
1386     fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1387         let mut cls_ = cls1.clone();
1388         cls_.symmetric_difference(cls2);
1389         cls_
1390     }
1391
1392     fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1393         let mut cls_ = cls.clone();
1394         cls_.negate();
1395         cls_
1396     }
1397
1398     fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1399         cls.iter().map(|x| (x.start(), x.end())).collect()
1400     }
1401
1402     fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1403         let mut cls_ = cls.clone();
1404         cls_.case_fold_simple();
1405         cls_
1406     }
1407
1408     fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1409         let mut cls_ = cls1.clone();
1410         cls_.union(cls2);
1411         cls_
1412     }
1413
1414     fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1415         let mut cls_ = cls1.clone();
1416         cls_.intersect(cls2);
1417         cls_
1418     }
1419
1420     fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1421         let mut cls_ = cls1.clone();
1422         cls_.difference(cls2);
1423         cls_
1424     }
1425
1426     fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1427         let mut cls_ = cls1.clone();
1428         cls_.symmetric_difference(cls2);
1429         cls_
1430     }
1431
1432     fn bnegate(cls: &ClassBytes) -> ClassBytes {
1433         let mut cls_ = cls.clone();
1434         cls_.negate();
1435         cls_
1436     }
1437
1438     #[test]
1439     fn class_range_canonical_unicode() {
1440         let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1441         assert_eq!('\0', range.start());
1442         assert_eq!('\u{00FF}', range.end());
1443     }
1444
1445     #[test]
1446     fn class_range_canonical_bytes() {
1447         let range = ClassBytesRange::new(b'\xFF', b'\0');
1448         assert_eq!(b'\0', range.start());
1449         assert_eq!(b'\xFF', range.end());
1450     }
1451
1452     #[test]
1453     fn class_canonicalize_unicode() {
1454         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1455         let expected = vec![('a', 'c'), ('x', 'z')];
1456         assert_eq!(expected, uranges(&cls));
1457
1458         let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1459         let expected = vec![('a', 'c'), ('x', 'z')];
1460         assert_eq!(expected, uranges(&cls));
1461
1462         let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1463         let expected = vec![('w', 'z')];
1464         assert_eq!(expected, uranges(&cls));
1465
1466         let cls = uclass(&[
1467             ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1468             ('m', 'p'), ('l', 's'),
1469         ]);
1470         let expected = vec![('a', 'j'), ('l', 's')];
1471         assert_eq!(expected, uranges(&cls));
1472
1473         let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1474         let expected = vec![('u', 'z')];
1475         assert_eq!(expected, uranges(&cls));
1476
1477         let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1478         let expected = vec![('\x00', '\u{10FFFF}')];
1479         assert_eq!(expected, uranges(&cls));
1480
1481
1482         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1483         let expected = vec![('a', 'b')];
1484         assert_eq!(expected, uranges(&cls));
1485     }
1486
1487     #[test]
1488     fn class_canonicalize_bytes() {
1489         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1490         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1491         assert_eq!(expected, branges(&cls));
1492
1493         let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1494         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1495         assert_eq!(expected, branges(&cls));
1496
1497         let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1498         let expected = vec![(b'w', b'z')];
1499         assert_eq!(expected, branges(&cls));
1500
1501         let cls = bclass(&[
1502             (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'),
1503             (b'm', b'p'), (b'l', b's'),
1504         ]);
1505         let expected = vec![(b'a', b'j'), (b'l', b's')];
1506         assert_eq!(expected, branges(&cls));
1507
1508         let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1509         let expected = vec![(b'u', b'z')];
1510         assert_eq!(expected, branges(&cls));
1511
1512         let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1513         let expected = vec![(b'\x00', b'\xFF')];
1514         assert_eq!(expected, branges(&cls));
1515
1516         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1517         let expected = vec![(b'a', b'b')];
1518         assert_eq!(expected, branges(&cls));
1519     }
1520
1521     #[test]
1522     fn class_case_fold_unicode() {
1523         let cls = uclass(&[
1524             ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
1525             ('M', 'P'), ('L', 'S'), ('c', 'f'),
1526         ]);
1527         let expected = uclass(&[
1528             ('A', 'J'), ('L', 'S'),
1529             ('a', 'j'), ('l', 's'),
1530             ('\u{17F}', '\u{17F}'),
1531         ]);
1532         assert_eq!(expected, ucasefold(&cls));
1533
1534         let cls = uclass(&[('A', 'Z')]);
1535         let expected = uclass(&[
1536             ('A', 'Z'), ('a', 'z'),
1537             ('\u{17F}', '\u{17F}'),
1538             ('\u{212A}', '\u{212A}'),
1539         ]);
1540         assert_eq!(expected, ucasefold(&cls));
1541
1542         let cls = uclass(&[('a', 'z')]);
1543         let expected = uclass(&[
1544             ('A', 'Z'), ('a', 'z'),
1545             ('\u{17F}', '\u{17F}'),
1546             ('\u{212A}', '\u{212A}'),
1547         ]);
1548         assert_eq!(expected, ucasefold(&cls));
1549
1550         let cls = uclass(&[('A', 'A'), ('_', '_')]);
1551         let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1552         assert_eq!(expected, ucasefold(&cls));
1553
1554         let cls = uclass(&[('A', 'A'), ('=', '=')]);
1555         let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1556         assert_eq!(expected, ucasefold(&cls));
1557
1558         let cls = uclass(&[('\x00', '\x10')]);
1559         assert_eq!(cls, ucasefold(&cls));
1560
1561         let cls = uclass(&[('k', 'k')]);
1562         let expected = uclass(&[
1563             ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
1564         ]);
1565         assert_eq!(expected, ucasefold(&cls));
1566
1567         let cls = uclass(&[('@', '@')]);
1568         assert_eq!(cls, ucasefold(&cls));
1569     }
1570
1571     #[test]
1572     fn class_case_fold_bytes() {
1573         let cls = bclass(&[
1574             (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
1575             (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
1576         ]);
1577         let expected = bclass(&[
1578             (b'A', b'J'), (b'L', b'S'),
1579             (b'a', b'j'), (b'l', b's'),
1580         ]);
1581         assert_eq!(expected, bcasefold(&cls));
1582
1583         let cls = bclass(&[(b'A', b'Z')]);
1584         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1585         assert_eq!(expected, bcasefold(&cls));
1586
1587         let cls = bclass(&[(b'a', b'z')]);
1588         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1589         assert_eq!(expected, bcasefold(&cls));
1590
1591         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1592         let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1593         assert_eq!(expected, bcasefold(&cls));
1594
1595         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1596         let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1597         assert_eq!(expected, bcasefold(&cls));
1598
1599         let cls = bclass(&[(b'\x00', b'\x10')]);
1600         assert_eq!(cls, bcasefold(&cls));
1601
1602         let cls = bclass(&[(b'k', b'k')]);
1603         let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1604         assert_eq!(expected, bcasefold(&cls));
1605
1606         let cls = bclass(&[(b'@', b'@')]);
1607         assert_eq!(cls, bcasefold(&cls));
1608     }
1609
1610     #[test]
1611     fn class_negate_unicode() {
1612         let cls = uclass(&[('a', 'a')]);
1613         let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1614         assert_eq!(expected, unegate(&cls));
1615
1616         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1617         let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1618         assert_eq!(expected, unegate(&cls));
1619
1620         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1621         let expected = uclass(&[
1622             ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'),
1623         ]);
1624         assert_eq!(expected, unegate(&cls));
1625
1626         let cls = uclass(&[('\x00', 'a')]);
1627         let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1628         assert_eq!(expected, unegate(&cls));
1629
1630         let cls = uclass(&[('a', '\u{10FFFF}')]);
1631         let expected = uclass(&[('\x00', '\x60')]);
1632         assert_eq!(expected, unegate(&cls));
1633
1634         let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1635         let expected = uclass(&[]);
1636         assert_eq!(expected, unegate(&cls));
1637
1638         let cls = uclass(&[]);
1639         let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1640         assert_eq!(expected, unegate(&cls));
1641
1642         let cls = uclass(&[
1643             ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'),
1644         ]);
1645         let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1646         assert_eq!(expected, unegate(&cls));
1647
1648         let cls = uclass(&[('\x00', '\u{D7FF}')]);
1649         let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1650         assert_eq!(expected, unegate(&cls));
1651
1652         let cls = uclass(&[('\x00', '\u{D7FE}')]);
1653         let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1654         assert_eq!(expected, unegate(&cls));
1655
1656         let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1657         let expected = uclass(&[('\x00', '\u{D7FF}')]);
1658         assert_eq!(expected, unegate(&cls));
1659
1660         let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1661         let expected = uclass(&[('\x00', '\u{E000}')]);
1662         assert_eq!(expected, unegate(&cls));
1663     }
1664
1665     #[test]
1666     fn class_negate_bytes() {
1667         let cls = bclass(&[(b'a', b'a')]);
1668         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1669         assert_eq!(expected, bnegate(&cls));
1670
1671         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1672         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1673         assert_eq!(expected, bnegate(&cls));
1674
1675         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1676         let expected = bclass(&[
1677             (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'),
1678         ]);
1679         assert_eq!(expected, bnegate(&cls));
1680
1681         let cls = bclass(&[(b'\x00', b'a')]);
1682         let expected = bclass(&[(b'\x62', b'\xFF')]);
1683         assert_eq!(expected, bnegate(&cls));
1684
1685         let cls = bclass(&[(b'a', b'\xFF')]);
1686         let expected = bclass(&[(b'\x00', b'\x60')]);
1687         assert_eq!(expected, bnegate(&cls));
1688
1689         let cls = bclass(&[(b'\x00', b'\xFF')]);
1690         let expected = bclass(&[]);
1691         assert_eq!(expected, bnegate(&cls));
1692
1693         let cls = bclass(&[]);
1694         let expected = bclass(&[(b'\x00', b'\xFF')]);
1695         assert_eq!(expected, bnegate(&cls));
1696
1697         let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1698         let expected = bclass(&[(b'\xFE', b'\xFE')]);
1699         assert_eq!(expected, bnegate(&cls));
1700     }
1701
1702     #[test]
1703     fn class_union_unicode() {
1704         let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1705         let cls2 = uclass(&[('a', 'z')]);
1706         let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1707         assert_eq!(expected, uunion(&cls1, &cls2));
1708     }
1709
1710     #[test]
1711     fn class_union_bytes() {
1712         let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1713         let cls2 = bclass(&[(b'a', b'z')]);
1714         let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1715         assert_eq!(expected, bunion(&cls1, &cls2));
1716     }
1717
1718     #[test]
1719     fn class_intersect_unicode() {
1720         let cls1 = uclass(&[]);
1721         let cls2 = uclass(&[('a', 'a')]);
1722         let expected = uclass(&[]);
1723         assert_eq!(expected, uintersect(&cls1, &cls2));
1724
1725         let cls1 = uclass(&[('a', 'a')]);
1726         let cls2 = uclass(&[('a', 'a')]);
1727         let expected = uclass(&[('a', 'a')]);
1728         assert_eq!(expected, uintersect(&cls1, &cls2));
1729
1730         let cls1 = uclass(&[('a', 'a')]);
1731         let cls2 = uclass(&[('b', 'b')]);
1732         let expected = uclass(&[]);
1733         assert_eq!(expected, uintersect(&cls1, &cls2));
1734
1735         let cls1 = uclass(&[('a', 'a')]);
1736         let cls2 = uclass(&[('a', 'c')]);
1737         let expected = uclass(&[('a', 'a')]);
1738         assert_eq!(expected, uintersect(&cls1, &cls2));
1739
1740         let cls1 = uclass(&[('a', 'b')]);
1741         let cls2 = uclass(&[('a', 'c')]);
1742         let expected = uclass(&[('a', 'b')]);
1743         assert_eq!(expected, uintersect(&cls1, &cls2));
1744
1745         let cls1 = uclass(&[('a', 'b')]);
1746         let cls2 = uclass(&[('b', 'c')]);
1747         let expected = uclass(&[('b', 'b')]);
1748         assert_eq!(expected, uintersect(&cls1, &cls2));
1749
1750         let cls1 = uclass(&[('a', 'b')]);
1751         let cls2 = uclass(&[('c', 'd')]);
1752         let expected = uclass(&[]);
1753         assert_eq!(expected, uintersect(&cls1, &cls2));
1754
1755         let cls1 = uclass(&[('b', 'c')]);
1756         let cls2 = uclass(&[('a', 'd')]);
1757         let expected = uclass(&[('b', 'c')]);
1758         assert_eq!(expected, uintersect(&cls1, &cls2));
1759
1760         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1761         let cls2 = uclass(&[('a', 'h')]);
1762         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1763         assert_eq!(expected, uintersect(&cls1, &cls2));
1764
1765         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1766         let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1767         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1768         assert_eq!(expected, uintersect(&cls1, &cls2));
1769
1770         let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
1771         let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
1772         let expected = uclass(&[]);
1773         assert_eq!(expected, uintersect(&cls1, &cls2));
1774
1775         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1776         let cls2 = uclass(&[('h', 'h')]);
1777         let expected = uclass(&[('h', 'h')]);
1778         assert_eq!(expected, uintersect(&cls1, &cls2));
1779
1780         let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
1781         let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
1782         let expected = uclass(&[]);
1783         assert_eq!(expected, uintersect(&cls1, &cls2));
1784
1785         let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
1786         let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
1787         let expected = uclass(&[('b', 'f')]);
1788         assert_eq!(expected, uintersect(&cls1, &cls2));
1789     }
1790
1791     #[test]
1792     fn class_intersect_bytes() {
1793         let cls1 = bclass(&[]);
1794         let cls2 = bclass(&[(b'a', b'a')]);
1795         let expected = bclass(&[]);
1796         assert_eq!(expected, bintersect(&cls1, &cls2));
1797
1798         let cls1 = bclass(&[(b'a', b'a')]);
1799         let cls2 = bclass(&[(b'a', b'a')]);
1800         let expected = bclass(&[(b'a', b'a')]);
1801         assert_eq!(expected, bintersect(&cls1, &cls2));
1802
1803         let cls1 = bclass(&[(b'a', b'a')]);
1804         let cls2 = bclass(&[(b'b', b'b')]);
1805         let expected = bclass(&[]);
1806         assert_eq!(expected, bintersect(&cls1, &cls2));
1807
1808         let cls1 = bclass(&[(b'a', b'a')]);
1809         let cls2 = bclass(&[(b'a', b'c')]);
1810         let expected = bclass(&[(b'a', b'a')]);
1811         assert_eq!(expected, bintersect(&cls1, &cls2));
1812
1813         let cls1 = bclass(&[(b'a', b'b')]);
1814         let cls2 = bclass(&[(b'a', b'c')]);
1815         let expected = bclass(&[(b'a', b'b')]);
1816         assert_eq!(expected, bintersect(&cls1, &cls2));
1817
1818         let cls1 = bclass(&[(b'a', b'b')]);
1819         let cls2 = bclass(&[(b'b', b'c')]);
1820         let expected = bclass(&[(b'b', b'b')]);
1821         assert_eq!(expected, bintersect(&cls1, &cls2));
1822
1823         let cls1 = bclass(&[(b'a', b'b')]);
1824         let cls2 = bclass(&[(b'c', b'd')]);
1825         let expected = bclass(&[]);
1826         assert_eq!(expected, bintersect(&cls1, &cls2));
1827
1828         let cls1 = bclass(&[(b'b', b'c')]);
1829         let cls2 = bclass(&[(b'a', b'd')]);
1830         let expected = bclass(&[(b'b', b'c')]);
1831         assert_eq!(expected, bintersect(&cls1, &cls2));
1832
1833         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1834         let cls2 = bclass(&[(b'a', b'h')]);
1835         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1836         assert_eq!(expected, bintersect(&cls1, &cls2));
1837
1838         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1839         let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1840         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1841         assert_eq!(expected, bintersect(&cls1, &cls2));
1842
1843         let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
1844         let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
1845         let expected = bclass(&[]);
1846         assert_eq!(expected, bintersect(&cls1, &cls2));
1847
1848         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1849         let cls2 = bclass(&[(b'h', b'h')]);
1850         let expected = bclass(&[(b'h', b'h')]);
1851         assert_eq!(expected, bintersect(&cls1, &cls2));
1852
1853         let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
1854         let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
1855         let expected = bclass(&[]);
1856         assert_eq!(expected, bintersect(&cls1, &cls2));
1857
1858         let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
1859         let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
1860         let expected = bclass(&[(b'b', b'f')]);
1861         assert_eq!(expected, bintersect(&cls1, &cls2));
1862     }
1863
1864     #[test]
1865     fn class_difference_unicode() {
1866         let cls1 = uclass(&[('a', 'a')]);
1867         let cls2 = uclass(&[('a', 'a')]);
1868         let expected = uclass(&[]);
1869         assert_eq!(expected, udifference(&cls1, &cls2));
1870
1871         let cls1 = uclass(&[('a', 'a')]);
1872         let cls2 = uclass(&[]);
1873         let expected = uclass(&[('a', 'a')]);
1874         assert_eq!(expected, udifference(&cls1, &cls2));
1875
1876         let cls1 = uclass(&[]);
1877         let cls2 = uclass(&[('a', 'a')]);
1878         let expected = uclass(&[]);
1879         assert_eq!(expected, udifference(&cls1, &cls2));
1880
1881         let cls1 = uclass(&[('a', 'z')]);
1882         let cls2 = uclass(&[('a', 'a')]);
1883         let expected = uclass(&[('b', 'z')]);
1884         assert_eq!(expected, udifference(&cls1, &cls2));
1885
1886         let cls1 = uclass(&[('a', 'z')]);
1887         let cls2 = uclass(&[('z', 'z')]);
1888         let expected = uclass(&[('a', 'y')]);
1889         assert_eq!(expected, udifference(&cls1, &cls2));
1890
1891         let cls1 = uclass(&[('a', 'z')]);
1892         let cls2 = uclass(&[('m', 'm')]);
1893         let expected = uclass(&[('a', 'l'), ('n', 'z')]);
1894         assert_eq!(expected, udifference(&cls1, &cls2));
1895
1896         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1897         let cls2 = uclass(&[('a', 'z')]);
1898         let expected = uclass(&[]);
1899         assert_eq!(expected, udifference(&cls1, &cls2));
1900
1901         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1902         let cls2 = uclass(&[('d', 'v')]);
1903         let expected = uclass(&[('a', 'c')]);
1904         assert_eq!(expected, udifference(&cls1, &cls2));
1905
1906         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1907         let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
1908         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
1909         assert_eq!(expected, udifference(&cls1, &cls2));
1910
1911         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1912         let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
1913         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
1914         assert_eq!(expected, udifference(&cls1, &cls2));
1915
1916         let cls1 = uclass(&[('x', 'z')]);
1917         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
1918         let expected = uclass(&[('x', 'z')]);
1919         assert_eq!(expected, udifference(&cls1, &cls2));
1920
1921         let cls1 = uclass(&[('a', 'z')]);
1922         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
1923         let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
1924         assert_eq!(expected, udifference(&cls1, &cls2));
1925     }
1926
1927     #[test]
1928     fn class_difference_bytes() {
1929         let cls1 = bclass(&[(b'a', b'a')]);
1930         let cls2 = bclass(&[(b'a', b'a')]);
1931         let expected = bclass(&[]);
1932         assert_eq!(expected, bdifference(&cls1, &cls2));
1933
1934         let cls1 = bclass(&[(b'a', b'a')]);
1935         let cls2 = bclass(&[]);
1936         let expected = bclass(&[(b'a', b'a')]);
1937         assert_eq!(expected, bdifference(&cls1, &cls2));
1938
1939         let cls1 = bclass(&[]);
1940         let cls2 = bclass(&[(b'a', b'a')]);
1941         let expected = bclass(&[]);
1942         assert_eq!(expected, bdifference(&cls1, &cls2));
1943
1944         let cls1 = bclass(&[(b'a', b'z')]);
1945         let cls2 = bclass(&[(b'a', b'a')]);
1946         let expected = bclass(&[(b'b', b'z')]);
1947         assert_eq!(expected, bdifference(&cls1, &cls2));
1948
1949         let cls1 = bclass(&[(b'a', b'z')]);
1950         let cls2 = bclass(&[(b'z', b'z')]);
1951         let expected = bclass(&[(b'a', b'y')]);
1952         assert_eq!(expected, bdifference(&cls1, &cls2));
1953
1954         let cls1 = bclass(&[(b'a', b'z')]);
1955         let cls2 = bclass(&[(b'm', b'm')]);
1956         let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
1957         assert_eq!(expected, bdifference(&cls1, &cls2));
1958
1959         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1960         let cls2 = bclass(&[(b'a', b'z')]);
1961         let expected = bclass(&[]);
1962         assert_eq!(expected, bdifference(&cls1, &cls2));
1963
1964         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1965         let cls2 = bclass(&[(b'd', b'v')]);
1966         let expected = bclass(&[(b'a', b'c')]);
1967         assert_eq!(expected, bdifference(&cls1, &cls2));
1968
1969         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1970         let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
1971         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
1972         assert_eq!(expected, bdifference(&cls1, &cls2));
1973
1974         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1975         let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
1976         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
1977         assert_eq!(expected, bdifference(&cls1, &cls2));
1978
1979         let cls1 = bclass(&[(b'x', b'z')]);
1980         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
1981         let expected = bclass(&[(b'x', b'z')]);
1982         assert_eq!(expected, bdifference(&cls1, &cls2));
1983
1984         let cls1 = bclass(&[(b'a', b'z')]);
1985         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
1986         let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
1987         assert_eq!(expected, bdifference(&cls1, &cls2));
1988     }
1989
1990     #[test]
1991     fn class_symmetric_difference_unicode() {
1992         let cls1 = uclass(&[('a', 'm')]);
1993         let cls2 = uclass(&[('g', 't')]);
1994         let expected = uclass(&[('a', 'f'), ('n', 't')]);
1995         assert_eq!(expected, usymdifference(&cls1, &cls2));
1996     }
1997
1998     #[test]
1999     fn class_symmetric_difference_bytes() {
2000         let cls1 = bclass(&[(b'a', b'm')]);
2001         let cls2 = bclass(&[(b'g', b't')]);
2002         let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2003         assert_eq!(expected, bsymdifference(&cls1, &cls2));
2004     }
2005
2006     #[test]
2007     #[should_panic]
2008     fn hir_byte_literal_non_ascii() {
2009         Hir::literal(Literal::Byte(b'a'));
2010     }
2011
2012     // We use a thread with an explicit stack size to test that our destructor
2013     // for Hir can handle arbitrarily sized expressions in constant stack
2014     // space. In case we run on a platform without threads (WASM?), we limit
2015     // this test to Windows/Unix.
2016     #[test]
2017     #[cfg(any(unix, windows))]
2018     fn no_stack_overflow_on_drop() {
2019         use std::thread;
2020
2021         let run = || {
2022             let mut expr = Hir::empty();
2023             for _ in 0..100 {
2024                 expr = Hir::group(Group {
2025                     kind: GroupKind::NonCapturing,
2026                     hir: Box::new(expr),
2027                 });
2028                 expr = Hir::repetition(Repetition {
2029                     kind: RepetitionKind::ZeroOrOne,
2030                     greedy: true,
2031                     hir: Box::new(expr),
2032                 });
2033
2034                 expr = Hir {
2035                     kind: HirKind::Concat(vec![expr]),
2036                     info: HirInfo::new(),
2037                 };
2038                 expr = Hir {
2039                     kind: HirKind::Alternation(vec![expr]),
2040                     info: HirInfo::new(),
2041                 };
2042             }
2043             assert!(!expr.kind.is_empty());
2044         };
2045
2046         // We run our test on a thread with a small stack size so we can
2047         // force the issue more easily.
2048         thread::Builder::new()
2049             .stack_size(1<<10)
2050             .spawn(run)
2051             .unwrap()
2052             .join()
2053             .unwrap();
2054     }
2055 }