1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11 use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14 };
15
16 use crate::Error;
17
18 type Result = result::Result<Patch, Error>;
19 type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
21 #[derive(Debug)]
22 struct Patch {
23 hole: Hole,
24 entry: InstPtr,
25 }
26
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
30 // `Debug`.
31 #[allow(missing_debug_implementations)]
32 pub struct Compiler {
33 insts: Vec<MaybeInst>,
34 compiled: Program,
35 capture_name_idx: HashMap<String, usize>,
36 num_exprs: usize,
37 size_limit: usize,
38 suffix_cache: SuffixCache,
39 utf8_seqs: Option<Utf8Sequences>,
40 byte_classes: ByteClassSet,
41 extra_inst_bytes: usize,
42 }
43
44 impl Compiler {
45 /// Create a new regular expression compiler.
46 ///
47 /// Various options can be set before calling `compile` on an expression.
48 pub fn new() -> Self {
49 Compiler {
50 insts: vec![],
51 compiled: Program::new(),
52 capture_name_idx: HashMap::new(),
53 num_exprs: 0,
54 size_limit: 10 * (1 << 20),
55 suffix_cache: SuffixCache::new(1000),
56 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
57 byte_classes: ByteClassSet::new(),
58 extra_inst_bytes: 0,
59 }
60 }
61
62 /// The size of the resulting program is limited by size_limit. If
63 /// the program approximately exceeds the given size (in bytes), then
64 /// compilation will stop and return an error.
65 pub fn size_limit(mut self, size_limit: usize) -> Self {
66 self.size_limit = size_limit;
67 self
68 }
69
70 /// If bytes is true, then the program is compiled as a byte based
71 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
72 /// false, then the automaton is Unicode scalar value based, i.e., an
73 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
74 ///
75 /// The specific invariant is that when returning a byte based machine,
76 /// neither the `Char` nor `Ranges` instructions are produced.
77 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
78 /// instruction is never produced.
79 ///
80 /// Note that `dfa(true)` implies `bytes(true)`.
81 pub fn bytes(mut self, yes: bool) -> Self {
82 self.compiled.is_bytes = yes;
83 self
84 }
85
86 /// When disabled, the compiled program may match arbitrary bytes.
87 ///
88 /// When enabled (the default), all compiled programs exclusively match
89 /// valid UTF-8 bytes.
90 pub fn only_utf8(mut self, yes: bool) -> Self {
91 self.compiled.only_utf8 = yes;
92 self
93 }
94
95 /// When set, the machine returned is suitable for use in the DFA matching
96 /// engine.
97 ///
98 /// In particular, this ensures that if the regex is not anchored in the
99 /// beginning, then a preceding `.*?` is included in the program. (The NFA
100 /// based engines handle the preceding `.*?` explicitly, which is difficult
101 /// or impossible in the DFA engine.)
102 pub fn dfa(mut self, yes: bool) -> Self {
103 self.compiled.is_dfa = yes;
104 self
105 }
106
107 /// When set, the machine returned is suitable for matching text in
108 /// reverse. In particular, all concatenations are flipped.
109 pub fn reverse(mut self, yes: bool) -> Self {
110 self.compiled.is_reverse = yes;
111 self
112 }
113
114 /// Compile a regular expression given its AST.
115 ///
116 /// The compiler is guaranteed to succeed unless the program exceeds the
117 /// specified size limit. If the size limit is exceeded, then compilation
118 /// stops and returns an error.
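///
/// As a rough sketch (not code taken from this crate's callers), the
/// builder methods above fit together like this, assuming the pattern has
/// already been parsed into an `Hir` with `regex_syntax`:
///
/// ```ignore
/// use regex_syntax::ParserBuilder;
///
/// // Parse the pattern into an HIR, then hand it to the compiler. The
/// // builder methods used here are the ones defined on `Compiler` above.
/// let hir = ParserBuilder::new().build().parse(r"a|b").unwrap();
/// let prog = Compiler::new()
///     .size_limit(10 * (1 << 20)) // the default limit set in `new`
///     .bytes(false)               // Unicode scalar value based machine
///     .compile(&[hir])
///     .unwrap();
/// ```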
119 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
120 debug_assert!(!exprs.is_empty());
121 self.num_exprs = exprs.len();
122 if exprs.len() == 1 {
123 self.compile_one(&exprs[0])
124 } else {
125 self.compile_many(exprs)
126 }
127 }
128
129 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
130 // If we're compiling a forward DFA and we aren't anchored, then
131 // add a `.*?` before the first capture group.
132 // Other matching engines handle this by baking the logic into the
133 // matching engine itself.
134 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
135 self.compiled.is_anchored_start = expr.is_anchored_start();
136 self.compiled.is_anchored_end = expr.is_anchored_end();
137 if self.compiled.needs_dotstar() {
138 dotstar_patch = self.c_dotstar()?;
139 self.compiled.start = dotstar_patch.entry;
140 }
141 self.compiled.captures = vec![None];
142 let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
143 if self.compiled.needs_dotstar() {
144 self.fill(dotstar_patch.hole, patch.entry);
145 } else {
146 self.compiled.start = patch.entry;
147 }
148 self.fill_to_next(patch.hole);
149 self.compiled.matches = vec![self.insts.len()];
150 self.push_compiled(Inst::Match(0));
151 self.compile_finish()
152 }
153
154 fn compile_many(
155 mut self,
156 exprs: &[Hir],
157 ) -> result::Result<Program, Error> {
158 debug_assert!(exprs.len() > 1);
159
160 self.compiled.is_anchored_start =
161 exprs.iter().all(|e| e.is_anchored_start());
162 self.compiled.is_anchored_end =
163 exprs.iter().all(|e| e.is_anchored_end());
164 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
165 if self.compiled.needs_dotstar() {
166 dotstar_patch = self.c_dotstar()?;
167 self.compiled.start = dotstar_patch.entry;
168 } else {
169 self.compiled.start = 0; // first instruction is always split
170 }
171 self.fill_to_next(dotstar_patch.hole);
172
173 let mut prev_hole = Hole::None;
174 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
175 self.fill_to_next(prev_hole);
176 let split = self.push_split_hole();
177 let Patch { hole, entry } =
178 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
179 self.fill_to_next(hole);
180 self.compiled.matches.push(self.insts.len());
181 self.push_compiled(Inst::Match(i));
182 prev_hole = self.fill_split(split, Some(entry), None);
183 }
184 let i = exprs.len() - 1;
185 let Patch { hole, entry } =
186 self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
187 self.fill(prev_hole, entry);
188 self.fill_to_next(hole);
189 self.compiled.matches.push(self.insts.len());
190 self.push_compiled(Inst::Match(i));
191 self.compile_finish()
192 }
193
194 fn compile_finish(mut self) -> result::Result<Program, Error> {
195 self.compiled.insts =
196 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
197 self.compiled.byte_classes = self.byte_classes.byte_classes();
198 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
199 Ok(self.compiled)
200 }
201
202 /// Compile expr into self.insts, returning a patch on success,
203 /// or an error if we run out of memory.
204 ///
205 /// All of the c_* methods of the compiler share the contract outlined
206 /// here.
207 ///
208 /// The main thing that a c_* method does is mutate `self.insts`
209 /// to add a list of mostly compiled instructions required to execute
210 /// the given expression. `self.insts` contains MaybeInsts rather than
211 /// Insts because there is some backpatching required.
212 ///
213 /// The `Patch` value returned by each c_* method provides metadata
214 /// about the compiled instructions emitted to `self.insts`. The
215 /// `entry` member of the patch refers to the first instruction
216 /// (the entry point), while the `hole` member contains zero or
217 /// more offsets to partial instructions that need to be backpatched.
218 /// The c_* routine can't know where its list of instructions is going to
219 /// jump to after execution, so it is up to the caller to patch
220 /// these jumps to point to the right place. So compiling some
221 /// expression, e, we end up with a situation that looks like:
222 ///
223 /// ```text
224 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
225 ///                     ^            ^            ^
226 ///                     |             \          /
227 ///                     entry           \       /
228 ///                                      hole
229 /// ```
230 ///
231 /// To compile two expressions, e1 and e2, concatenated together we
232 /// would do:
233 ///
234 /// ```ignore
235 /// let patch1 = self.c(e1);
236 /// let patch2 = self.c(e2);
237 /// ```
238 ///
239 /// which leaves us with a situation that looks like
240 ///
241 /// ```text
242 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
243 ///                     ^        ^            ^        ^
244 ///                     |        |            |        |
245 ///                     entry1   hole1        entry2   hole2
247 ///
248 /// Then to merge the two patches together into one we would backpatch
249 /// hole1 with entry2 and return a new patch that enters at entry1
250 /// and has hole2 for a hole. In fact, if you look at the c_concat
251 /// method you will see that it does exactly this, though it handles
252 /// a list of expressions rather than just the two that we use for
253 /// an example.
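///
/// As a sketch, the merge step just described is effectively what
/// `c_concat` does for each adjacent pair of patches:
///
/// ```ignore
/// // Backpatch e1's exits to jump to e2's entry, then expose e1's entry
/// // and e2's remaining holes as the combined patch.
/// self.fill(patch1.hole, patch2.entry);
/// Patch { hole: patch2.hole, entry: patch1.entry }
/// ```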
254 ///
255 /// Ok(None) is returned when an expression is compiled to no
256 /// instruction, and so no patch.entry value makes sense.
257 fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
258 use crate::prog;
259 use regex_syntax::hir::HirKind::*;
260
261 self.check_size()?;
262 match *expr.kind() {
263 Empty => Ok(None),
264 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
265 Literal(hir::Literal::Byte(b)) => {
266 assert!(self.compiled.uses_bytes());
267 self.c_byte(b)
268 }
269 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
270 Class(hir::Class::Bytes(ref cls)) => {
271 if self.compiled.uses_bytes() {
272 self.c_class_bytes(cls.ranges())
273 } else {
274 assert!(cls.is_all_ascii());
275 let mut char_ranges = vec![];
276 for r in cls.iter() {
277 let (s, e) = (r.start() as char, r.end() as char);
278 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
279 }
280 self.c_class(&char_ranges)
281 }
282 }
283 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
284 self.byte_classes.set_range(b'\n', b'\n');
285 self.c_empty_look(prog::EmptyLook::EndLine)
286 }
287 Anchor(hir::Anchor::StartLine) => {
288 self.byte_classes.set_range(b'\n', b'\n');
289 self.c_empty_look(prog::EmptyLook::StartLine)
290 }
291 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
292 self.byte_classes.set_range(b'\n', b'\n');
293 self.c_empty_look(prog::EmptyLook::StartLine)
294 }
295 Anchor(hir::Anchor::EndLine) => {
296 self.byte_classes.set_range(b'\n', b'\n');
297 self.c_empty_look(prog::EmptyLook::EndLine)
298 }
299 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
300 self.c_empty_look(prog::EmptyLook::EndText)
301 }
302 Anchor(hir::Anchor::StartText) => {
303 self.c_empty_look(prog::EmptyLook::StartText)
304 }
305 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
306 self.c_empty_look(prog::EmptyLook::StartText)
307 }
308 Anchor(hir::Anchor::EndText) => {
309 self.c_empty_look(prog::EmptyLook::EndText)
310 }
311 WordBoundary(hir::WordBoundary::Unicode) => {
312 if !cfg!(feature = "unicode-perl") {
313 return Err(Error::Syntax(
314 "Unicode word boundaries are unavailable when \
315 the unicode-perl feature is disabled"
316 .to_string(),
317 ));
318 }
319 self.compiled.has_unicode_word_boundary = true;
320 self.byte_classes.set_word_boundary();
321 // We also make sure that all ASCII bytes are in a different
322 // class from non-ASCII bytes. Otherwise, it's possible for
323 // ASCII bytes to get lumped into the same class as non-ASCII
324 // bytes. This in turn may cause the lazy DFA to falsely start
325 // when it sees an ASCII byte that maps to a byte class with
326 // non-ASCII bytes. This ensures that never happens.
327 self.byte_classes.set_range(0, 0x7F);
328 self.c_empty_look(prog::EmptyLook::WordBoundary)
329 }
330 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
331 if !cfg!(feature = "unicode-perl") {
332 return Err(Error::Syntax(
333 "Unicode word boundaries are unavailable when \
334 the unicode-perl feature is disabled"
335 .to_string(),
336 ));
337 }
338 self.compiled.has_unicode_word_boundary = true;
339 self.byte_classes.set_word_boundary();
340 // See comments above for why we set the ASCII range here.
341 self.byte_classes.set_range(0, 0x7F);
342 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
343 }
344 WordBoundary(hir::WordBoundary::Ascii) => {
345 self.byte_classes.set_word_boundary();
346 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
347 }
348 WordBoundary(hir::WordBoundary::AsciiNegate) => {
349 self.byte_classes.set_word_boundary();
350 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
351 }
352 Group(ref g) => match g.kind {
353 hir::GroupKind::NonCapturing => self.c(&g.hir),
354 hir::GroupKind::CaptureIndex(index) => {
355 if index as usize >= self.compiled.captures.len() {
356 self.compiled.captures.push(None);
357 }
358 self.c_capture(2 * index as usize, &g.hir)
359 }
360 hir::GroupKind::CaptureName { index, ref name } => {
361 if index as usize >= self.compiled.captures.len() {
362 let n = name.to_string();
363 self.compiled.captures.push(Some(n.clone()));
364 self.capture_name_idx.insert(n, index as usize);
365 }
366 self.c_capture(2 * index as usize, &g.hir)
367 }
368 },
369 Concat(ref es) => {
370 if self.compiled.is_reverse {
371 self.c_concat(es.iter().rev())
372 } else {
373 self.c_concat(es)
374 }
375 }
376 Alternation(ref es) => self.c_alternate(&**es),
377 Repetition(ref rep) => self.c_repeat(rep),
378 }
379 }
380
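// Each capture group with index `i` owns a pair of `Save` slots: slot
// `2 * i` records where the group's match starts and slot `2 * i + 1`
// records where it ends. Group 0 (slots 0 and 1) is the implicit group
// around the entire match, which is why `compile_one` passes 0 here.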
381 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
382 if self.num_exprs > 1 || self.compiled.is_dfa {
383 // Don't ever compile Save instructions for regex sets because
384 // they are never used. They are also never used in DFA programs
385 // because DFAs can't handle captures.
386 self.c(expr)
387 } else {
388 let entry = self.insts.len();
389 let hole = self.push_hole(InstHole::Save { slot: first_slot });
390 let patch = self.c(expr)?.unwrap_or(self.next_inst());
391 self.fill(hole, patch.entry);
392 self.fill_to_next(patch.hole);
393 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
394 Ok(Some(Patch { hole: hole, entry: entry }))
395 }
396 }
397
398 fn c_dotstar(&mut self) -> Result {
399 Ok(if !self.compiled.only_utf8() {
400 self.c(&Hir::repetition(hir::Repetition {
401 kind: hir::RepetitionKind::ZeroOrMore,
402 greedy: false,
403 hir: Box::new(Hir::any(true)),
404 }))?
405 .unwrap()
406 } else {
407 self.c(&Hir::repetition(hir::Repetition {
408 kind: hir::RepetitionKind::ZeroOrMore,
409 greedy: false,
410 hir: Box::new(Hir::any(false)),
411 }))?
412 .unwrap()
413 })
414 }
415
416 fn c_char(&mut self, c: char) -> ResultOrEmpty {
417 if self.compiled.uses_bytes() {
418 if c.is_ascii() {
419 let b = c as u8;
420 let hole =
421 self.push_hole(InstHole::Bytes { start: b, end: b });
422 self.byte_classes.set_range(b, b);
423 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
424 } else {
425 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
426 }
427 } else {
428 let hole = self.push_hole(InstHole::Char { c: c });
429 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
430 }
431 }
432
433 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
434 use std::mem::size_of;
435
436 assert!(!ranges.is_empty());
437 if self.compiled.uses_bytes() {
438 Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
439 } else {
440 let ranges: Vec<(char, char)> =
441 ranges.iter().map(|r| (r.start(), r.end())).collect();
442 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
443 self.push_hole(InstHole::Char { c: ranges[0].0 })
444 } else {
445 self.extra_inst_bytes +=
446 ranges.len() * (size_of::<char>() * 2);
447 self.push_hole(InstHole::Ranges { ranges: ranges })
448 };
449 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
450 }
451 }
452
453 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
454 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
455 }
456
457 fn c_class_bytes(
458 &mut self,
459 ranges: &[hir::ClassBytesRange],
460 ) -> ResultOrEmpty {
461 debug_assert!(!ranges.is_empty());
462
463 let first_split_entry = self.insts.len();
464 let mut holes = vec![];
465 let mut prev_hole = Hole::None;
466 for r in &ranges[0..ranges.len() - 1] {
467 self.fill_to_next(prev_hole);
468 let split = self.push_split_hole();
469 let next = self.insts.len();
470 self.byte_classes.set_range(r.start(), r.end());
471 holes.push(self.push_hole(InstHole::Bytes {
472 start: r.start(),
473 end: r.end(),
474 }));
475 prev_hole = self.fill_split(split, Some(next), None);
476 }
477 let next = self.insts.len();
478 let r = &ranges[ranges.len() - 1];
479 self.byte_classes.set_range(r.start(), r.end());
480 holes.push(
481 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
482 );
483 self.fill(prev_hole, next);
484 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
485 }
486
487 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
488 let hole = self.push_hole(InstHole::EmptyLook { look: look });
489 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
490 }
491
492 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
493 where
494 I: IntoIterator<Item = &'a Hir>,
495 {
496 let mut exprs = exprs.into_iter();
497 let Patch { mut hole, entry } = loop {
498 match exprs.next() {
499 None => return Ok(None),
500 Some(e) => {
501 if let Some(p) = self.c(e)? {
502 break p;
503 }
504 }
505 }
506 };
507 for e in exprs {
508 if let Some(p) = self.c(e)? {
509 self.fill(hole, p.entry);
510 hole = p.hole;
511 }
512 }
513 Ok(Some(Patch { hole: hole, entry: entry }))
514 }
515
516 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
517 debug_assert!(
518 exprs.len() >= 2,
519 "alternates must have at least 2 exprs"
520 );
521
522 // Initial entry point is always the first split.
523 let first_split_entry = self.insts.len();
524
525 // Save up all of the holes from each alternate. They will all get
526 // patched to point to the same location.
527 let mut holes = vec![];
528
529 // true indicates that the hole is a split where we want to fill
530 // the second branch.
531 let mut prev_hole = (Hole::None, false);
532 for e in &exprs[0..exprs.len() - 1] {
533 if prev_hole.1 {
534 let next = self.insts.len();
535 self.fill_split(prev_hole.0, None, Some(next));
536 } else {
537 self.fill_to_next(prev_hole.0);
538 }
539 let split = self.push_split_hole();
540 if let Some(Patch { hole, entry }) = self.c(e)? {
541 holes.push(hole);
542 prev_hole = (self.fill_split(split, Some(entry), None), false);
543 } else {
544 let (split1, split2) = split.dup_one();
545 holes.push(split1);
546 prev_hole = (split2, true);
547 }
548 }
549 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
550 holes.push(hole);
551 if prev_hole.1 {
552 self.fill_split(prev_hole.0, None, Some(entry));
553 } else {
554 self.fill(prev_hole.0, entry);
555 }
556 } else {
557 // We ignore prev_hole.1. When it's true, it means we have two
558 // empty branches both pushing prev_hole.0 into holes, so both
559 // branches will go to the same place anyway.
560 holes.push(prev_hole.0);
561 }
562 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
563 }
564
565 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
566 use regex_syntax::hir::RepetitionKind::*;
567 match rep.kind {
568 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
569 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
570 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
571 Range(hir::RepetitionRange::Exactly(min_max)) => {
572 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
573 }
574 Range(hir::RepetitionRange::AtLeast(min)) => {
575 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
576 }
577 Range(hir::RepetitionRange::Bounded(min, max)) => {
578 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
579 }
580 }
581 }
582
583 fn c_repeat_zero_or_one(
584 &mut self,
585 expr: &Hir,
586 greedy: bool,
587 ) -> ResultOrEmpty {
588 let split_entry = self.insts.len();
589 let split = self.push_split_hole();
590 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
591 Some(p) => p,
592 None => return self.pop_split_hole(),
593 };
594 let split_hole = if greedy {
595 self.fill_split(split, Some(entry_rep), None)
596 } else {
597 self.fill_split(split, None, Some(entry_rep))
598 };
599 let holes = vec![hole_rep, split_hole];
600 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
601 }
602
603 fn c_repeat_zero_or_more(
604 &mut self,
605 expr: &Hir,
606 greedy: bool,
607 ) -> ResultOrEmpty {
608 let split_entry = self.insts.len();
609 let split = self.push_split_hole();
610 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
611 Some(p) => p,
612 None => return self.pop_split_hole(),
613 };
614
615 self.fill(hole_rep, split_entry);
616 let split_hole = if greedy {
617 self.fill_split(split, Some(entry_rep), None)
618 } else {
619 self.fill_split(split, None, Some(entry_rep))
620 };
621 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
622 }
623
624 fn c_repeat_one_or_more(
625 &mut self,
626 expr: &Hir,
627 greedy: bool,
628 ) -> ResultOrEmpty {
629 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
630 Some(p) => p,
631 None => return Ok(None),
632 };
633 self.fill_to_next(hole_rep);
634 let split = self.push_split_hole();
635
636 let split_hole = if greedy {
637 self.fill_split(split, Some(entry_rep), None)
638 } else {
639 self.fill_split(split, None, Some(entry_rep))
640 };
641 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
642 }
643
644 fn c_repeat_range_min_or_more(
645 &mut self,
646 expr: &Hir,
647 greedy: bool,
648 min: u32,
649 ) -> ResultOrEmpty {
650 let min = u32_to_usize(min);
651 // Using next_inst() is ok, because we can't return it (concat would
652 // have to return Some(_) while c_repeat_range_min_or_more returns
653 // None).
654 let patch_concat = self
655 .c_concat(iter::repeat(expr).take(min))?
656 .unwrap_or(self.next_inst());
657 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
658 self.fill(patch_concat.hole, patch_rep.entry);
659 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
660 } else {
661 Ok(None)
662 }
663 }
664
665 fn c_repeat_range(
666 &mut self,
667 expr: &Hir,
668 greedy: bool,
669 min: u32,
670 max: u32,
671 ) -> ResultOrEmpty {
672 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
673 debug_assert!(min <= max);
674 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
675 if min == max {
676 return Ok(patch_concat);
677 }
678 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
679 // max at this point).
680 let patch_concat = patch_concat.unwrap_or(self.next_inst());
681 let initial_entry = patch_concat.entry;
682 // It is much simpler to compile, e.g., `a{2,5}` as:
683 //
684 // aaa?a?a?
685 //
686 // But you end up with a sequence of instructions like this:
687 //
688 // 0: 'a'
689 // 1: 'a'
690 // 2: split(3, 4)
691 // 3: 'a'
692 // 4: split(5, 6)
693 // 5: 'a'
694 // 6: split(7, 8)
695 // 7: 'a'
696 // 8: MATCH
697 //
698 // This is *incredibly* inefficient because the splits end
699 // up forming a chain, which has to be resolved every time a
700 // transition is followed.
701 let mut holes = vec![];
702 let mut prev_hole = patch_concat.hole;
703 for _ in min..max {
704 self.fill_to_next(prev_hole);
705 let split = self.push_split_hole();
706 let Patch { hole, entry } = match self.c(expr)? {
707 Some(p) => p,
708 None => return self.pop_split_hole(),
709 };
710 prev_hole = hole;
711 if greedy {
712 holes.push(self.fill_split(split, Some(entry), None));
713 } else {
714 holes.push(self.fill_split(split, None, Some(entry)));
715 }
716 }
717 holes.push(prev_hole);
718 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
719 }
720
721 /// Can be used as a default value for the c_* functions when the call to
722 /// a c_* function is followed by inserting at least one instruction that is
723 /// always executed after the ones written by the c_* function.
724 fn next_inst(&self) -> Patch {
725 Patch { hole: Hole::None, entry: self.insts.len() }
726 }
727
728 fn fill(&mut self, hole: Hole, goto: InstPtr) {
729 match hole {
730 Hole::None => {}
731 Hole::One(pc) => {
732 self.insts[pc].fill(goto);
733 }
734 Hole::Many(holes) => {
735 for hole in holes {
736 self.fill(hole, goto);
737 }
738 }
739 }
740 }
741
742 fn fill_to_next(&mut self, hole: Hole) {
743 let next = self.insts.len();
744 self.fill(hole, next);
745 }
746
747 fn fill_split(
748 &mut self,
749 hole: Hole,
750 goto1: Option<InstPtr>,
751 goto2: Option<InstPtr>,
752 ) -> Hole {
753 match hole {
754 Hole::None => Hole::None,
755 Hole::One(pc) => match (goto1, goto2) {
756 (Some(goto1), Some(goto2)) => {
757 self.insts[pc].fill_split(goto1, goto2);
758 Hole::None
759 }
760 (Some(goto1), None) => {
761 self.insts[pc].half_fill_split_goto1(goto1);
762 Hole::One(pc)
763 }
764 (None, Some(goto2)) => {
765 self.insts[pc].half_fill_split_goto2(goto2);
766 Hole::One(pc)
767 }
768 (None, None) => unreachable!(
769 "at least one of the split \
770 holes must be filled"
771 ),
772 },
773 Hole::Many(holes) => {
774 let mut new_holes = vec![];
775 for hole in holes {
776 new_holes.push(self.fill_split(hole, goto1, goto2));
777 }
778 if new_holes.is_empty() {
779 Hole::None
780 } else if new_holes.len() == 1 {
781 new_holes.pop().unwrap()
782 } else {
783 Hole::Many(new_holes)
784 }
785 }
786 }
787 }
788
789 fn push_compiled(&mut self, inst: Inst) {
790 self.insts.push(MaybeInst::Compiled(inst));
791 }
792
793 fn push_hole(&mut self, inst: InstHole) -> Hole {
794 let hole = self.insts.len();
795 self.insts.push(MaybeInst::Uncompiled(inst));
796 Hole::One(hole)
797 }
798
799 fn push_split_hole(&mut self) -> Hole {
800 let hole = self.insts.len();
801 self.insts.push(MaybeInst::Split);
802 Hole::One(hole)
803 }
804
805 fn pop_split_hole(&mut self) -> ResultOrEmpty {
806 self.insts.pop();
807 Ok(None)
808 }
809
810 fn check_size(&self) -> result::Result<(), Error> {
811 use std::mem::size_of;
812
813 let size =
814 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
815 if size > self.size_limit {
816 Err(Error::CompiledTooBig(self.size_limit))
817 } else {
818 Ok(())
819 }
820 }
821 }
822
823 #[derive(Debug)]
824 enum Hole {
825 None,
826 One(InstPtr),
827 Many(Vec<Hole>),
828 }
829
830 impl Hole {
831 fn dup_one(self) -> (Self, Self) {
832 match self {
833 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
834 Hole::None | Hole::Many(_) => {
835 unreachable!("must be called on single hole")
836 }
837 }
838 }
839 }
840
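/// An instruction that may still be waiting for its jump target(s).
/// `Split1(goto1)` and `Split2(goto2)` are split instructions with only one
/// branch filled in; a later `fill` call supplies the missing branch and
/// turns them into a compiled `Inst::Split`.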
841 #[derive(Clone, Debug)]
842 enum MaybeInst {
843 Compiled(Inst),
844 Uncompiled(InstHole),
845 Split,
846 Split1(InstPtr),
847 Split2(InstPtr),
848 }
849
850 impl MaybeInst {
851 fn fill(&mut self, goto: InstPtr) {
852 let maybeinst = match *self {
853 MaybeInst::Split => MaybeInst::Split1(goto),
854 MaybeInst::Uncompiled(ref inst) => {
855 MaybeInst::Compiled(inst.fill(goto))
856 }
857 MaybeInst::Split1(goto1) => {
858 MaybeInst::Compiled(Inst::Split(InstSplit {
859 goto1: goto1,
860 goto2: goto,
861 }))
862 }
863 MaybeInst::Split2(goto2) => {
864 MaybeInst::Compiled(Inst::Split(InstSplit {
865 goto1: goto,
866 goto2: goto2,
867 }))
868 }
869 _ => unreachable!(
870 "not all instructions were compiled! \
871 found uncompiled instruction: {:?}",
872 self
873 ),
874 };
875 *self = maybeinst;
876 }
877
878 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
879 let filled = match *self {
880 MaybeInst::Split => {
881 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
882 }
883 _ => unreachable!(
884 "must be called on Split instruction, \
885 instead it was called on: {:?}",
886 self
887 ),
888 };
889 *self = MaybeInst::Compiled(filled);
890 }
891
892 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
893 let half_filled = match *self {
894 MaybeInst::Split => goto1,
895 _ => unreachable!(
896 "must be called on Split instruction, \
897 instead it was called on: {:?}",
898 self
899 ),
900 };
901 *self = MaybeInst::Split1(half_filled);
902 }
903
904 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
905 let half_filled = match *self {
906 MaybeInst::Split => goto2,
907 _ => unreachable!(
908 "must be called on Split instruction, \
909 instead it was called on: {:?}",
910 self
911 ),
912 };
913 *self = MaybeInst::Split2(half_filled);
914 }
915
916 fn unwrap(self) -> Inst {
917 match self {
918 MaybeInst::Compiled(inst) => inst,
919 _ => unreachable!(
920 "must be called on a compiled instruction, \
921 instead it was called on: {:?}",
922 self
923 ),
924 }
925 }
926 }
927
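/// An instruction that is fully specified except for its `goto` target.
/// `InstHole::fill` supplies the target and produces the final `Inst`.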
928 #[derive(Clone, Debug)]
929 enum InstHole {
930 Save { slot: usize },
931 EmptyLook { look: EmptyLook },
932 Char { c: char },
933 Ranges { ranges: Vec<(char, char)> },
934 Bytes { start: u8, end: u8 },
935 }
936
937 impl InstHole {
938 fn fill(&self, goto: InstPtr) -> Inst {
939 match *self {
940 InstHole::Save { slot } => {
941 Inst::Save(InstSave { goto: goto, slot: slot })
942 }
943 InstHole::EmptyLook { look } => {
944 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
945 }
946 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
947 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
948 goto: goto,
949 ranges: ranges.clone().into_boxed_slice(),
950 }),
951 InstHole::Bytes { start, end } => {
952 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
953 }
954 }
955 }
956 }
957
958 struct CompileClass<'a, 'b> {
959 c: &'a mut Compiler,
960 ranges: &'b [hir::ClassUnicodeRange],
961 }
962
963 impl<'a, 'b> CompileClass<'a, 'b> {
964 fn compile(mut self) -> Result {
965 let mut holes = vec![];
966 let mut initial_entry = None;
967 let mut last_split = Hole::None;
968 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
969 self.c.suffix_cache.clear();
970
971 for (i, range) in self.ranges.iter().enumerate() {
972 let is_last_range = i + 1 == self.ranges.len();
973 utf8_seqs.reset(range.start(), range.end());
974 let mut it = (&mut utf8_seqs).peekable();
975 loop {
976 let utf8_seq = match it.next() {
977 None => break,
978 Some(utf8_seq) => utf8_seq,
979 };
980 if is_last_range && it.peek().is_none() {
981 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
982 holes.push(hole);
983 self.c.fill(last_split, entry);
984 last_split = Hole::None;
985 if initial_entry.is_none() {
986 initial_entry = Some(entry);
987 }
988 } else {
989 if initial_entry.is_none() {
990 initial_entry = Some(self.c.insts.len());
991 }
992 self.c.fill_to_next(last_split);
993 last_split = self.c.push_split_hole();
994 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
995 holes.push(hole);
996 last_split =
997 self.c.fill_split(last_split, Some(entry), None);
998 }
999 }
1000 }
1001 self.c.utf8_seqs = Some(utf8_seqs);
1002 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1003 }
1004
1005 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1006 if self.c.compiled.is_reverse {
1007 self.c_utf8_seq_(seq)
1008 } else {
1009 self.c_utf8_seq_(seq.into_iter().rev())
1010 }
1011 }
1012
1013 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1014 where
1015 I: IntoIterator<Item = &'r Utf8Range>,
1016 {
1017 // The initial instruction for each UTF-8 sequence should be the same.
1018 let mut from_inst = ::std::usize::MAX;
1019 let mut last_hole = Hole::None;
1020 for byte_range in seq {
1021 let key = SuffixCacheKey {
1022 from_inst: from_inst,
1023 start: byte_range.start,
1024 end: byte_range.end,
1025 };
1026 {
1027 let pc = self.c.insts.len();
1028 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1029 from_inst = cached_pc;
1030 continue;
1031 }
1032 }
1033 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1034 if from_inst == ::std::usize::MAX {
1035 last_hole = self.c.push_hole(InstHole::Bytes {
1036 start: byte_range.start,
1037 end: byte_range.end,
1038 });
1039 } else {
1040 self.c.push_compiled(Inst::Bytes(InstBytes {
1041 goto: from_inst,
1042 start: byte_range.start,
1043 end: byte_range.end,
1044 }));
1045 }
1046 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1047 debug_assert!(from_inst < ::std::usize::MAX);
1048 }
1049 debug_assert!(from_inst < ::std::usize::MAX);
1050 Ok(Patch { hole: last_hole, entry: from_inst })
1051 }
1052 }
1053
1054 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1055 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1056 /// The set of byte ranges looks like this:
1057 ///
1058 /// [0-7F]
1059 /// [C2-DF][80-BF]
1060 /// [E0][A0-BF][80-BF]
1061 /// [E1-EC][80-BF][80-BF]
1062 /// [ED][80-9F][80-BF]
1063 /// [EE-EF][80-BF][80-BF]
1064 ///
1065 /// Each line above translates to one alternate in the compiled regex program.
1066 /// However, all but one of the alternates end in the same suffix, which is
1067 /// a waste of an instruction. The suffix cache facilitates reusing them across
1068 /// alternates.
1069 ///
1070 /// Note that a HashMap could be trivially used for this, but we don't need its
1071 /// overhead. Some small bounded space (LRU style) is more than enough.
1072 ///
1073 /// This uses a similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1074 /// except it uses hashes as original indices and then compares full keys for
1075 /// validation against `dense` array.
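///
/// Note that `get` below both queries and records: on a hit it returns the
/// previously cached instruction pointer, while on a miss it stores the new
/// `(key, pc)` pair and returns `None`, telling the caller to emit a fresh
/// instruction.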
1076 #[derive(Debug)]
1077 struct SuffixCache {
1078 sparse: Box<[usize]>,
1079 dense: Vec<SuffixCacheEntry>,
1080 }
1081
1082 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1083 struct SuffixCacheEntry {
1084 key: SuffixCacheKey,
1085 pc: InstPtr,
1086 }
1087
1088 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1089 struct SuffixCacheKey {
1090 from_inst: InstPtr,
1091 start: u8,
1092 end: u8,
1093 }
1094
1095 impl SuffixCache {
1096 fn new(size: usize) -> Self {
1097 SuffixCache {
1098 sparse: vec![0usize; size].into(),
1099 dense: Vec::with_capacity(size),
1100 }
1101 }
1102
1103 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1104 let hash = self.hash(&key);
1105 let pos = &mut self.sparse[hash];
1106 if let Some(entry) = self.dense.get(*pos) {
1107 if entry.key == key {
1108 return Some(entry.pc);
1109 }
1110 }
1111 *pos = self.dense.len();
1112 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1113 None
1114 }
1115
1116 fn clear(&mut self) {
1117 self.dense.clear();
1118 }
1119
1120 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1121 // Basic FNV-1a hash as described:
1122 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1123 const FNV_PRIME: u64 = 1099511628211;
1124 let mut h = 14695981039346656037;
1125 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1126 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1127 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1128 (h as usize) % self.sparse.len()
1129 }
1130 }
1131
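/// Tracks the boundaries between classes of bytes that the compiled program
/// never distinguishes. `set_range(start, end)` marks `start - 1` and `end`
/// as boundaries; `byte_classes` then scans the 256 flags and assigns
/// consecutive class ids. For example, after `set_range(b'a', b'z')`, bytes
/// below `b'a'` map to class 0, `b'a'..=b'z'` to class 1 and everything
/// above to class 2 (see the tests at the bottom of this file).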
1132 struct ByteClassSet([bool; 256]);
1133
1134 impl ByteClassSet {
1135 fn new() -> Self {
1136 ByteClassSet([false; 256])
1137 }
1138
1139 fn set_range(&mut self, start: u8, end: u8) {
1140 debug_assert!(start <= end);
1141 if start > 0 {
1142 self.0[start as usize - 1] = true;
1143 }
1144 self.0[end as usize] = true;
1145 }
1146
1147 fn set_word_boundary(&mut self) {
1148 // We need to mark all ranges of bytes whose pairs result in
1149 // evaluating \b differently.
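// Concretely, `is_word_byte` is true exactly for `[0-9A-Za-z_]`, so this
// emits one range per maximal run of word or non-word bytes (e.g.
// 0x00-0x2F, b'0'-b'9', 0x3A-0x40, b'A'-b'Z', and so on).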
1150 let iswb = is_word_byte;
1151 let mut b1: u16 = 0;
1152 let mut b2: u16;
1153 while b1 <= 255 {
1154 b2 = b1 + 1;
1155 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1156 b2 += 1;
1157 }
1158 self.set_range(b1 as u8, (b2 - 1) as u8);
1159 b1 = b2;
1160 }
1161 }
1162
1163 fn byte_classes(&self) -> Vec<u8> {
1164 // N.B. If you're debugging the DFA, it's useful to simply return
1165 // `(0..256).collect()`, which effectively removes the byte classes
1166 // and makes the transitions easier to read.
1167 // (0usize..256).map(|x| x as u8).collect()
1168 let mut byte_classes = vec![0; 256];
1169 let mut class = 0u8;
1170 let mut i = 0;
1171 loop {
1172 byte_classes[i] = class as u8;
1173 if i >= 255 {
1174 break;
1175 }
1176 if self.0[i] {
1177 class = class.checked_add(1).unwrap();
1178 }
1179 i += 1;
1180 }
1181 byte_classes
1182 }
1183 }
1184
1185 impl fmt::Debug for ByteClassSet {
1186 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1187 f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1188 }
1189 }
1190
1191 fn u32_to_usize(n: u32) -> usize {
1192 // In case usize is less than 32 bits, we need to guard against overflow.
1193 // On most platforms this compiles to nothing.
1194 // TODO Use `std::convert::TryFrom` once it's stable.
1195 if (n as u64) > (::std::usize::MAX as u64) {
1196 panic!("BUG: {} is too big to be pointer sized", n)
1197 }
1198 n as usize
1199 }
1200
1201 #[cfg(test)]
1202 mod tests {
1203 use super::ByteClassSet;
1204
1205 #[test]
1206 fn byte_classes() {
1207 let mut set = ByteClassSet::new();
1208 set.set_range(b'a', b'z');
1209 let classes = set.byte_classes();
1210 assert_eq!(classes[0], 0);
1211 assert_eq!(classes[1], 0);
1212 assert_eq!(classes[2], 0);
1213 assert_eq!(classes[b'a' as usize - 1], 0);
1214 assert_eq!(classes[b'a' as usize], 1);
1215 assert_eq!(classes[b'm' as usize], 1);
1216 assert_eq!(classes[b'z' as usize], 1);
1217 assert_eq!(classes[b'z' as usize + 1], 2);
1218 assert_eq!(classes[254], 2);
1219 assert_eq!(classes[255], 2);
1220
1221 let mut set = ByteClassSet::new();
1222 set.set_range(0, 2);
1223 set.set_range(4, 6);
1224 let classes = set.byte_classes();
1225 assert_eq!(classes[0], 0);
1226 assert_eq!(classes[1], 0);
1227 assert_eq!(classes[2], 0);
1228 assert_eq!(classes[3], 1);
1229 assert_eq!(classes[4], 2);
1230 assert_eq!(classes[5], 2);
1231 assert_eq!(classes[6], 2);
1232 assert_eq!(classes[7], 3);
1233 assert_eq!(classes[255], 3);
1234 }
1235
1236 #[test]
1237 fn full_byte_classes() {
1238 let mut set = ByteClassSet::new();
1239 for i in 0..256u16 {
1240 set.set_range(i as u8, i as u8);
1241 }
1242 assert_eq!(set.byte_classes().len(), 256);
1243 }
1244 }