1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
use std::collections::HashMap;
use std::iter;
use std::result;
use std::sync::Arc;

use syntax::{
    Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange,
    is_word_byte,
};
use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};

use prog::{
    Program, Inst, InstPtr, EmptyLook,
    InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
};

use Error;
29 type Result
= result
::Result
<Patch
, Error
>;
37 /// A compiler translates a regular expression AST to a sequence of
38 /// instructions. The sequence of instructions represents an NFA.
40 insts
: Vec
<MaybeInst
>,
42 capture_name_idx
: HashMap
<String
, usize>,
45 suffix_cache
: SuffixCache
,
46 utf8_seqs
: Option
<Utf8Sequences
>,
47 byte_classes
: ByteClassSet
,
51 /// Create a new regular expression compiler.
53 /// Various options can be set before calling `compile` on an expression.
54 pub fn new() -> Self {
57 compiled
: Program
::new(),
58 capture_name_idx
: HashMap
::new(),
60 size_limit
: 10 * (1 << 20),
61 suffix_cache
: SuffixCache
::new(1000),
62 utf8_seqs
: Some(Utf8Sequences
::new('
\x00'
, '
\x00'
)),
63 byte_classes
: ByteClassSet
::new(),
67 /// The size of the resulting program is limited by size_limit. If
68 /// the program approximately exceeds the given size (in bytes), then
69 /// compilation will stop and return an error.
70 pub fn size_limit(mut self, size_limit
: usize) -> Self {
71 self.size_limit
= size_limit
;
75 /// If bytes is true, then the program is compiled as a byte based
76 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
77 /// false, then the automaton is Unicode scalar value based, e.g., an
78 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
80 /// The specific invariant is that when returning a byte based machine,
81 /// the neither the `Char` nor `Ranges` instructions are produced.
82 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
83 /// instruction is never produced.
85 /// Note that `dfa(true)` implies `bytes(true)`.
86 pub fn bytes(mut self, yes
: bool
) -> Self {
87 self.compiled
.is_bytes
= yes
;
91 /// When disabled, the program compiled may match arbitrary bytes.
93 /// When enabled (the default), all compiled programs exclusively match
94 /// valid UTF-8 bytes.
95 pub fn only_utf8(mut self, yes
: bool
) -> Self {
96 self.compiled
.only_utf8
= yes
;
100 /// When set, the machine returned is suitable for use in the DFA matching
103 /// In particular, this ensures that if the regex is not anchored in the
104 /// beginning, then a preceding `.*?` is included in the program. (The NFA
105 /// based engines handle the preceding `.*?` explicitly, which is difficult
106 /// or impossible in the DFA engine.)
107 pub fn dfa(mut self, yes
: bool
) -> Self {
108 self.compiled
.is_dfa
= yes
;
112 /// When set, the machine returned is suitable for matching text in
113 /// reverse. In particular, all concatenations are flipped.
114 pub fn reverse(mut self, yes
: bool
) -> Self {
115 self.compiled
.is_reverse
= yes
;
119 /// Compile a regular expression given its AST.
121 /// The compiler is guaranteed to succeed unless the program exceeds the
122 /// specified size limit. If the size limit is exceeded, then compilation
123 /// stops and returns an error.
127 ) -> result
::Result
<Program
, Error
> {
128 debug_assert
!(exprs
.len() >= 1);
129 self.num_exprs
= exprs
.len();
130 if exprs
.len() == 1 {
131 self.compile_one(&exprs
[0])
133 self.compile_many(exprs
)
137 fn compile_one(mut self, expr
: &Expr
) -> result
::Result
<Program
, Error
> {
138 // If we're compiling a forward DFA and we aren't anchored, then
139 // add a `.*?` before the first capture group.
140 // Other matching engines handle this by baking the logic into the
141 // matching engine itself.
142 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
143 self.compiled
.is_anchored_start
= expr
.is_anchored_start();
144 self.compiled
.is_anchored_end
= expr
.is_anchored_end();
145 if self.compiled
.needs_dotstar() {
146 dotstar_patch
= try
!(self.c_dotstar());
147 self.compiled
.start
= dotstar_patch
.entry
;
149 self.compiled
.captures
= vec
![None
];
150 let patch
= try
!(self.c_capture(0, expr
));
151 if self.compiled
.needs_dotstar() {
152 self.fill(dotstar_patch
.hole
, patch
.entry
);
154 self.compiled
.start
= patch
.entry
;
156 self.fill_to_next(patch
.hole
);
157 self.compiled
.matches
= vec
![self.insts
.len()];
158 self.push_compiled(Inst
::Match(0));
159 self.compile_finish()
165 ) -> result
::Result
<Program
, Error
> {
166 debug_assert
!(exprs
.len() > 1);
168 self.compiled
.is_anchored_start
=
169 exprs
.iter().all(|e
| e
.is_anchored_start());
170 self.compiled
.is_anchored_end
=
171 exprs
.iter().all(|e
| e
.is_anchored_end());
172 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
173 if self.compiled
.needs_dotstar() {
174 dotstar_patch
= try
!(self.c_dotstar());
175 self.compiled
.start
= dotstar_patch
.entry
;
177 self.compiled
.start
= 0; // first instruction is always split
179 self.fill_to_next(dotstar_patch
.hole
);
181 let mut prev_hole
= Hole
::None
;
182 for (i
, expr
) in exprs
[0..exprs
.len() - 1].iter().enumerate() {
183 self.fill_to_next(prev_hole
);
184 let split
= self.push_split_hole();
185 let Patch { hole, entry }
= try
!(self.c_capture(0, expr
));
186 self.fill_to_next(hole
);
187 self.compiled
.matches
.push(self.insts
.len());
188 self.push_compiled(Inst
::Match(i
));
189 prev_hole
= self.fill_split(split
, Some(entry
), None
);
191 let i
= exprs
.len() - 1;
192 let Patch { hole, entry }
= try
!(self.c_capture(0, &exprs
[i
]));
193 self.fill(prev_hole
, entry
);
194 self.fill_to_next(hole
);
195 self.compiled
.matches
.push(self.insts
.len());
196 self.push_compiled(Inst
::Match(i
));
197 self.compile_finish()
200 fn compile_finish(mut self) -> result
::Result
<Program
, Error
> {
201 self.compiled
.insts
=
202 self.insts
.into_iter().map(|inst
| inst
.unwrap()).collect();
203 self.compiled
.byte_classes
= self.byte_classes
.byte_classes();
204 self.compiled
.capture_name_idx
= Arc
::new(self.capture_name_idx
);
208 /// Compile expr into self.insts, returning a patch on success,
209 /// or an error if we run out of memory.
211 /// All of the c_* methods of the compiler share the contract outlined
214 /// The main thing that a c_* method does is mutate `self.insts`
215 /// to add a list of mostly compiled instructions required to execute
216 /// the given expression. `self.insts` contains MaybeInsts rather than
217 /// Insts because there is some backpatching required.
219 /// The `Patch` value returned by each c_* method provides metadata
220 /// about the compiled instructions emitted to `self.insts`. The
221 /// `entry` member of the patch refers to the first instruction
222 /// (the entry point), while the `hole` member contains zero or
223 /// more offsets to partial instructions that need to be backpatched.
224 /// The c_* routine can't know where its list of instructions are going to
225 /// jump to after execution, so it is up to the caller to patch
226 /// these jumps to point to the right place. So compiling some
227 /// expression, e, we would end up with a situation that looked like:
230 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
237 /// To compile two expressions, e1 and e2, concatinated together we
241 /// let patch1 = self.c(e1);
242 /// let patch2 = self.c(e2);
245 /// while leaves us with a situation that looks like
248 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
251 /// entry1 hole1 entry2 hole2
254 /// Then to merge the two patches together into one we would backpatch
255 /// hole1 with entry2 and return a new patch that enters at entry1
256 /// and has hole2 for a hole. In fact, if you look at the c_concat
257 /// method you will see that it does exactly this, though it handles
258 /// a list of expressions rather than just the two that we use for
260 fn c(&mut self, expr
: &Expr
) -> Result
{
264 try
!(self.check_size());
266 Empty
=> Ok(Patch { hole: Hole::None, entry: self.insts.len() }
),
267 Literal { ref chars, casei }
=> self.c_literal(chars
, casei
),
268 LiteralBytes { ref bytes, casei }
=> self.c_bytes(bytes
, casei
),
269 AnyChar
=> self.c_class(&[ClassRange
{
275 ClassRange { start: '\x00', end: '\x09' }
,
276 ClassRange { start: '\x0b', end: '\u{10ffff}'
},
280 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }
])
283 self.c_class_bytes(&[
284 ByteRange { start: 0, end: 0x9 }
,
285 ByteRange { start: 0xB, end: 0xFF }
,
291 ClassBytes(ref cls
) => {
292 self.c_class_bytes(cls
)
294 StartLine
if self.compiled
.is_reverse
=> {
295 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
296 self.c_empty_look(prog
::EmptyLook
::EndLine
)
299 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
300 self.c_empty_look(prog
::EmptyLook
::StartLine
)
302 EndLine
if self.compiled
.is_reverse
=> {
303 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
304 self.c_empty_look(prog
::EmptyLook
::StartLine
)
307 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
308 self.c_empty_look(prog
::EmptyLook
::EndLine
)
310 StartText
if self.compiled
.is_reverse
=> {
311 self.c_empty_look(prog
::EmptyLook
::EndText
)
314 self.c_empty_look(prog
::EmptyLook
::StartText
)
316 EndText
if self.compiled
.is_reverse
=> {
317 self.c_empty_look(prog
::EmptyLook
::StartText
)
320 self.c_empty_look(prog
::EmptyLook
::EndText
)
323 self.compiled
.has_unicode_word_boundary
= true;
324 self.byte_classes
.set_word_boundary();
325 self.c_empty_look(prog
::EmptyLook
::WordBoundary
)
328 self.compiled
.has_unicode_word_boundary
= true;
329 self.byte_classes
.set_word_boundary();
330 self.c_empty_look(prog
::EmptyLook
::NotWordBoundary
)
332 WordBoundaryAscii
=> {
333 self.byte_classes
.set_word_boundary();
334 self.c_empty_look(prog
::EmptyLook
::WordBoundaryAscii
)
336 NotWordBoundaryAscii
=> {
337 self.byte_classes
.set_word_boundary();
338 self.c_empty_look(prog
::EmptyLook
::NotWordBoundaryAscii
)
340 Group { ref e, i: None, name: None }
=> self.c(e
),
341 Group { ref e, i, ref name }
=> {
342 // it's impossible to have a named capture without an index
343 let i
= i
.expect("capture index");
344 if i
>= self.compiled
.captures
.len() {
345 self.compiled
.captures
.push(name
.clone());
346 if let Some(ref name
) = *name
{
347 self.capture_name_idx
.insert(name
.to_owned(), i
);
350 self.c_capture(2 * i
, e
)
353 if self.compiled
.is_reverse
{
354 self.c_concat(es
.iter().rev())
359 Alternate(ref es
) => self.c_alternate(&**es
),
360 Repeat { ref e, r, greedy }
=> self.c_repeat(e
, r
, greedy
),
364 fn c_capture(&mut self, first_slot
: usize, expr
: &Expr
) -> Result
{
365 if self.num_exprs
> 1 || self.compiled
.is_dfa
{
366 // Don't ever compile Save instructions for regex sets because
367 // they are never used. They are also never used in DFA programs
368 // because DFAs can't handle captures.
371 let entry
= self.insts
.len();
372 let hole
= self.push_hole(InstHole
::Save { slot: first_slot }
);
373 let patch
= try
!(self.c(expr
));
374 self.fill(hole
, patch
.entry
);
375 self.fill_to_next(patch
.hole
);
376 let hole
= self.push_hole(InstHole
::Save { slot: first_slot + 1 }
);
377 Ok(Patch { hole: hole, entry: entry }
)
381 fn c_dotstar(&mut self) -> Result
{
382 Ok(if !self.compiled
.only_utf8() {
383 try
!(self.c(&Expr
::Repeat
{
384 e
: Box
::new(Expr
::AnyByte
),
385 r
: Repeater
::ZeroOrMore
,
389 try
!(self.c(&Expr
::Repeat
{
390 e
: Box
::new(Expr
::AnyChar
),
391 r
: Repeater
::ZeroOrMore
,
397 fn c_literal(&mut self, chars
: &[char], casei
: bool
) -> Result
{
398 debug_assert
!(!chars
.is_empty());
399 let mut chars
: Box
<Iterator
<Item
=&char>> =
400 if self.compiled
.is_reverse
{
401 Box
::new(chars
.iter().rev())
403 Box
::new(chars
.iter())
405 let first
= *chars
.next().expect("non-empty literal");
406 let Patch { mut hole, entry }
= try
!(self.c_char(first
, casei
));
408 let p
= try
!(self.c_char(c
, casei
));
409 self.fill(hole
, p
.entry
);
412 Ok(Patch { hole: hole, entry: entry }
)
415 fn c_char(&mut self, c
: char, casei
: bool
) -> Result
{
417 self.c_class(&CharClass
::new(vec
![
418 ClassRange { start: c, end: c }
,
421 self.c_class(&[ClassRange { start: c, end: c }
])
425 fn c_class(&mut self, ranges
: &[ClassRange
]) -> Result
{
426 assert
!(!ranges
.is_empty());
427 if self.compiled
.uses_bytes() {
433 let ranges
: Vec
<(char, char)> =
434 ranges
.iter().map(|r
| (r
.start
, r
.end
)).collect();
435 let hole
= if ranges
.len() == 1 && ranges
[0].0 == ranges
[0].1 {
436 self.push_hole(InstHole
::Char { c: ranges[0].0 }
)
438 self.push_hole(InstHole
::Ranges { ranges: ranges }
)
440 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
444 fn c_bytes(&mut self, bytes
: &[u8], casei
: bool
) -> Result
{
445 debug_assert
!(!bytes
.is_empty());
446 let mut bytes
: Box
<Iterator
<Item
=&u8>> =
447 if self.compiled
.is_reverse
{
448 Box
::new(bytes
.iter().rev())
450 Box
::new(bytes
.iter())
452 let first
= *bytes
.next().expect("non-empty literal");
453 let Patch { mut hole, entry }
= try
!(self.c_byte(first
, casei
));
455 let p
= try
!(self.c_byte(b
, casei
));
456 self.fill(hole
, p
.entry
);
459 Ok(Patch { hole: hole, entry: entry }
)
462 fn c_byte(&mut self, b
: u8, casei
: bool
) -> Result
{
464 self.c_class_bytes(&ByteClass
::new(vec
![
465 ByteRange { start: b, end: b }
,
468 self.c_class_bytes(&[ByteRange { start: b, end: b }
])
472 fn c_class_bytes(&mut self, ranges
: &[ByteRange
]) -> Result
{
473 debug_assert
!(!ranges
.is_empty());
475 let first_split_entry
= self.insts
.len();
476 let mut holes
= vec
![];
477 let mut prev_hole
= Hole
::None
;
478 for r
in &ranges
[0..ranges
.len() - 1] {
479 self.fill_to_next(prev_hole
);
480 let split
= self.push_split_hole();
481 let next
= self.insts
.len();
482 self.byte_classes
.set_range(r
.start
, r
.end
);
483 holes
.push(self.push_hole(InstHole
::Bytes
{
484 start
: r
.start
, end
: r
.end
,
486 prev_hole
= self.fill_split(split
, Some(next
), None
);
488 let next
= self.insts
.len();
489 let r
= &ranges
[ranges
.len() - 1];
490 self.byte_classes
.set_range(r
.start
, r
.end
);
491 holes
.push(self.push_hole(InstHole
::Bytes
{
492 start
: r
.start
, end
: r
.end
,
494 self.fill(prev_hole
, next
);
495 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
498 fn c_empty_look(&mut self, look
: EmptyLook
) -> Result
{
499 let hole
= self.push_hole(InstHole
::EmptyLook { look: look }
);
500 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
503 fn c_concat
<'a
, I
>(&mut self, exprs
: I
) -> Result
504 where I
: IntoIterator
<Item
=&'a Expr
> {
505 let mut exprs
= exprs
.into_iter();
506 let first
= match exprs
.next() {
509 return Ok(Patch { hole: Hole::None, entry: self.insts.len() }
)
512 let Patch { mut hole, entry }
= try
!(self.c(first
));
514 let p
= try
!(self.c(e
));
515 self.fill(hole
, p
.entry
);
518 Ok(Patch { hole: hole, entry: entry }
)
521 fn c_alternate(&mut self, exprs
: &[Expr
]) -> Result
{
523 exprs
.len() >= 2, "alternates must have at least 2 exprs");
525 // Initial entry point is always the first split.
526 let first_split_entry
= self.insts
.len();
528 // Save up all of the holes from each alternate. They will all get
529 // patched to point to the same location.
530 let mut holes
= vec
![];
532 let mut prev_hole
= Hole
::None
;
533 for e
in &exprs
[0..exprs
.len() - 1] {
534 self.fill_to_next(prev_hole
);
535 let split
= self.push_split_hole();
536 let Patch { hole, entry }
= try
!(self.c(e
));
538 prev_hole
= self.fill_split(split
, Some(entry
), None
);
540 let Patch { hole, entry }
= try
!(self.c(&exprs
[exprs
.len() - 1]));
542 self.fill(prev_hole
, entry
);
543 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
553 Repeater
::ZeroOrOne
=> self.c_repeat_zero_or_one(expr
, greedy
),
554 Repeater
::ZeroOrMore
=> self.c_repeat_zero_or_more(expr
, greedy
),
555 Repeater
::OneOrMore
=> self.c_repeat_one_or_more(expr
, greedy
),
556 Repeater
::Range { min, max: None }
=> {
557 self.c_repeat_range_min_or_more(expr
, greedy
, min
)
559 Repeater
::Range { min, max: Some(max) }
=> {
560 self.c_repeat_range(expr
, greedy
, min
, max
)
565 fn c_repeat_zero_or_one(
570 let split_entry
= self.insts
.len();
571 let split
= self.push_split_hole();
572 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
574 let split_hole
= if greedy
{
575 self.fill_split(split
, Some(entry_rep
), None
)
577 self.fill_split(split
, None
, Some(entry_rep
))
579 let holes
= vec
![hole_rep
, split_hole
];
580 Ok(Patch { hole: Hole::Many(holes), entry: split_entry }
)
583 fn c_repeat_zero_or_more(
588 let split_entry
= self.insts
.len();
589 let split
= self.push_split_hole();
590 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
592 self.fill(hole_rep
, split_entry
);
593 let split_hole
= if greedy
{
594 self.fill_split(split
, Some(entry_rep
), None
)
596 self.fill_split(split
, None
, Some(entry_rep
))
598 Ok(Patch { hole: split_hole, entry: split_entry }
)
601 fn c_repeat_one_or_more(
606 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
607 self.fill_to_next(hole_rep
);
608 let split
= self.push_split_hole();
610 let split_hole
= if greedy
{
611 self.fill_split(split
, Some(entry_rep
), None
)
613 self.fill_split(split
, None
, Some(entry_rep
))
615 Ok(Patch { hole: split_hole, entry: entry_rep }
)
618 fn c_repeat_range_min_or_more(
624 let min
= u32_to_usize(min
);
625 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
626 let patch_rep
= try
!(self.c_repeat_zero_or_more(expr
, greedy
));
627 self.fill(patch_concat
.hole
, patch_rep
.entry
);
628 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry }
)
638 let (min
, max
) = (u32_to_usize(min
), u32_to_usize(max
));
639 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
640 let initial_entry
= patch_concat
.entry
;
642 return Ok(patch_concat
);
644 // It is much simpler to compile, e.g., `a{2,5}` as:
648 // But you end up with a sequence of instructions like this:
660 // This is *incredibly* inefficient because the splits end
661 // up forming a chain, which has to be resolved everything a
662 // transition is followed.
663 let mut holes
= vec
![];
664 let mut prev_hole
= patch_concat
.hole
;
666 self.fill_to_next(prev_hole
);
667 let split
= self.push_split_hole();
668 let Patch { hole, entry }
= try
!(self.c(expr
));
671 holes
.push(self.fill_split(split
, Some(entry
), None
));
673 holes
.push(self.fill_split(split
, None
, Some(entry
)));
676 holes
.push(prev_hole
);
677 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry }
)
680 fn fill(&mut self, hole
: Hole
, goto
: InstPtr
) {
684 self.insts
[pc
].fill(goto
);
686 Hole
::Many(holes
) => {
688 self.fill(hole
, goto
);
694 fn fill_to_next(&mut self, hole
: Hole
) {
695 let next
= self.insts
.len();
696 self.fill(hole
, next
);
702 goto1
: Option
<InstPtr
>,
703 goto2
: Option
<InstPtr
>,
706 Hole
::None
=> Hole
::None
,
708 match (goto1
, goto2
) {
709 (Some(goto1
), Some(goto2
)) => {
710 self.insts
[pc
].fill_split(goto1
, goto2
);
713 (Some(goto1
), None
) => {
714 self.insts
[pc
].half_fill_split_goto1(goto1
);
717 (None
, Some(goto2
)) => {
718 self.insts
[pc
].half_fill_split_goto2(goto2
);
721 (None
, None
) => unreachable
!("at least one of the split \
722 holes must be filled"),
725 Hole
::Many(holes
) => {
726 let mut new_holes
= vec
![];
728 new_holes
.push(self.fill_split(hole
, goto1
, goto2
));
730 if new_holes
.is_empty() {
732 } else if new_holes
.len() == 1 {
733 new_holes
.pop().unwrap()
735 Hole
::Many(new_holes
)
741 fn push_compiled(&mut self, inst
: Inst
) {
742 self.insts
.push(MaybeInst
::Compiled(inst
));
745 fn push_hole(&mut self, inst
: InstHole
) -> Hole
{
746 let hole
= self.insts
.len();
747 self.insts
.push(MaybeInst
::Uncompiled(inst
));
751 fn push_split_hole(&mut self) -> Hole
{
752 let hole
= self.insts
.len();
753 self.insts
.push(MaybeInst
::Split
);
757 fn check_size(&self) -> result
::Result
<(), Error
> {
758 use std
::mem
::size_of
;
760 if self.insts
.len() * size_of
::<Inst
>() > self.size_limit
{
761 Err(Error
::CompiledTooBig(self.size_limit
))
775 #[derive(Clone, Debug)]
778 Uncompiled(InstHole
),
785 fn fill(&mut self, goto
: InstPtr
) {
786 let filled
= match *self {
787 MaybeInst
::Uncompiled(ref inst
) => inst
.fill(goto
),
788 MaybeInst
::Split1(goto1
) => {
789 Inst
::Split(InstSplit { goto1: goto1, goto2: goto }
)
791 MaybeInst
::Split2(goto2
) => {
792 Inst
::Split(InstSplit { goto1: goto, goto2: goto2 }
)
794 _
=> unreachable
!("not all instructions were compiled! \
795 found uncompiled instruction: {:?}", self),
797 *self = MaybeInst
::Compiled(filled
);
800 fn fill_split(&mut self, goto1
: InstPtr
, goto2
: InstPtr
) {
801 let filled
= match *self {
802 MaybeInst
::Split
=> {
803 Inst
::Split(InstSplit { goto1: goto1, goto2: goto2 }
)
805 _
=> unreachable
!("must be called on Split instruction, \
806 instead it was called on: {:?}", self),
808 *self = MaybeInst
::Compiled(filled
);
811 fn half_fill_split_goto1(&mut self, goto1
: InstPtr
) {
812 let half_filled
= match *self {
813 MaybeInst
::Split
=> goto1
,
814 _
=> unreachable
!("must be called on Split instruction, \
815 instead it was called on: {:?}", self),
817 *self = MaybeInst
::Split1(half_filled
);
820 fn half_fill_split_goto2(&mut self, goto2
: InstPtr
) {
821 let half_filled
= match *self {
822 MaybeInst
::Split
=> goto2
,
823 _
=> unreachable
!("must be called on Split instruction, \
824 instead it was called on: {:?}", self),
826 *self = MaybeInst
::Split2(half_filled
);
829 fn unwrap(self) -> Inst
{
831 MaybeInst
::Compiled(inst
) => inst
,
832 _
=> unreachable
!("must be called on a compiled instruction, \
833 instead it was called on: {:?}", self),
838 #[derive(Clone, Debug)]
840 Save { slot: usize }
,
841 EmptyLook { look: EmptyLook }
,
843 Ranges { ranges: Vec<(char, char)> }
,
844 Bytes { start: u8, end: u8 }
,
848 fn fill(&self, goto
: InstPtr
) -> Inst
{
850 InstHole
::Save { slot }
=> Inst
::Save(InstSave
{
854 InstHole
::EmptyLook { look }
=> Inst
::EmptyLook(InstEmptyLook
{
858 InstHole
::Char { c }
=> Inst
::Char(InstChar
{
862 InstHole
::Ranges { ref ranges }
=> Inst
::Ranges(InstRanges
{
864 ranges
: ranges
.clone(),
866 InstHole
::Bytes { start, end }
=> Inst
::Bytes(InstBytes
{
875 struct CompileClass
<'a
, 'b
> {
877 ranges
: &'b
[ClassRange
],
880 impl<'a
, 'b
> CompileClass
<'a
, 'b
> {
881 fn compile(mut self) -> Result
{
882 let mut holes
= vec
![];
883 let mut initial_entry
= None
;
884 let mut last_split
= Hole
::None
;
885 let mut utf8_seqs
= self.c
.utf8_seqs
.take().unwrap();
886 self.c
.suffix_cache
.clear();
888 for (i
, range
) in self.ranges
.iter().enumerate() {
889 let is_last_range
= i
+ 1 == self.ranges
.len();
890 utf8_seqs
.reset(range
.start
, range
.end
);
891 let mut it
= (&mut utf8_seqs
).peekable();
893 let utf8_seq
= match it
.next() {
895 Some(utf8_seq
) => utf8_seq
,
897 if is_last_range
&& it
.peek().is_none() {
898 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
900 self.c
.fill(last_split
, entry
);
901 last_split
= Hole
::None
;
902 if initial_entry
.is_none() {
903 initial_entry
= Some(entry
);
906 if initial_entry
.is_none() {
907 initial_entry
= Some(self.c
.insts
.len());
909 self.c
.fill_to_next(last_split
);
910 last_split
= self.c
.push_split_hole();
911 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
913 last_split
= self.c
.fill_split(last_split
, Some(entry
), None
);
917 self.c
.utf8_seqs
= Some(utf8_seqs
);
919 hole
: Hole
::Many(holes
),
920 entry
: initial_entry
.unwrap(),
924 fn c_utf8_seq(&mut self, seq
: &Utf8Sequence
) -> Result
{
925 if self.c
.compiled
.is_reverse
{
926 self.c_utf8_seq_(seq
)
928 self.c_utf8_seq_(seq
.into_iter().rev())
932 fn c_utf8_seq_
<'r
, I
>(&mut self, seq
: I
) -> Result
933 where I
: IntoIterator
<Item
=&'r Utf8Range
> {
934 // The initial instruction for each UTF-8 sequence should be the same.
935 let mut from_inst
= ::std
::usize::MAX
;
936 let mut last_hole
= Hole
::None
;
937 for byte_range
in seq
{
938 let key
= SuffixCacheKey
{
939 from_inst
: from_inst
,
940 start
: byte_range
.start
,
944 let pc
= self.c
.insts
.len();
945 if let Some(cached_pc
) = self.c
.suffix_cache
.get(key
, pc
) {
946 from_inst
= cached_pc
;
950 self.c
.byte_classes
.set_range(byte_range
.start
, byte_range
.end
);
951 if from_inst
== ::std
::usize::MAX
{
952 last_hole
= self.c
.push_hole(InstHole
::Bytes
{
953 start
: byte_range
.start
,
957 self.c
.push_compiled(Inst
::Bytes(InstBytes
{
959 start
: byte_range
.start
,
963 from_inst
= self.c
.insts
.len().checked_sub(1).unwrap();
964 debug_assert
!(from_inst
< ::std
::usize::MAX
);
966 debug_assert
!(from_inst
< ::std
::usize::MAX
);
967 Ok(Patch { hole: last_hole, entry: from_inst }
)
971 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
972 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
973 /// The set of byte ranges looks like this:
977 /// [E0][A0-BF][80-BF]
978 /// [E1-EC][80-BF][80-BF]
979 /// [ED][80-9F][80-BF]
980 /// [EE-EF][80-BF][80-BF]
982 /// Each line above translates to one alternate in the compiled regex program.
983 /// However, all but one of the alternates end in the same suffix, which is
984 /// a waste of an instruction. The suffix cache facilitates reusing them across
987 /// Note that a HashMap could be trivially used for this, but we don't need its
988 /// overhead. Some small bounded space (LRU style) is more than enough.
990 table
: Vec
<SuffixCacheEntry
>,
991 // Every time the cache is cleared, we increment the version number instead
992 // of actually zeroing memory. Since we store a copy of the current version
993 // in every element, all we need to do is make sure to invalidate any stale
994 // entries upon access. This saves quite a bit of time!
998 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
999 struct SuffixCacheEntry
{
1000 key
: SuffixCacheKey
,
1005 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1006 struct SuffixCacheKey
{
1013 fn new(size
: usize) -> Self {
1015 table
: vec
![SuffixCacheEntry
::default(); size
],
1020 fn get(&mut self, key
: SuffixCacheKey
, pc
: InstPtr
) -> Option
<InstPtr
> {
1021 let h
= self.hash(&key
);
1022 let e
= self.table
[h
];
1023 if e
.key
== key
&& e
.version
== self.version
{
1026 self.table
[h
] = SuffixCacheEntry
{
1029 version
: self.version
,
1035 fn clear(&mut self) {
1039 fn hash(&self, suffix
: &SuffixCacheKey
) -> usize {
1040 // Basic FNV-1a hash as described:
1041 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1042 const FNV_PRIME
: u64 = 1099511628211;
1043 let mut h
= 14695981039346656037;
1044 h
= (h ^
(suffix
.from_inst
as u64)).wrapping_mul(FNV_PRIME
);
1045 h
= (h ^
(suffix
.start
as u64)).wrapping_mul(FNV_PRIME
);
1046 h
= (h ^
(suffix
.end
as u64)).wrapping_mul(FNV_PRIME
);
1047 (h
as usize) % self.table
.len()
/// Tracks, for every byte value, whether it ends an equivalence-class range.
/// `0[b] == true` means byte `b` and `b + 1` belong to different classes.
struct ByteClassSet([bool; 256]);
1055 ByteClassSet([false; 256])
1058 fn set_range(&mut self, start
: u8, end
: u8) {
1059 debug_assert
!(start
<= end
);
1061 self.0[start
as usize - 1] = true;
1063 self.0[end
as usize] = true;
1066 fn set_word_boundary(&mut self) {
1067 // We need to mark all ranges of bytes whose pairs result in
1068 // evaluating \b differently.
1069 let iswb
= is_word_byte
;
1070 let mut b1
: u16 = 0;
1074 while b2
<= 255 && iswb(b1
as u8) == iswb(b2
as u8) {
1077 self.set_range(b1
as u8, (b2
- 1) as u8);
1082 fn byte_classes(&self) -> Vec
<u8> {
1083 // N.B. If you're debugging the DFA, it's useful to simply return
1084 // `(0..256).collect()`, which effectively removes the byte classes
1085 // and makes the transitions easier to read.
1086 // (0usize..256).map(|x| x as u8).collect()
1087 let mut byte_classes
= vec
![0; 256];
1088 let mut class
= 0u8;
1091 byte_classes
[i
] = class
as u8;
1096 class
= class
.checked_add(1).unwrap();
/// Converts a `u32` to a `usize`, panicking if it does not fit.
fn u32_to_usize(n: u32) -> usize {
    // In case usize is less than 32 bits, we need to guard against overflow.
    // On most platforms this compiles to nothing.
    // TODO Use `std::convert::TryFrom` once it's stable.
    if (n as u64) > (::std::usize::MAX as u64) {
        panic!("BUG: {} is too big to be pointer sized", n)
    }
    n as usize
}
1116 use super::ByteClassSet
;
1120 let mut set
= ByteClassSet
::new();
1121 set
.set_range(b'a'
, b'z'
);
1122 let classes
= set
.byte_classes();
1123 assert_eq
!(classes
[0], 0);
1124 assert_eq
!(classes
[1], 0);
1125 assert_eq
!(classes
[2], 0);
1126 assert_eq
!(classes
[b'a'
as usize - 1], 0);
1127 assert_eq
!(classes
[b'a'
as usize], 1);
1128 assert_eq
!(classes
[b'm'
as usize], 1);
1129 assert_eq
!(classes
[b'z'
as usize], 1);
1130 assert_eq
!(classes
[b'z'
as usize + 1], 2);
1131 assert_eq
!(classes
[254], 2);
1132 assert_eq
!(classes
[255], 2);
1134 let mut set
= ByteClassSet
::new();
1135 set
.set_range(0, 2);
1136 set
.set_range(4, 6);
1137 let classes
= set
.byte_classes();
1138 assert_eq
!(classes
[0], 0);
1139 assert_eq
!(classes
[1], 0);
1140 assert_eq
!(classes
[2], 0);
1141 assert_eq
!(classes
[3], 1);
1142 assert_eq
!(classes
[4], 2);
1143 assert_eq
!(classes
[5], 2);
1144 assert_eq
!(classes
[6], 2);
1145 assert_eq
!(classes
[7], 3);
1146 assert_eq
!(classes
[255], 3);
1150 fn full_byte_classes() {
1151 let mut set
= ByteClassSet
::new();
1152 for i
in 0..256u16 {
1153 set
.set_range(i
as u8, i
as u8);
1155 assert_eq
!(set
.byte_classes().len(), 256);