src/vendor/regex/src/compile.rs

   1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::collections::HashMap;
  12 use std::iter;
  13 use std::result;
  14 use std::sync::Arc;
  15
  16 use syntax::{
  17     Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange,
  18     is_word_byte,
  19 };
  20 use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
  21
  22 use prog::{
  23     Program, Inst, InstPtr, EmptyLook,
  24     InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
  25 };
  26
  27 use Error;
  28
  29 type Result = result::Result<Patch, Error>;
  30
  31 #[derive(Debug)]
  32 struct Patch {
  33     hole: Hole,
  34     entry: InstPtr,
  35 }
  36
  37 /// A compiler translates a regular expression AST to a sequence of
  38 /// instructions. The sequence of instructions represents an NFA.
  39 pub struct Compiler {
  40     insts: Vec<MaybeInst>,
  41     compiled: Program,
  42     capture_name_idx: HashMap<String, usize>,
  43     num_exprs: usize,
  44     size_limit: usize,
  45     suffix_cache: SuffixCache,
  46     utf8_seqs: Option<Utf8Sequences>,
  47     byte_classes: ByteClassSet,
  48 }
  49
  50 impl Compiler {
  51     /// Create a new regular expression compiler.
  52     ///
  53     /// Various options can be set before calling `compile` on an expression.
  54     pub fn new() -> Self {
  55         Compiler {
  56             insts: vec![],
  57             compiled: Program::new(),
  58             capture_name_idx: HashMap::new(),
  59             num_exprs: 0,
  60             size_limit: 10 * (1 << 20),
  61             suffix_cache: SuffixCache::new(1000),
  62             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
  63             byte_classes: ByteClassSet::new(),
  64         }
  65     }
  66
  67     /// The size of the resulting program is limited by size_limit. If
  68     /// the program approximately exceeds the given size (in bytes), then
  69     /// compilation will stop and return an error.
  70     pub fn size_limit(mut self, size_limit: usize) -> Self {
  71         self.size_limit = size_limit;
  72         self
  73     }
  74
  75     /// If bytes is true, then the program is compiled as a byte based
  76     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
  77     /// false, then the automaton is Unicode scalar value based, e.g., an
  78     /// engine utilizing such an automaton is resposible for UTF-8 decoding.
  79     ///
  80     /// The specific invariant is that when returning a byte based machine,
  81     /// the neither the `Char` nor `Ranges` instructions are produced.
  82     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
  83     /// instruction is never produced.
  84     ///
  85     /// Note that `dfa(true)` implies `bytes(true)`.
  86     pub fn bytes(mut self, yes: bool) -> Self {
  87         self.compiled.is_bytes = yes;
  88         self
  89     }
  90
  91     /// When disabled, the program compiled may match arbitrary bytes.
  92     ///
  93     /// When enabled (the default), all compiled programs exclusively match
  94     /// valid UTF-8 bytes.
  95     pub fn only_utf8(mut self, yes: bool) -> Self {
  96         self.compiled.only_utf8 = yes;
  97         self
  98     }
  99
 100     /// When set, the machine returned is suitable for use in the DFA matching
 101     /// engine.
 102     ///
 103     /// In particular, this ensures that if the regex is not anchored in the
 104     /// beginning, then a preceding `.*?` is included in the program. (The NFA
 105     /// based engines handle the preceding `.*?` explicitly, which is difficult
 106     /// or impossible in the DFA engine.)
 107     pub fn dfa(mut self, yes: bool) -> Self {
 108         self.compiled.is_dfa = yes;
 109         self
 110     }
 111
 112     /// When set, the machine returned is suitable for matching text in
 113     /// reverse. In particular, all concatenations are flipped.
 114     pub fn reverse(mut self, yes: bool) -> Self {
 115         self.compiled.is_reverse = yes;
 116         self
 117     }
 118
 119     /// Compile a regular expression given its AST.
 120     ///
 121     /// The compiler is guaranteed to succeed unless the program exceeds the
 122     /// specified size limit. If the size limit is exceeded, then compilation
 123     /// stops and returns an error.
 124     pub fn compile(
 125         mut self,
 126         exprs: &[Expr],
 127     ) -> result::Result<Program, Error> {
 128         debug_assert!(exprs.len() >= 1);
 129         self.num_exprs = exprs.len();
 130         if exprs.len() == 1 {
 131             self.compile_one(&exprs[0])
 132         } else {
 133             self.compile_many(exprs)
 134         }
 135     }
 136
 137     fn compile_one(mut self, expr: &Expr) -> result::Result<Program, Error> {
 138         // If we're compiling a forward DFA and we aren't anchored, then
 139         // add a `.*?` before the first capture group.
 140         // Other matching engines handle this by baking the logic into the
 141         // matching engine itself.
 142         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
 143         self.compiled.is_anchored_start = expr.is_anchored_start();
 144         self.compiled.is_anchored_end = expr.is_anchored_end();
 145         if self.compiled.needs_dotstar() {
 146             dotstar_patch = try!(self.c_dotstar());
 147             self.compiled.start = dotstar_patch.entry;
 148         }
 149         self.compiled.captures = vec![None];
 150         let patch = try!(self.c_capture(0, expr));
 151         if self.compiled.needs_dotstar() {
 152             self.fill(dotstar_patch.hole, patch.entry);
 153         } else {
 154             self.compiled.start = patch.entry;
 155         }
 156         self.fill_to_next(patch.hole);
 157         self.compiled.matches = vec![self.insts.len()];
 158         self.push_compiled(Inst::Match(0));
 159         self.compile_finish()
 160     }
 161
 162     fn compile_many(
 163         mut self,
 164         exprs: &[Expr],
 165     ) -> result::Result<Program, Error> {
 166         debug_assert!(exprs.len() > 1);
 167
 168         self.compiled.is_anchored_start =
 169             exprs.iter().all(|e| e.is_anchored_start());
 170         self.compiled.is_anchored_end =
 171             exprs.iter().all(|e| e.is_anchored_end());
 172         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
 173         if self.compiled.needs_dotstar() {
 174             dotstar_patch = try!(self.c_dotstar());
 175             self.compiled.start = dotstar_patch.entry;
 176         } else {
 177             self.compiled.start = 0; // first instruction is always split
 178         }
 179         self.fill_to_next(dotstar_patch.hole);
 180
 181         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
 182             let split = self.push_split_hole();
 183             let Patch { hole, entry } = try!(self.c_capture(0, expr));
 184             self.fill_to_next(hole);
 185             self.compiled.matches.push(self.insts.len());
 186             self.push_compiled(Inst::Match(i));
 187
 188             let next = self.insts.len();
 189             self.fill_split(split, Some(entry), Some(next));
 190         }
 191         let i = exprs.len() - 1;
 192         let Patch { hole, .. } = try!(self.c_capture(0, &exprs[i]));
 193         self.fill_to_next(hole);
 194         self.compiled.matches.push(self.insts.len());
 195         self.push_compiled(Inst::Match(i));
 196         self.compile_finish()
 197     }
 198
 199     fn compile_finish(mut self) -> result::Result<Program, Error> {
 200         self.compiled.insts =
 201             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
 202         self.compiled.byte_classes = self.byte_classes.byte_classes();
 203         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
 204         Ok(self.compiled)
 205     }
 206
 207     fn c(&mut self, expr: &Expr) -> Result {
 208         use prog;
 209         use syntax::Expr::*;
 210
 211         try!(self.check_size());
 212         match *expr {
 213             Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
 214             Literal { ref chars, casei } => self.c_literal(chars, casei),
 215             LiteralBytes { ref bytes, casei } => self.c_bytes(bytes, casei),
 216             AnyChar => self.c_class(&[ClassRange {
 217                 start: '\x00',
 218                 end: '\u{10ffff}',
 219             }]),
 220             AnyCharNoNL => {
 221                 self.c_class(&[
 222                     ClassRange { start: '\x00', end: '\x09' },
 223                     ClassRange { start: '\x0b', end: '\u{10ffff}' },
 224                 ])
 225             }
 226             AnyByte => {
 227                 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }])
 228             }
 229             AnyByteNoNL => {
 230                 self.c_class_bytes(&[
 231                     ByteRange { start: 0, end: 0x9 },
 232                     ByteRange { start: 0xB, end: 0xFF },
 233                 ])
 234             }
 235             Class(ref cls) => {
 236                 self.c_class(cls)
 237             }
 238             ClassBytes(ref cls) => {
 239                 self.c_class_bytes(cls)
 240             }
 241             StartLine if self.compiled.is_reverse => {
 242                 self.byte_classes.set_range(b'\n', b'\n');
 243                 self.c_empty_look(prog::EmptyLook::EndLine)
 244             }
 245             StartLine => {
 246                 self.byte_classes.set_range(b'\n', b'\n');
 247                 self.c_empty_look(prog::EmptyLook::StartLine)
 248             }
 249             EndLine if self.compiled.is_reverse => {
 250                 self.byte_classes.set_range(b'\n', b'\n');
 251                 self.c_empty_look(prog::EmptyLook::StartLine)
 252             }
 253             EndLine => {
 254                 self.byte_classes.set_range(b'\n', b'\n');
 255                 self.c_empty_look(prog::EmptyLook::EndLine)
 256             }
 257             StartText if self.compiled.is_reverse => {
 258                 self.c_empty_look(prog::EmptyLook::EndText)
 259             }
 260             StartText => {
 261                 self.c_empty_look(prog::EmptyLook::StartText)
 262             }
 263             EndText if self.compiled.is_reverse => {
 264                 self.c_empty_look(prog::EmptyLook::StartText)
 265             }
 266             EndText => {
 267                 self.c_empty_look(prog::EmptyLook::EndText)
 268             }
 269             WordBoundary => {
 270                 self.compiled.has_unicode_word_boundary = true;
 271                 self.byte_classes.set_word_boundary();
 272                 self.c_empty_look(prog::EmptyLook::WordBoundary)
 273             }
 274             NotWordBoundary => {
 275                 self.compiled.has_unicode_word_boundary = true;
 276                 self.byte_classes.set_word_boundary();
 277                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
 278             }
 279             WordBoundaryAscii => {
 280                 self.byte_classes.set_word_boundary();
 281                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
 282             }
 283             NotWordBoundaryAscii => {
 284                 self.byte_classes.set_word_boundary();
 285                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
 286             }
 287             Group { ref e, i: None, name: None } => self.c(e),
 288             Group { ref e, i, ref name } => {
 289                 // it's impossible to have a named capture without an index
 290                 let i = i.expect("capture index");
 291                 if i >= self.compiled.captures.len() {
 292                     self.compiled.captures.push(name.clone());
 293                     if let Some(ref name) = *name {
 294                         self.capture_name_idx.insert(name.to_owned(), i);
 295                     }
 296                 }
 297                 self.c_capture(2 * i, e)
 298             }
 299             Concat(ref es) => {
 300                 if self.compiled.is_reverse {
 301                     self.c_concat(es.iter().rev())
 302                 } else {
 303                     self.c_concat(es)
 304                 }
 305             }
 306             Alternate(ref es) => self.c_alternate(&**es),
 307             Repeat { ref e, r, greedy } => self.c_repeat(e, r, greedy),
 308         }
 309     }
 310
 311     fn c_capture(&mut self, first_slot: usize, expr: &Expr) -> Result {
 312         if self.num_exprs > 1 || self.compiled.is_dfa {
 313             // Don't ever compile Save instructions for regex sets because
 314             // they are never used. They are also never used in DFA programs
 315             // because DFAs can't handle captures.
 316             self.c(expr)
 317         } else {
 318             let entry = self.insts.len();
 319             let hole = self.push_hole(InstHole::Save { slot: first_slot });
 320             let patch = try!(self.c(expr));
 321             self.fill(hole, patch.entry);
 322             self.fill_to_next(patch.hole);
 323             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
 324             Ok(Patch { hole: hole, entry: entry })
 325         }
 326     }
 327
 328     fn c_dotstar(&mut self) -> Result {
 329         Ok(if !self.compiled.only_utf8() {
 330             try!(self.c(&Expr::Repeat {
 331                 e: Box::new(Expr::AnyByte),
 332                 r: Repeater::ZeroOrMore,
 333                 greedy: false,
 334             }))
 335         } else {
 336             try!(self.c(&Expr::Repeat {
 337                 e: Box::new(Expr::AnyChar),
 338                 r: Repeater::ZeroOrMore,
 339                 greedy: false,
 340             }))
 341         })
 342     }
 343
 344     fn c_literal(&mut self, chars: &[char], casei: bool) -> Result {
 345         debug_assert!(!chars.is_empty());
 346         let mut chars: Box<Iterator<Item=&char>> =
 347             if self.compiled.is_reverse {
 348                 Box::new(chars.iter().rev())
 349             } else {
 350                 Box::new(chars.iter())
 351             };
 352         let first = *chars.next().expect("non-empty literal");
 353         let Patch { mut hole, entry } = try!(self.c_char(first, casei));
 354         for &c in chars {
 355             let p = try!(self.c_char(c, casei));
 356             self.fill(hole, p.entry);
 357             hole = p.hole;
 358         }
 359         Ok(Patch { hole: hole, entry: entry })
 360     }
 361
 362     fn c_char(&mut self, c: char, casei: bool) -> Result {
 363         if casei {
 364             self.c_class(&CharClass::new(vec![
 365                 ClassRange { start: c, end: c },
 366             ]).case_fold())
 367         } else {
 368             self.c_class(&[ClassRange { start: c, end: c }])
 369         }
 370     }
 371
 372     fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
 373         assert!(!ranges.is_empty());
 374         if self.compiled.uses_bytes() {
 375             CompileClass {
 376                 c: self,
 377                 ranges: ranges,
 378             }.compile()
 379         } else {
 380             let ranges: Vec<(char, char)> =
 381                 ranges.iter().map(|r| (r.start, r.end)).collect();
 382             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
 383                 self.push_hole(InstHole::Char { c: ranges[0].0 })
 384             } else {
 385                 self.push_hole(InstHole::Ranges { ranges: ranges })
 386             };
 387             Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
 388         }
 389     }
 390
 391     fn c_bytes(&mut self, bytes: &[u8], casei: bool) -> Result {
 392         debug_assert!(!bytes.is_empty());
 393         let mut bytes: Box<Iterator<Item=&u8>> =
 394             if self.compiled.is_reverse {
 395                 Box::new(bytes.iter().rev())
 396             } else {
 397                 Box::new(bytes.iter())
 398             };
 399         let first = *bytes.next().expect("non-empty literal");
 400         let Patch { mut hole, entry } = try!(self.c_byte(first, casei));
 401         for &b in bytes {
 402             let p = try!(self.c_byte(b, casei));
 403             self.fill(hole, p.entry);
 404             hole = p.hole;
 405         }
 406         Ok(Patch { hole: hole, entry: entry })
 407     }
 408
 409     fn c_byte(&mut self, b: u8, casei: bool) -> Result {
 410         if casei {
 411             self.c_class_bytes(&ByteClass::new(vec![
 412                 ByteRange { start: b, end: b },
 413             ]).case_fold())
 414         } else {
 415             self.c_class_bytes(&[ByteRange { start: b, end: b }])
 416         }
 417     }
 418
 419     fn c_class_bytes(&mut self, ranges: &[ByteRange]) -> Result {
 420         debug_assert!(!ranges.is_empty());
 421
 422         let first_split_entry = self.insts.len();
 423         let mut holes = vec![];
 424         let mut prev_hole = Hole::None;
 425         for r in &ranges[0..ranges.len() - 1] {
 426             self.fill_to_next(prev_hole);
 427             let split = self.push_split_hole();
 428             let next = self.insts.len();
 429             self.byte_classes.set_range(r.start, r.end);
 430             holes.push(self.push_hole(InstHole::Bytes {
 431                 start: r.start, end: r.end,
 432             }));
 433             prev_hole = self.fill_split(split, Some(next), None);
 434         }
 435         let next = self.insts.len();
 436         let r = &ranges[ranges.len() - 1];
 437         self.byte_classes.set_range(r.start, r.end);
 438         holes.push(self.push_hole(InstHole::Bytes {
 439             start: r.start, end: r.end,
 440         }));
 441         self.fill(prev_hole, next);
 442         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
 443     }
 444
 445     fn c_empty_look(&mut self, look: EmptyLook) -> Result {
 446         let hole = self.push_hole(InstHole::EmptyLook { look: look });
 447         Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
 448     }
 449
 450     fn c_concat<'a, I>(&mut self, exprs: I) -> Result
 451             where I: IntoIterator<Item=&'a Expr> {
 452         let mut exprs = exprs.into_iter();
 453         let first = match exprs.next() {
 454             Some(expr) => expr,
 455             None => {
 456                 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
 457             }
 458         };
 459         let Patch { mut hole, entry } = try!(self.c(first));
 460         for e in exprs {
 461             let p = try!(self.c(e));
 462             self.fill(hole, p.entry);
 463             hole = p.hole;
 464         }
 465         Ok(Patch { hole: hole, entry: entry })
 466     }
 467
 468     fn c_alternate(&mut self, exprs: &[Expr]) -> Result {
 469         debug_assert!(
 470             exprs.len() >= 2, "alternates must have at least 2 exprs");
 471
 472         // Initial entry point is always the first split.
 473         let first_split_entry = self.insts.len();
 474
 475         // Save up all of the holes from each alternate. They will all get
 476         // patched to point to the same location.
 477         let mut holes = vec![];
 478
 479         let mut prev_hole = Hole::None;
 480         for e in &exprs[0..exprs.len() - 1] {
 481             self.fill_to_next(prev_hole);
 482             let split = self.push_split_hole();
 483             let Patch { hole, entry } = try!(self.c(e));
 484             holes.push(hole);
 485             prev_hole = self.fill_split(split, Some(entry), None);
 486         }
 487         let Patch { hole, entry } = try!(self.c(&exprs[exprs.len() - 1]));
 488         holes.push(hole);
 489         self.fill(prev_hole, entry);
 490         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
 491     }
 492
 493     fn c_repeat(
 494         &mut self,
 495         expr: &Expr,
 496         kind: Repeater,
 497         greedy: bool,
 498     ) -> Result {
 499         match kind {
 500             Repeater::ZeroOrOne => self.c_repeat_zero_or_one(expr, greedy),
 501             Repeater::ZeroOrMore => self.c_repeat_zero_or_more(expr, greedy),
 502             Repeater::OneOrMore => self.c_repeat_one_or_more(expr, greedy),
 503             Repeater::Range { min, max: None } => {
 504                 self.c_repeat_range_min_or_more(expr, greedy, min)
 505             }
 506             Repeater::Range { min, max: Some(max) } => {
 507                 self.c_repeat_range(expr, greedy, min, max)
 508             }
 509         }
 510     }
 511
 512     fn c_repeat_zero_or_one(
 513         &mut self,
 514         expr: &Expr,
 515         greedy: bool,
 516     ) -> Result {
 517         let split_entry = self.insts.len();
 518         let split = self.push_split_hole();
 519         let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
 520
 521         let split_hole = if greedy {
 522             self.fill_split(split, Some(entry_rep), None)
 523         } else {
 524             self.fill_split(split, None, Some(entry_rep))
 525         };
 526         let holes = vec![hole_rep, split_hole];
 527         Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
 528     }
 529
 530     fn c_repeat_zero_or_more(
 531         &mut self,
 532         expr: &Expr,
 533         greedy: bool,
 534     ) -> Result {
 535         let split_entry = self.insts.len();
 536         let split = self.push_split_hole();
 537         let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
 538
 539         self.fill(hole_rep, split_entry);
 540         let split_hole = if greedy {
 541             self.fill_split(split, Some(entry_rep), None)
 542         } else {
 543             self.fill_split(split, None, Some(entry_rep))
 544         };
 545         Ok(Patch { hole: split_hole, entry: split_entry })
 546     }
 547
 548     fn c_repeat_one_or_more(
 549         &mut self,
 550         expr: &Expr,
 551         greedy: bool,
 552     ) -> Result {
 553         let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
 554         self.fill_to_next(hole_rep);
 555         let split = self.push_split_hole();
 556
 557         let split_hole = if greedy {
 558             self.fill_split(split, Some(entry_rep), None)
 559         } else {
 560             self.fill_split(split, None, Some(entry_rep))
 561         };
 562         Ok(Patch { hole: split_hole, entry: entry_rep })
 563     }
 564
 565     fn c_repeat_range_min_or_more(
 566         &mut self,
 567         expr: &Expr,
 568         greedy: bool,
 569         min: u32,
 570     ) -> Result {
 571         let min = u32_to_usize(min);
 572         let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
 573         let patch_rep = try!(self.c_repeat_zero_or_more(expr, greedy));
 574         self.fill(patch_concat.hole, patch_rep.entry);
 575         Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
 576     }
 577
 578     fn c_repeat_range(
 579         &mut self,
 580         expr: &Expr,
 581         greedy: bool,
 582         min: u32,
 583         max: u32,
 584     ) -> Result {
 585         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
 586         let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
 587         let initial_entry = patch_concat.entry;
 588         if min == max {
 589             return Ok(patch_concat);
 590         }
 591         // It is much simpler to compile, e.g., `a{2,5}` as:
 592         //
 593         //     aaa?a?a?
 594         //
 595         // But you end up with a sequence of instructions like this:
 596         //
 597         //     0: 'a'
 598         //     1: 'a',
 599         //     2: split(3, 4)
 600         //     3: 'a'
 601         //     4: split(5, 6)
 602         //     5: 'a'
 603         //     6: split(7, 8)
 604         //     7: 'a'
 605         //     8: MATCH
 606         //
 607         // This is *incredibly* inefficient because the splits end
 608         // up forming a chain, which has to be resolved everything a
 609         // transition is followed.
 610         let mut holes = vec![];
 611         let mut prev_hole = patch_concat.hole;
 612         for _ in min..max {
 613             self.fill_to_next(prev_hole);
 614             let split = self.push_split_hole();
 615             let Patch { hole, entry } = try!(self.c(expr));
 616             prev_hole = hole;
 617             if greedy {
 618                 holes.push(self.fill_split(split, Some(entry), None));
 619             } else {
 620                 holes.push(self.fill_split(split, None, Some(entry)));
 621             }
 622         }
 623         holes.push(prev_hole);
 624         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
 625     }
 626
 627     fn fill(&mut self, hole: Hole, goto: InstPtr) {
 628         match hole {
 629             Hole::None => {}
 630             Hole::One(pc) => {
 631                 self.insts[pc].fill(goto);
 632             }
 633             Hole::Many(holes) => {
 634                 for hole in holes {
 635                     self.fill(hole, goto);
 636                 }
 637             }
 638         }
 639     }
 640
 641     fn fill_to_next(&mut self, hole: Hole) {
 642         let next = self.insts.len();
 643         self.fill(hole, next);
 644     }
 645
 646     fn fill_split(
 647         &mut self,
 648         hole: Hole,
 649         goto1: Option<InstPtr>,
 650         goto2: Option<InstPtr>,
 651     ) -> Hole {
 652         match hole {
 653             Hole::None => Hole::None,
 654             Hole::One(pc) => {
 655                 match (goto1, goto2) {
 656                     (Some(goto1), Some(goto2)) => {
 657                         self.insts[pc].fill_split(goto1, goto2);
 658                         Hole::None
 659                     }
 660                     (Some(goto1), None) => {
 661                         self.insts[pc].half_fill_split_goto1(goto1);
 662                         Hole::One(pc)
 663                     }
 664                     (None, Some(goto2)) => {
 665                         self.insts[pc].half_fill_split_goto2(goto2);
 666                         Hole::One(pc)
 667                     }
 668                     (None, None) => unreachable!("at least one of the split \
 669                                                   holes must be filled"),
 670                 }
 671             }
 672             Hole::Many(holes) => {
 673                 let mut new_holes = vec![];
 674                 for hole in holes {
 675                     new_holes.push(self.fill_split(hole, goto1, goto2));
 676                 }
 677                 if new_holes.is_empty() {
 678                     Hole::None
 679                 } else if new_holes.len() == 1 {
 680                     new_holes.pop().unwrap()
 681                 } else {
 682                     Hole::Many(new_holes)
 683                 }
 684             }
 685         }
 686     }
 687
 688     fn push_compiled(&mut self, inst: Inst) {
 689         self.insts.push(MaybeInst::Compiled(inst));
 690     }
 691
 692     fn push_hole(&mut self, inst: InstHole) -> Hole {
 693         let hole = self.insts.len();
 694         self.insts.push(MaybeInst::Uncompiled(inst));
 695         Hole::One(hole)
 696     }
 697
 698     fn push_split_hole(&mut self) -> Hole {
 699         let hole = self.insts.len();
 700         self.insts.push(MaybeInst::Split);
 701         Hole::One(hole)
 702     }
 703
 704     fn check_size(&self) -> result::Result<(), Error> {
 705         use std::mem::size_of;
 706
 707         if self.insts.len() * size_of::<Inst>() > self.size_limit {
 708             Err(Error::CompiledTooBig(self.size_limit))
 709         } else {
 710             Ok(())
 711         }
 712     }
 713 }
 714
 715 #[derive(Debug)]
 716 enum Hole {
 717     None,
 718     One(InstPtr),
 719     Many(Vec<Hole>),
 720 }
 721
 722 #[derive(Clone, Debug)]
 723 enum MaybeInst {
 724     Compiled(Inst),
 725     Uncompiled(InstHole),
 726     Split,
 727     Split1(InstPtr),
 728     Split2(InstPtr),
 729 }
 730
 731 impl MaybeInst {
 732     fn fill(&mut self, goto: InstPtr) {
 733         let filled = match *self {
 734             MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
 735             MaybeInst::Split1(goto1) => {
 736                 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
 737             }
 738             MaybeInst::Split2(goto2) => {
 739                 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
 740             }
 741             _ => unreachable!("not all instructions were compiled! \
 742                                found uncompiled instruction: {:?}", self),
 743         };
 744         *self = MaybeInst::Compiled(filled);
 745     }
 746
 747     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
 748         let filled = match *self {
 749             MaybeInst::Split => {
 750                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
 751             }
 752             _ => unreachable!("must be called on Split instruction, \
 753                                instead it was called on: {:?}", self),
 754         };
 755         *self = MaybeInst::Compiled(filled);
 756     }
 757
 758     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
 759         let half_filled = match *self {
 760             MaybeInst::Split => goto1,
 761             _ => unreachable!("must be called on Split instruction, \
 762                                instead it was called on: {:?}", self),
 763         };
 764         *self = MaybeInst::Split1(half_filled);
 765     }
 766
 767     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
 768         let half_filled = match *self {
 769             MaybeInst::Split => goto2,
 770             _ => unreachable!("must be called on Split instruction, \
 771                                instead it was called on: {:?}", self),
 772         };
 773         *self = MaybeInst::Split2(half_filled);
 774     }
 775
 776     fn unwrap(self) -> Inst {
 777         match self {
 778             MaybeInst::Compiled(inst) => inst,
 779             _ => unreachable!("must be called on a compiled instruction, \
 780                                instead it was called on: {:?}", self),
 781         }
 782     }
 783 }
 784
 785 #[derive(Clone, Debug)]
 786 enum InstHole {
 787     Save { slot: usize },
 788     EmptyLook { look: EmptyLook },
 789     Char { c: char },
 790     Ranges { ranges: Vec<(char, char)> },
 791     Bytes { start: u8, end: u8 },
 792 }
 793
 794 impl InstHole {
 795     fn fill(&self, goto: InstPtr) -> Inst {
 796         match *self {
 797             InstHole::Save { slot } => Inst::Save(InstSave {
 798                 goto: goto,
 799                 slot: slot,
 800             }),
 801             InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
 802                 goto: goto,
 803                 look: look,
 804             }),
 805             InstHole::Char { c } => Inst::Char(InstChar {
 806                 goto: goto,
 807                 c: c,
 808             }),
 809             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
 810                 goto: goto,
 811                 ranges: ranges.clone(),
 812             }),
 813             InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
 814                 goto: goto,
 815                 start: start,
 816                 end: end,
 817             }),
 818         }
 819     }
 820 }
 821
 822 struct CompileClass<'a, 'b> {
 823     c: &'a mut Compiler,
 824     ranges: &'b [ClassRange],
 825 }
 826
 827 impl<'a, 'b> CompileClass<'a, 'b> {
 828     fn compile(mut self) -> Result {
 829         let mut holes = vec![];
 830         let mut initial_entry = None;
 831         let mut last_split = Hole::None;
 832         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
 833         self.c.suffix_cache.clear();
 834
 835         for (i, ref range) in self.ranges.iter().enumerate() {
 836             let is_last_range = i + 1 == self.ranges.len();
 837             utf8_seqs.reset(range.start, range.end);
 838             let mut it = (&mut utf8_seqs).peekable();
 839             loop {
 840                 let utf8_seq = match it.next() {
 841                     None => break,
 842                     Some(utf8_seq) => utf8_seq,
 843                 };
 844                 if is_last_range && it.peek().is_none() {
 845                     let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
 846                     holes.push(hole);
 847                     self.c.fill(last_split, entry);
 848                     last_split = Hole::None;
 849                     if initial_entry.is_none() {
 850                         initial_entry = Some(entry);
 851                     }
 852                 } else {
 853                     if initial_entry.is_none() {
 854                         initial_entry = Some(self.c.insts.len());
 855                     }
 856                     self.c.fill_to_next(last_split);
 857                     last_split = self.c.push_split_hole();
 858                     let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
 859                     holes.push(hole);
 860                     last_split = self.c.fill_split(last_split, Some(entry), None);
 861                 }
 862             }
 863         }
 864         self.c.utf8_seqs = Some(utf8_seqs);
 865         Ok(Patch {
 866             hole: Hole::Many(holes),
 867             entry: initial_entry.unwrap(),
 868         })
 869     }
 870
 871     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
 872         if self.c.compiled.is_reverse {
 873             self.c_utf8_seq_(seq)
 874         } else {
 875             self.c_utf8_seq_(seq.into_iter().rev())
 876         }
 877     }
 878
 879     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
 880             where I: IntoIterator<Item=&'r Utf8Range> {
 881         // The initial instruction for each UTF-8 sequence should be the same.
 882         let mut from_inst = ::std::usize::MAX;
 883         let mut last_hole = Hole::None;
 884         for byte_range in seq {
 885             let key = SuffixCacheKey {
 886                 from_inst: from_inst,
 887                 start: byte_range.start,
 888                 end: byte_range.end,
 889             };
 890             {
 891                 let pc = self.c.insts.len();
 892                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
 893                     from_inst = cached_pc;
 894                     continue;
 895                 }
 896             }
 897             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
 898             if from_inst == ::std::usize::MAX {
 899                 last_hole = self.c.push_hole(InstHole::Bytes {
 900                     start: byte_range.start,
 901                     end: byte_range.end,
 902                 });
 903             } else {
 904                 self.c.push_compiled(Inst::Bytes(InstBytes {
 905                     goto: from_inst,
 906                     start: byte_range.start,
 907                     end: byte_range.end,
 908                 }));
 909             }
 910             from_inst = self.c.insts.len().checked_sub(1).unwrap();
 911             debug_assert!(from_inst < ::std::usize::MAX);
 912         }
 913         debug_assert!(from_inst < ::std::usize::MAX);
 914         Ok(Patch { hole: last_hole, entry: from_inst })
 915     }
 916 }
 917
 918 /// SuffixCache is a simple bounded hash map for caching suffix entries in
 919 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
 920 /// The set of byte ranges looks like this:
 921 ///
 922 /// [0-7F]
 923 /// [C2-DF][80-BF]
 924 /// [E0][A0-BF][80-BF]
 925 /// [E1-EC][80-BF][80-BF]
 926 /// [ED][80-9F][80-BF]
 927 /// [EE-EF][80-BF][80-BF]
 928 ///
 929 /// Each line above translates to one alternate in the compiled regex program.
 930 /// However, all but one of the alternates end in the same suffix, which is
 931 /// a waste of an instruction. The suffix cache facilitates reusing them across
 932 /// alternates.
 933 ///
 934 /// Note that a HashMap could be trivially used for this, but we don't need its
 935 /// overhead. Some small bounded space (LRU style) is more than enough.
 936 struct SuffixCache {
 937     table: Vec<SuffixCacheEntry>,
 938     // Every time the cache is cleared, we increment the version number instead
 939     // of actually zeroing memory. Since we store a copy of the current version
 940     // in every element, all we need to do is make sure to invalidate any stale
 941     // entries upon access. This saves quite a bit of time!
 942     version: usize,
 943 }
 944
 945 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
 946 struct SuffixCacheEntry {
 947     key: SuffixCacheKey,
 948     pc: InstPtr,
 949     version: usize,
 950 }
 951
 952 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
 953 struct SuffixCacheKey {
 954     from_inst: InstPtr,
 955     start: u8,
 956     end: u8,
 957 }
 958
 959 impl SuffixCache {
 960     fn new(size: usize) -> Self {
 961         SuffixCache {
 962             table: vec![SuffixCacheEntry::default(); size],
 963             version: 0,
 964         }
 965     }
 966
 967     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
 968         let h = self.hash(&key);
 969         let e = self.table[h];
 970         if e.key == key && e.version == self.version {
 971             Some(e.pc)
 972         } else {
 973             self.table[h] = SuffixCacheEntry {
 974                 key: key,
 975                 pc: pc,
 976                 version: self.version,
 977             };
 978             None
 979         }
 980     }
 981
 982     fn clear(&mut self) {
 983         self.version += 1;
 984     }
 985
 986     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
 987         // Basic FNV-1a hash as described:
 988         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
 989         const FNV_PRIME: u64 = 1099511628211;
 990         let mut h = 14695981039346656037;
 991         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
 992         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
 993         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
 994         (h as usize) % self.table.len()
 995     }
 996 }
 997
 998 struct ByteClassSet([bool; 256]);
 999
1000 impl ByteClassSet {
1001     fn new() -> Self {
1002         ByteClassSet([false; 256])
1003     }
1004
1005     fn set_range(&mut self, start: u8, end: u8) {
1006         debug_assert!(start <= end);
1007         if start > 0 {
1008             self.0[start as usize - 1] = true;
1009         }
1010         self.0[end as usize] = true;
1011     }
1012
1013     fn set_word_boundary(&mut self) {
1014         // We need to mark all ranges of bytes whose pairs result in
1015         // evaluating \b differently.
1016         let iswb = is_word_byte;
1017         let mut b1: u16 = 0;
1018         let mut b2: u16;
1019         while b1 <= 255 {
1020             b2 = b1 + 1;
1021             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1022                 b2 += 1;
1023             }
1024             self.set_range(b1 as u8, (b2 - 1) as u8);
1025             b1 = b2;
1026         }
1027     }
1028
1029     fn byte_classes(&self) -> Vec<u8> {
1030         // N.B. If you're debugging the DFA, it's useful to simply return
1031         // `(0..256).collect()`, which effectively removes the byte classes
1032         // and makes the transitions easier to read.
1033         // (0usize..256).map(|x| x as u8).collect()
1034         let mut byte_classes = vec![0; 256];
1035         let mut class = 0u8;
1036         for i in 0..256 {
1037             byte_classes[i] = class;
1038             if self.0[i] {
1039                 class = class.checked_add(1).unwrap();
1040             }
1041         }
1042         byte_classes
1043     }
1044 }
1045
1046 fn u32_to_usize(n: u32) -> usize {
1047     if (n as u64) > (::std::usize::MAX as u64) {
1048         panic!("BUG: {} is too big to be pointer sized", n)
1049     }
1050     n as usize
1051 }
1052
1053 #[cfg(test)]
1054 mod tests {
1055     use super::ByteClassSet;
1056
1057     #[test]
1058     fn byte_classes() {
1059         let mut set = ByteClassSet::new();
1060         set.set_range(b'a', b'z');
1061         let classes = set.byte_classes();
1062         assert_eq!(classes[0], 0);
1063         assert_eq!(classes[1], 0);
1064         assert_eq!(classes[2], 0);
1065         assert_eq!(classes[b'a' as usize - 1], 0);
1066         assert_eq!(classes[b'a' as usize], 1);
1067         assert_eq!(classes[b'm' as usize], 1);
1068         assert_eq!(classes[b'z' as usize], 1);
1069         assert_eq!(classes[b'z' as usize + 1], 2);
1070         assert_eq!(classes[254], 2);
1071         assert_eq!(classes[255], 2);
1072
1073         let mut set = ByteClassSet::new();
1074         set.set_range(0, 2);
1075         set.set_range(4, 6);
1076         let classes = set.byte_classes();
1077         assert_eq!(classes[0], 0);
1078         assert_eq!(classes[1], 0);
1079         assert_eq!(classes[2], 0);
1080         assert_eq!(classes[3], 1);
1081         assert_eq!(classes[4], 2);
1082         assert_eq!(classes[5], 2);
1083         assert_eq!(classes[6], 2);
1084         assert_eq!(classes[7], 3);
1085         assert_eq!(classes[255], 3);
1086     }
1087 }