src/vendor/regex-0.2.11/src/pikevm.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 // This module implements the Pike VM. That is, it guarantees linear time
  12 // search of a regex on any text with memory use proportional to the size of
  13 // the regex.
  14 //
  15 // It is equal in power to the backtracking engine in this crate, except the
  16 // backtracking engine is typically faster on small regexes/texts at the
  17 // expense of a bigger memory footprint.
  18 //
  19 // It can do more than the DFA can (specifically, record capture locations
  20 // and execute Unicode word boundary assertions), but at a slower speed.
  21 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
  22 // epsilon transitions. That is, the Pike VM engine can be in multiple states
  23 // at once where as the DFA is only ever in one state at a time.
  24 //
  25 // Therefore, the Pike VM is generally treated as the fallback when the other
  26 // matching engines either aren't feasible to run or are insufficient.
  27
  28 use std::mem;
  29
  30 use exec::ProgramCache;
  31 use input::{Input, InputAt};
  32 use prog::{Program, InstPtr};
  33 use re_trait::Slot;
  34 use sparse::SparseSet;
  35
  36 /// An NFA simulation matching engine.
  37 #[derive(Debug)]
  38 pub struct Fsm<'r, I> {
  39     /// The sequence of opcodes (among other things) that is actually executed.
  40     ///
  41     /// The program may be byte oriented or Unicode codepoint oriented.
  42     prog: &'r Program,
  43     /// An explicit stack used for following epsilon transitions. (This is
  44     /// borrowed from the cache.)
  45     stack: &'r mut Vec<FollowEpsilon>,
  46     /// The input to search.
  47     input: I,
  48 }
  49
  50 /// A cached allocation that can be reused on each execution.
  51 #[derive(Clone, Debug)]
  52 pub struct Cache {
  53     /// A pair of ordered sets for tracking NFA states.
  54     clist: Threads,
  55     nlist: Threads,
  56     /// An explicit stack used for following epsilon transitions.
  57     stack: Vec<FollowEpsilon>,
  58 }
  59
  60 /// An ordered set of NFA states and their captures.
  61 #[derive(Clone, Debug)]
  62 struct Threads {
  63     /// An ordered set of opcodes (each opcode is an NFA state).
  64     set: SparseSet,
  65     /// Captures for every NFA state.
  66     ///
  67     /// It is stored in row-major order, where the columns are the capture
  68     /// slots and the rows are the states.
  69     caps: Vec<Slot>,
  70     /// The number of capture slots stored per thread. (Every capture has
  71     /// two slots.)
  72     slots_per_thread: usize,
  73 }
  74
  75 /// A representation of an explicit stack frame when following epsilon
  76 /// transitions. This is used to avoid recursion.
  77 #[derive(Clone, Debug)]
  78 enum FollowEpsilon {
  79     /// Follow transitions at the given instruction pointer.
  80     IP(InstPtr),
  81     /// Restore the capture slot with the given position in the input.
  82     Capture { slot: usize, pos: Slot },
  83 }
  84
  85 impl Cache {
  86     /// Create a new allocation used by the NFA machine to record execution
  87     /// and captures.
  88     pub fn new(_prog: &Program) -> Self {
  89         Cache {
  90             clist: Threads::new(),
  91             nlist: Threads::new(),
  92             stack: vec![],
  93         }
  94     }
  95 }
  96
  97 impl<'r, I: Input> Fsm<'r, I> {
  98     /// Execute the NFA matching engine.
  99     ///
 100     /// If there's a match, `exec` returns `true` and populates the given
 101     /// captures accordingly.
 102     pub fn exec(
 103         prog: &'r Program,
 104         cache: &ProgramCache,
 105         matches: &mut [bool],
 106         slots: &mut [Slot],
 107         quit_after_match: bool,
 108         input: I,
 109         start: usize,
 110     ) -> bool {
 111         let mut cache = cache.borrow_mut();
 112         let cache = &mut cache.pikevm;
 113         cache.clist.resize(prog.len(), prog.captures.len());
 114         cache.nlist.resize(prog.len(), prog.captures.len());
 115         let at = input.at(start);
 116         Fsm {
 117             prog: prog,
 118             stack: &mut cache.stack,
 119             input: input,
 120         }.exec_(
 121             &mut cache.clist,
 122             &mut cache.nlist,
 123             matches,
 124             slots,
 125             quit_after_match,
 126             at,
 127         )
 128     }
 129
 130     fn exec_(
 131         &mut self,
 132         mut clist: &mut Threads,
 133         mut nlist: &mut Threads,
 134         matches: &mut [bool],
 135         slots: &mut [Slot],
 136         quit_after_match: bool,
 137         mut at: InputAt,
 138     ) -> bool {
 139         let mut matched = false;
 140         let mut all_matched = false;
 141         clist.set.clear();
 142         nlist.set.clear();
 143 'LOOP:  loop {
 144             if clist.set.is_empty() {
 145                 // Three ways to bail out when our current set of threads is
 146                 // empty.
 147                 //
 148                 // 1. We have a match---so we're done exploring any possible
 149                 //    alternatives. Time to quit. (We can't do this if we're
 150                 //    looking for matches for multiple regexes, unless we know
 151                 //    they all matched.)
 152                 //
 153                 // 2. If the expression starts with a '^' we can terminate as
 154                 //    soon as the last thread dies.
 155                 if (matched && matches.len() <= 1)
 156                     || all_matched
 157                     || (!at.is_start() && self.prog.is_anchored_start) {
 158                     break;
 159                 }
 160
 161                 // 3. If there's a literal prefix for the program, try to
 162                 //    jump ahead quickly. If it can't be found, then we can
 163                 //    bail out early.
 164                 if !self.prog.prefixes.is_empty() {
 165                     at = match self.input.prefix_at(&self.prog.prefixes, at) {
 166                         None => break,
 167                         Some(at) => at,
 168                     };
 169                 }
 170             }
 171
 172             // This simulates a preceding '.*?' for every regex by adding
 173             // a state starting at the current position in the input for the
 174             // beginning of the program only if we don't already have a match.
 175             if clist.set.is_empty()
 176                 || (!self.prog.is_anchored_start && !all_matched) {
 177                 self.add(&mut clist, slots, 0, at);
 178             }
 179             // The previous call to "add" actually inspects the position just
 180             // before the current character. For stepping through the machine,
 181             // we can to look at the current character, so we advance the
 182             // input.
 183             let at_next = self.input.at(at.next_pos());
 184             for i in 0..clist.set.len() {
 185                 let ip = clist.set[i];
 186                 if self.step(
 187                     &mut nlist,
 188                     matches,
 189                     slots,
 190                     clist.caps(ip),
 191                     ip,
 192                     at,
 193                     at_next,
 194                 ) {
 195                     matched = true;
 196                     all_matched = all_matched || matches.iter().all(|&b| b);
 197                     if quit_after_match {
 198                         // If we only care if a match occurs (not its
 199                         // position), then we can quit right now.
 200                         break 'LOOP;
 201                     }
 202                     if self.prog.matches.len() == 1 {
 203                         // We don't need to check the rest of the threads
 204                         // in this set because we've matched something
 205                         // ("leftmost-first"). However, we still need to check
 206                         // threads in the next set to support things like
 207                         // greedy matching.
 208                         //
 209                         // This is only true on normal regexes. For regex sets,
 210                         // we need to mush on to observe other matches.
 211                         break;
 212                     }
 213                 }
 214             }
 215             if at.is_end() {
 216                 break;
 217             }
 218             at = at_next;
 219             mem::swap(clist, nlist);
 220             nlist.set.clear();
 221         }
 222         matched
 223     }
 224
 225     /// Step through the input, one token (byte or codepoint) at a time.
 226     ///
 227     /// nlist is the set of states that will be processed on the next token
 228     /// in the input.
 229     ///
 230     /// caps is the set of captures passed by the caller of the NFA. They are
 231     /// written to only when a match state is visited.
 232     ///
 233     /// thread_caps is the set of captures set for the current NFA state, ip.
 234     ///
 235     /// at and at_next are the current and next positions in the input. at or
 236     /// at_next may be EOF.
 237     fn step(
 238         &mut self,
 239         nlist: &mut Threads,
 240         matches: &mut [bool],
 241         slots: &mut [Slot],
 242         thread_caps: &mut [Option<usize>],
 243         ip: usize,
 244         at: InputAt,
 245         at_next: InputAt,
 246     ) -> bool {
 247         use prog::Inst::*;
 248         match self.prog[ip] {
 249             Match(match_slot) => {
 250                 if match_slot < matches.len() {
 251                     matches[match_slot] = true;
 252                 }
 253                 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
 254                     *slot = *val;
 255                 }
 256                 true
 257             }
 258             Char(ref inst) => {
 259                 if inst.c == at.char() {
 260                     self.add(nlist, thread_caps, inst.goto, at_next);
 261                 }
 262                 false
 263             }
 264             Ranges(ref inst) => {
 265                 if inst.matches(at.char()) {
 266                     self.add(nlist, thread_caps, inst.goto, at_next);
 267                 }
 268                 false
 269             }
 270             Bytes(ref inst) => {
 271                 if let Some(b) = at.byte() {
 272                     if inst.matches(b) {
 273                         self.add(nlist, thread_caps, inst.goto, at_next);
 274                     }
 275                 }
 276                 false
 277             }
 278             EmptyLook(_) | Save(_) | Split(_) => false,
 279         }
 280     }
 281
 282     /// Follows epsilon transitions and adds them for processing to nlist,
 283     /// starting at and including ip.
 284     fn add(
 285         &mut self,
 286         nlist: &mut Threads,
 287         thread_caps: &mut [Option<usize>],
 288         ip: usize,
 289         at: InputAt,
 290     ) {
 291         self.stack.push(FollowEpsilon::IP(ip));
 292         while let Some(frame) = self.stack.pop() {
 293             match frame {
 294                 FollowEpsilon::IP(ip) => {
 295                     self.add_step(nlist, thread_caps, ip, at);
 296                 }
 297                 FollowEpsilon::Capture { slot, pos } => {
 298                     thread_caps[slot] = pos;
 299                 }
 300             }
 301         }
 302     }
 303
 304     /// A helper function for add that avoids excessive pushing to the stack.
 305     fn add_step(
 306         &mut self,
 307         nlist: &mut Threads,
 308         thread_caps: &mut [Option<usize>],
 309         mut ip: usize,
 310         at: InputAt,
 311     ) {
 312         // Instead of pushing and popping to the stack, we mutate ip as we
 313         // traverse the set of states. We only push to the stack when we
 314         // absolutely need recursion (restoring captures or following a
 315         // branch).
 316         use prog::Inst::*;
 317         loop {
 318             // Don't visit states we've already added.
 319             if nlist.set.contains(ip) {
 320                 return;
 321             }
 322             nlist.set.insert(ip);
 323             match self.prog[ip] {
 324                 EmptyLook(ref inst) => {
 325                     if self.input.is_empty_match(at, inst) {
 326                         ip = inst.goto;
 327                     }
 328                 }
 329                 Save(ref inst) => {
 330                     if inst.slot < thread_caps.len() {
 331                         self.stack.push(FollowEpsilon::Capture {
 332                             slot: inst.slot,
 333                             pos: thread_caps[inst.slot],
 334                         });
 335                         thread_caps[inst.slot] = Some(at.pos());
 336                     }
 337                     ip = inst.goto;
 338                 }
 339                 Split(ref inst) => {
 340                     self.stack.push(FollowEpsilon::IP(inst.goto2));
 341                     ip = inst.goto1;
 342                 }
 343                 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
 344                     let t = &mut nlist.caps(ip);
 345                     for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
 346                         *slot = *val;
 347                     }
 348                     return;
 349                 }
 350             }
 351         }
 352     }
 353 }
 354
 355 impl Threads {
 356     fn new() -> Self {
 357         Threads {
 358             set: SparseSet::new(0),
 359             caps: vec![],
 360             slots_per_thread: 0,
 361         }
 362     }
 363
 364     fn resize(&mut self, num_insts: usize, ncaps: usize) {
 365         if num_insts == self.set.capacity() {
 366             return;
 367         }
 368         self.slots_per_thread = ncaps * 2;
 369         self.set = SparseSet::new(num_insts);
 370         self.caps = vec![None; self.slots_per_thread * num_insts];
 371     }
 372
 373     fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
 374         let i = pc * self.slots_per_thread;
 375         &mut self.caps[i..i + self.slots_per_thread]
 376     }
 377 }