1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // This is the backtracking matching engine. It has the same exact capability
12 // as the full NFA simulation, except it is artificially restricted to small
13 // regexes on small inputs because of its memory requirements.
15 // In particular, this is a *bounded* backtracking engine. It retains worst
16 // case linear time by keeping track of the states that it has visited (using a
17 // bitmap). Namely, once a state is visited, it is never visited again. Since a
18 // state is keyed by `(instruction index, input index)`, we have that its time
19 // complexity is `O(mn)` (i.e., linear in the size of the search text).
21 // The backtracking engine can beat out the NFA simulation on small
22 // regexes/inputs because it doesn't have to keep track of multiple copies of
23 // the capture groups. In benchmarks, the backtracking engine is roughly twice
24 // as fast as the full NFA simulation. Note though that its performance doesn't
25 // scale, even if you're willing to live with the memory requirements. Namely,
26 // the bitset has to be zeroed on each execution, which becomes quite expensive
29 use exec
::ProgramCache
;
30 use input
::{Input, InputAt}
;
31 use prog
::{Program, InstPtr}
;
// Number of (instruction, input-position) states tracked per `u32` word of
// the visited bitmap.
const BIT_SIZE: usize = 32;
// Cap on the memory the visited bitmap may use.
const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

/// Returns true iff the given regex and input should be executed by this
/// engine with reasonable memory usage.
///
/// The visited bitmap needs one bit per `(instruction, input position)`
/// state, i.e., `num_insts * (text_len + 1)` bits packed into `u32` words
/// (4 bytes each). That byte count fits in `MAX_SIZE_BYTES` exactly when the
/// state count is at most `(MAX_SIZE_BYTES / 4) * BIT_SIZE`, so we compare
/// state counts directly instead of computing the byte size.
///
/// The state count is computed with checked arithmetic: on overflow the
/// requirement is certainly "far too big," so we return `false` rather than
/// wrapping (which could wrongly approve a huge regex/input pair in release
/// builds, or panic in debug builds).
///
/// The actual limit picked is pretty much a heuristic.
/// See: https://github.com/rust-lang/regex/issues/215
pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
    // Largest state count whose bitmap fits in MAX_SIZE_BYTES.
    let max_states = (MAX_SIZE_BYTES / 4) * BIT_SIZE;
    text_len
        .checked_add(1)
        .and_then(|len| num_insts.checked_mul(len))
        .map_or(false, |states| states <= max_states)
}
/// A backtracking matching engine.
// NOTE(review): only two of this struct's fields are visible in this excerpt;
// the methods below also use `self.prog`, `self.input`, and `self.m`, which
// must be declared in the elided portion.
pub struct Bounded<'a, 'm, 'r, 's, I> {
    // Per-regex match flags: `matches[i]` is set to true when regex `i` of
    // the set matches (see `step`).
    matches: &'m mut [bool],
    // Capture group positions; saved and restored as backtracking paths
    // succeed or fail.
    slots: &'s mut [Slot],
62 /// Shared cached state between multiple invocations of a backtracking engine
63 /// in the same thread.
64 #[derive(Clone, Debug)]
71 /// Create new empty cache for the backtracking engine.
72 pub fn new(_prog
: &Program
) -> Self {
73 Cache { jobs: vec![], visited: vec![] }
/// A job is an explicit unit of stack space in the backtracking engine.
///
/// The "normal" representation is a single state transition, which corresponds
/// to an NFA state and a character in the input. However, the backtracking
/// engine must keep track of old capture group values. We use the explicit
#[derive(Clone, Copy, Debug)]
// NOTE(review): the `enum Job` header line is missing from this excerpt;
// these are its two variants.
// Run instruction `ip` with the input at position `at`.
Inst { ip: InstPtr, at: InputAt },
// When popped, restore capture slot `slot` to `old_pos` (i.e., the path
// pushed above this job failed).
SaveRestore { slot: usize, old_pos: Option<usize> },
impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
    /// Execute the backtracking matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    // NOTE(review): the head of `exec`'s parameter list is missing from this
    // excerpt; the visible parameters are the match flags and capture slots.
        matches: &'m mut [bool],
        slots: &'s mut [Slot],
        // Borrow this thread's cached allocations and pick out the part
        // belonging to the backtracking engine.
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.backtrack;
        // Resolve the starting byte offset into an input position.
        let start = input.at(start);
        // Assemble the engine; the field initializers are elided here.
        let mut b = Bounded {
/// Clears the cache such that the backtracking engine can be executed
/// on some input of fixed length.
fn clear(&mut self) {
    // Reset the job memory so that we start fresh.
    // Now we need to clear the bit state set.
    // We do this by figuring out how much space we need to keep track
    // of the states we've visited.
    // Then we reset all existing allocated space to 0.
    // Finally, we request more space if we need it.
    // This is all a little circuitous, but doing this unsafely
    // doesn't seem to have a measurable impact on performance.
    // (Probably because backtracking is limited to such small
    // inputs/regexes in the first place.)
    // One bit per (instruction, input position) state, rounded up to whole
    // BIT_SIZE-bit words. (The `let visited_len =` binding is elided in this
    // excerpt.)
        (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
    // Drop any excess words, then zero out the words we keep.
    self.m.visited.truncate(visited_len);
    for v in &mut self.m.visited {
    // Grow the bitmap with zeroed words if it's currently too small.
    if visited_len > self.m.visited.len() {
        let len = self.m.visited.len();
        self.m.visited.reserve_exact(visited_len - len);
        for _ in 0..(visited_len - len) {
            self.m.visited.push(0);
/// Start backtracking at the given position in the input, but also look
/// for literal prefixes.
fn exec_(&mut self, mut at: InputAt) -> bool {
    // If this is an anchored regex at the beginning of the input, then
    // we're either already done or we only need to try backtracking once.
    if self.prog.is_anchored_start {
        // (The two arms of this return expression are elided here.)
        return if !at.is_start() {
    let mut matched = false;
    // If the program has required literal prefixes, skip ahead to the next
    // candidate occurrence before backtracking.
    if !self.prog.prefixes.is_empty() {
        at = match self.input.prefix_at(&self.prog.prefixes, at) {
    // Try a full backtrack from this position; keep any earlier success.
    matched = self.backtrack(at) || matched;
    // With a single regex (not a set), the first match is enough.
    if matched && self.prog.matches.len() == 1 {
        // Advance to the next input position and try again.
        at = self.input.at(at.next_pos());
/// The main backtracking loop starting at the given input position.
fn backtrack(&mut self, start: InputAt) -> bool {
    // N.B. We use an explicit stack to avoid recursion.
    // To avoid excessive pushing and popping, most transitions are handled
    // in the `step` helper function, which only pushes to the stack when
    // there's a capture or a branch.
    let mut matched = false;
    // Seed the stack with the first instruction at the starting position.
    self.m.jobs.push(Job::Inst { ip: 0, at: start });
    while let Some(job) = self.m.jobs.pop() {
        // (The `match job {` line is elided in this excerpt.)
            Job::Inst { ip, at } => {
                if self.step(ip, at) {
                    // Only quit if we're matching one regex.
                    // If we're matching a regex set, then mush on and
                    // try to find other matches (if we want them).
                    if self.prog.matches.len() == 1 {
            Job::SaveRestore { slot, old_pos } => {
                // The path pushed above this job failed: restore the
                // capture slot to its previous value.
                if slot < self.slots.len() {
                    self.slots[slot] = old_pos;
// Runs the program from instruction `ip` with the input at `at`; callers
// treat a `true` return as a match (see `backtrack`).
// NOTE(review): several match-arm headers and closing braces are elided in
// this excerpt; comments annotate only the visible code.
fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
    // This loop is an optimization to avoid constantly pushing/popping
    // from the stack. Namely, if we're pushing a job only to run it
    // next, avoid the push and just mutate `ip` (and possibly `at`)
        // Never explore a state twice; this bounds the engine's running
        // time (see `has_visited`).
        if self.has_visited(ip, at) {
        match self.prog[ip] {
                // Match instruction: record which regex of the set matched.
                if slot < self.matches.len() {
                    self.matches[slot] = true;
                // Save instruction: remember the slot's old value so it can
                // be restored if this path fails.
                if let Some(&old_pos) = self.slots.get(inst.slot) {
                    // If this path doesn't work out, then we save the old
                    // capture index (if one exists) in an alternate
                    // job. If the next path fails, then the alternate
                    // job is popped and the old capture index is restored.
                    self.m.jobs.push(Job::SaveRestore {
                    // Record the current position in the capture slot.
                    self.slots[inst.slot] = Some(at.pos());
                // Split: queue the second branch for later exploration.
                self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
            EmptyLook(ref inst) => {
                // Zero-width assertion; does not consume input.
                if self.input.is_empty_match(at, inst) {
                // Single-character comparison: advance on a match.
                if inst.c == at.char() {
                    at = self.input.at(at.next_pos());
            Ranges(ref inst) => {
                // Character-class instruction: advance if the current
                // character falls in one of the ranges.
                if inst.matches(at.char()) {
                    at = self.input.at(at.next_pos());
                // Byte-level instruction: advance when a byte is available
                // (the comparison against `b` is elided here).
                if let Some(b) = at.byte() {
                    at = self.input.at(at.next_pos());
// Checks and sets the visited bit for the state `(ip, at)`: the bit is set
// on first visit. (The function's return expressions fall outside this
// excerpt; presumably first visits return false and repeats return true —
// TODO confirm against the full source.)
fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
    // Linearize (instruction, input position) into a single bit index.
    let k = ip * (self.input.len() + 1) + at.pos();
    // Word index within the bitmap...
    let k1 = k / BIT_SIZE;
    // ...and the bit mask within that word.
    let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
    if self.m.visited[k1] & k2 == 0 {
        // First time here: mark the state as visited.
        self.m.visited[k1] |= k2;
/// Converts `n` to a `u32`, panicking if the value does not fit.
fn usize_to_u32(n: usize) -> u32 {
    // Compare through u64 so the check is valid on both 32-bit and 64-bit
    // targets.
    if (n as u64) > (::std::u32::MAX as u64) {
        panic!("BUG: {} is too big to fit into u32", n)