1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // This is the backtracking matching engine. It has the same exact capability
12 // as the full NFA simulation, except it is artificially restricted to small
13 // regexes on small inputs because of its memory requirements.
15 // In particular, this is a *bounded* backtracking engine. It retains worst
16 // case linear time by keeping track of the states that it has visited (using a
17 // bitmap). Namely, once a state is visited, it is never visited again. Since a
18 // state is keyed by `(instruction index, input index)`, we have that its time
19 // complexity is `O(mn)` (i.e., linear in the size of the search text).
21 // The backtracking engine can beat out the NFA simulation on small
22 // regexes/inputs because it doesn't have to keep track of multiple copies of
23 // the capture groups. In benchmarks, the backtracking engine is roughly twice
24 // as fast as the full NFA simulation. Note though that its performance doesn't
25 // scale, even if you're willing to live with the memory requirements. Namely,
26 // the bitset has to be zeroed on each execution, which becomes quite expensive
29 use exec
::ProgramCache
;
30 use input
::{Input, InputAt}
;
31 use prog
::{Program, InstPtr}
;
// Number of (instruction, input-position) states tracked per `u32` word of
// the visited bitmap.
const BIT_SIZE: usize = 32;
// Cap on the memory the visited bitmap may use.
const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

/// Returns true iff the given regex and input should be executed by this
/// engine with reasonable memory usage.
///
/// The visited bitmap needs one bit per `(instruction, input position)`
/// state, i.e., `num_insts * (text_len + 1)` bits packed into `u32` words
/// (4 bytes each). That byte count fits in `MAX_SIZE_BYTES` exactly when the
/// state count is at most `(MAX_SIZE_BYTES / 4) * BIT_SIZE`, so we compare
/// state counts directly instead of computing the byte size.
///
/// The state count is computed with checked arithmetic: on overflow the
/// requirement is certainly "far too big," so we return `false` rather than
/// wrapping (which could wrongly approve a huge regex/input pair in release
/// builds, or panic in debug builds).
///
/// The actual limit picked is pretty much a heuristic.
/// See: https://github.com/rust-lang/regex/issues/215
pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
    // Largest state count whose bitmap fits in MAX_SIZE_BYTES.
    let max_states = (MAX_SIZE_BYTES / 4) * BIT_SIZE;
    text_len
        .checked_add(1)
        .and_then(|len| num_insts.checked_mul(len))
        .map_or(false, |states| states <= max_states)
}
/// A backtracking matching engine.
// NOTE(review): only two of this struct's fields are visible in this excerpt;
// the methods below also use `self.prog`, `self.input`, and `self.m`, which
// must be declared in the elided portion.
pub struct Bounded<'a, 'm, 'r, 's, I> {
    // Per-regex match flags: `matches[i]` is set to true when regex `i` of
    // the set matches (see `step`).
    matches: &'m mut [bool],
    // Capture group positions; saved and restored as backtracking paths
    // succeed or fail.
    slots: &'s mut [Slot],
62 /// Shared cached state between multiple invocations of a backtracking engine
63 /// in the same thread.
64 #[derive(Clone, Debug)]
71 /// Create new empty cache for the backtracking engine.
72 pub fn new(_prog
: &Program
) -> Self {
73 Cache { jobs: vec![], visited: vec![] }
/// A job is an explicit unit of stack space in the backtracking engine.
///
/// The "normal" representation is a single state transition, which corresponds
/// to an NFA state and a character in the input. However, the backtracking
/// engine must keep track of old capture group values. We use the explicit
#[derive(Clone, Copy, Debug)]
// NOTE(review): the `enum Job` header line is missing from this excerpt;
// these are its two variants.
// Run instruction `ip` with the input at position `at`.
Inst { ip: InstPtr, at: InputAt },
// When popped, restore capture slot `slot` to `old_pos` (i.e., the path
// pushed above this job failed).
SaveRestore { slot: usize, old_pos: Option<usize> },
impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
    /// Execute the backtracking matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    // NOTE(review): the head of `exec`'s parameter list is missing from this
    // excerpt; the visible parameters are the match flags and capture slots.
        matches: &'m mut [bool],
        slots: &'s mut [Slot],
        // Borrow this thread's cached allocations and pick out the part
        // belonging to the backtracking engine.
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.backtrack;
        // Resolve the starting byte offset into an input position.
        let start = input.at(start);
        // Assemble the engine; the field initializers are elided here.
        let mut b = Bounded {
/// Clears the cache such that the backtracking engine can be executed
/// on some input of fixed length.
fn clear(&mut self) {
    // Reset the job memory so that we start fresh.
    // Now we need to clear the bit state set.
    // We do this by figuring out how much space we need to keep track
    // of the states we've visited.
    // Then we reset all existing allocated space to 0.
    // Finally, we request more space if we need it.
    // This is all a little circuitous, but doing this unsafely
    // doesn't seem to have a measurable impact on performance.
    // (Probably because backtracking is limited to such small
    // inputs/regexes in the first place.)
    // One bit per (instruction, input position) state, rounded up to whole
    // BIT_SIZE-bit words. (The `let visited_len =` binding is elided in this
    // excerpt.)
        (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
    // Drop any excess words, then zero out the words we keep.
    self.m.visited.truncate(visited_len);
    for v in &mut self.m.visited {
    // Grow the bitmap with zeroed words if it's currently too small.
    if visited_len > self.m.visited.len() {
        let len = self.m.visited.len();
        self.m.visited.reserve_exact(visited_len - len);
        for _ in 0..(visited_len - len) {
            self.m.visited.push(0);
/// Start backtracking at the given position in the input, but also look
/// for literal prefixes.
fn exec_(&mut self, mut at: InputAt) -> bool {
    // If this is an anchored regex at the beginning of the input, then
    // we're either already done or we only need to try backtracking once.
    if self.prog.is_anchored_start {
        // (The two arms of this return expression are elided here.)
        return if !at.is_start() {
    let mut matched = false;
    // If the program has required literal prefixes, skip ahead to the next
    // candidate occurrence before backtracking.
    if !self.prog.prefixes.is_empty() {
        at = match self.input.prefix_at(&self.prog.prefixes, at) {
    // Try a full backtrack from this position; keep any earlier success.
    matched = self.backtrack(at) || matched;
    // With a single regex (not a set), the first match is enough.
    if matched && self.prog.matches.len() == 1 {
        // Advance to the next input position and try again.
        at = self.input.at(at.next_pos());
/// The main backtracking loop starting at the given input position.
fn backtrack(&mut self, start: InputAt) -> bool {
    // N.B. We use an explicit stack to avoid recursion.
    // To avoid excessive pushing and popping, most transitions are handled
    // in the `step` helper function, which only pushes to the stack when
    // there's a capture or a branch.
    let mut matched = false;
    // Seed the stack with the first instruction at the starting position.
    self.m.jobs.push(Job::Inst { ip: 0, at: start });
    while let Some(job) = self.m.jobs.pop() {
        // (The `match job {` line is elided in this excerpt.)
            Job::Inst { ip, at } => {
                if self.step(ip, at) {
                    // Only quit if we're matching one regex.
                    // If we're matching a regex set, then mush on and
                    // try to find other matches (if we want them).
                    if self.prog.matches.len() == 1 {
            Job::SaveRestore { slot, old_pos } => {
                // The path pushed above this job failed: restore the
                // capture slot to its previous value.
                if slot < self.slots.len() {
                    self.slots[slot] = old_pos;
// Runs the program from instruction `ip` with the input at `at`; callers
// treat a `true` return as a match (see `backtrack`).
// NOTE(review): several match-arm headers and closing braces are elided in
// this excerpt; comments annotate only the visible code.
fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
    // This loop is an optimization to avoid constantly pushing/popping
    // from the stack. Namely, if we're pushing a job only to run it
    // next, avoid the push and just mutate `ip` (and possibly `at`)
        // Never explore a state twice; this bounds the engine's running
        // time (see `has_visited`).
        if self.has_visited(ip, at) {
        match self.prog[ip] {
                // Match instruction: record which regex of the set matched.
                if slot < self.matches.len() {
                    self.matches[slot] = true;
                // Save instruction: remember the slot's old value so it can
                // be restored if this path fails.
                if let Some(&old_pos) = self.slots.get(inst.slot) {
                    // If this path doesn't work out, then we save the old
                    // capture index (if one exists) in an alternate
                    // job. If the next path fails, then the alternate
                    // job is popped and the old capture index is restored.
                    self.m.jobs.push(Job::SaveRestore {
                    // Record the current position in the capture slot.
                    self.slots[inst.slot] = Some(at.pos());
                // Split: queue the second branch for later exploration.
                self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
            EmptyLook(ref inst) => {
                // Zero-width assertion; does not consume input.
                if self.input.is_empty_match(at, inst) {
                // Single-character comparison: advance on a match.
                if inst.c == at.char() {
                    at = self.input.at(at.next_pos());
            Ranges(ref inst) => {
                // Character-class instruction: advance if the current
                // character falls in one of the ranges.
                if inst.matches(at.char()) {
                    at = self.input.at(at.next_pos());
                // Byte-level instruction: advance when a byte is available
                // (the comparison against `b` is elided here).
                if let Some(b) = at.byte() {
                    at = self.input.at(at.next_pos());
// Checks and sets the visited bit for the state `(ip, at)`: the bit is set
// on first visit. (The function's return expressions fall outside this
// excerpt; presumably first visits return false and repeats return true —
// TODO confirm against the full source.)
fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
    // Linearize (instruction, input position) into a single bit index.
    let k = ip * (self.input.len() + 1) + at.pos();
    // Word index within the bitmap...
    let k1 = k / BIT_SIZE;
    // ...and the bit mask within that word.
    let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
    if self.m.visited[k1] & k2 == 0 {
        // First time here: mark the state as visited.
        self.m.visited[k1] |= k2;
/// Converts `n` to a `u32`, panicking if the value does not fit.
fn usize_to_u32(n: usize) -> u32 {
    // Compare through u64 so the check is valid on both 32-bit and 64-bit
    // targets.
    if (n as u64) > (::std::u32::MAX as u64) {
        panic!("BUG: {} is too big to fit into u32", n)