]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex-0.2.10/src/backtrack.rs
New upstream version 1.28.0~beta.14+dfsg1
[rustc.git] / src / vendor / regex-0.2.10 / src / backtrack.rs
1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 // This is the backtracking matching engine. It has the same exact capability
12 // as the full NFA simulation, except it is artificially restricted to small
13 // regexes on small inputs because of its memory requirements.
14 //
15 // In particular, this is a *bounded* backtracking engine. It retains worst
16 // case linear time by keeping track of the states that it has visited (using a
17 // bitmap). Namely, once a state is visited, it is never visited again. Since a
18 // state is keyed by `(instruction index, input index)`, we have that its time
19 // complexity is `O(mn)` (i.e., linear in the size of the search text).
20 //
21 // The backtracking engine can beat out the NFA simulation on small
22 // regexes/inputs because it doesn't have to keep track of multiple copies of
23 // the capture groups. In benchmarks, the backtracking engine is roughly twice
24 // as fast as the full NFA simulation. Note though that its performance doesn't
25 // scale, even if you're willing to live with the memory requirements. Namely,
26 // the bitset has to be zeroed on each execution, which becomes quite expensive
27 // on large bitsets.
28
29 use exec::ProgramCache;
30 use input::{Input, InputAt};
31 use prog::{Program, InstPtr};
32 use re_trait::Slot;
33
34 type Bits = u32;
35
36 const BIT_SIZE: usize = 32;
37 const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
38
39 /// Returns true iff the given regex and input should be executed by this
40 /// engine with reasonable memory usage.
41 pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
42 // Total memory usage in bytes is determined by:
43 //
44 // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
45 //
46 // The actual limit picked is pretty much a heuristic.
47 // See: https://github.com/rust-lang/regex/issues/215
48 let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
49 size <= MAX_SIZE_BYTES
50 }
51
52 /// A backtracking matching engine.
53 #[derive(Debug)]
54 pub struct Bounded<'a, 'm, 'r, 's, I> {
55 prog: &'r Program,
56 input: I,
57 matches: &'m mut [bool],
58 slots: &'s mut [Slot],
59 m: &'a mut Cache,
60 }
61
62 /// Shared cached state between multiple invocations of a backtracking engine
63 /// in the same thread.
64 #[derive(Clone, Debug)]
65 pub struct Cache {
66 jobs: Vec<Job>,
67 visited: Vec<Bits>,
68 }
69
70 impl Cache {
71 /// Create new empty cache for the backtracking engine.
72 pub fn new(_prog: &Program) -> Self {
73 Cache { jobs: vec![], visited: vec![] }
74 }
75 }
76
77 /// A job is an explicit unit of stack space in the backtracking engine.
78 ///
79 /// The "normal" representation is a single state transition, which corresponds
80 /// to an NFA state and a character in the input. However, the backtracking
81 /// engine must keep track of old capture group values. We use the explicit
82 /// stack to do it.
83 #[derive(Clone, Copy, Debug)]
84 enum Job {
85 Inst { ip: InstPtr, at: InputAt },
86 SaveRestore { slot: usize, old_pos: Option<usize> },
87 }
88
89 impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
90 /// Execute the backtracking matching engine.
91 ///
92 /// If there's a match, `exec` returns `true` and populates the given
93 /// captures accordingly.
94 pub fn exec(
95 prog: &'r Program,
96 cache: &ProgramCache,
97 matches: &'m mut [bool],
98 slots: &'s mut [Slot],
99 input: I,
100 start: usize,
101 ) -> bool {
102 let mut cache = cache.borrow_mut();
103 let cache = &mut cache.backtrack;
104 let start = input.at(start);
105 let mut b = Bounded {
106 prog: prog,
107 input: input,
108 matches: matches,
109 slots: slots,
110 m: cache,
111 };
112 b.exec_(start)
113 }
114
115 /// Clears the cache such that the backtracking engine can be executed
116 /// on some input of fixed length.
117 fn clear(&mut self) {
118 // Reset the job memory so that we start fresh.
119 self.m.jobs.clear();
120
121 // Now we need to clear the bit state set.
122 // We do this by figuring out how much space we need to keep track
123 // of the states we've visited.
124 // Then we reset all existing allocated space to 0.
125 // Finally, we request more space if we need it.
126 //
127 // This is all a little circuitous, but doing this unsafely
128 // doesn't seem to have a measurable impact on performance.
129 // (Probably because backtracking is limited to such small
130 // inputs/regexes in the first place.)
131 let visited_len =
132 (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
133 /
134 BIT_SIZE;
135 self.m.visited.truncate(visited_len);
136 for v in &mut self.m.visited {
137 *v = 0;
138 }
139 if visited_len > self.m.visited.len() {
140 let len = self.m.visited.len();
141 self.m.visited.reserve_exact(visited_len - len);
142 for _ in 0..(visited_len - len) {
143 self.m.visited.push(0);
144 }
145 }
146 }
147
148 /// Start backtracking at the given position in the input, but also look
149 /// for literal prefixes.
150 fn exec_(&mut self, mut at: InputAt) -> bool {
151 self.clear();
152 // If this is an anchored regex at the beginning of the input, then
153 // we're either already done or we only need to try backtracking once.
154 if self.prog.is_anchored_start {
155 return if !at.is_start() {
156 false
157 } else {
158 self.backtrack(at)
159 };
160 }
161 let mut matched = false;
162 loop {
163 if !self.prog.prefixes.is_empty() {
164 at = match self.input.prefix_at(&self.prog.prefixes, at) {
165 None => break,
166 Some(at) => at,
167 };
168 }
169 matched = self.backtrack(at) || matched;
170 if matched && self.prog.matches.len() == 1 {
171 return true;
172 }
173 if at.is_end() {
174 break;
175 }
176 at = self.input.at(at.next_pos());
177 }
178 matched
179 }
180
181 /// The main backtracking loop starting at the given input position.
182 fn backtrack(&mut self, start: InputAt) -> bool {
183 // N.B. We use an explicit stack to avoid recursion.
184 // To avoid excessive pushing and popping, most transitions are handled
185 // in the `step` helper function, which only pushes to the stack when
186 // there's a capture or a branch.
187 let mut matched = false;
188 self.m.jobs.push(Job::Inst { ip: 0, at: start });
189 while let Some(job) = self.m.jobs.pop() {
190 match job {
191 Job::Inst { ip, at } => {
192 if self.step(ip, at) {
193 // Only quit if we're matching one regex.
194 // If we're matching a regex set, then mush on and
195 // try to find other matches (if we want them).
196 if self.prog.matches.len() == 1 {
197 return true;
198 }
199 matched = true;
200 }
201 }
202 Job::SaveRestore { slot, old_pos } => {
203 if slot < self.slots.len() {
204 self.slots[slot] = old_pos;
205 }
206 }
207 }
208 }
209 matched
210 }
211
212 fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
213 use prog::Inst::*;
214 loop {
215 // This loop is an optimization to avoid constantly pushing/popping
216 // from the stack. Namely, if we're pushing a job only to run it
217 // next, avoid the push and just mutate `ip` (and possibly `at`)
218 // in place.
219 if self.has_visited(ip, at) {
220 return false;
221 }
222 match self.prog[ip] {
223 Match(slot) => {
224 if slot < self.matches.len() {
225 self.matches[slot] = true;
226 }
227 return true;
228 }
229 Save(ref inst) => {
230 if let Some(&old_pos) = self.slots.get(inst.slot) {
231 // If this path doesn't work out, then we save the old
232 // capture index (if one exists) in an alternate
233 // job. If the next path fails, then the alternate
234 // job is popped and the old capture index is restored.
235 self.m.jobs.push(Job::SaveRestore {
236 slot: inst.slot,
237 old_pos: old_pos,
238 });
239 self.slots[inst.slot] = Some(at.pos());
240 }
241 ip = inst.goto;
242 }
243 Split(ref inst) => {
244 self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
245 ip = inst.goto1;
246 }
247 EmptyLook(ref inst) => {
248 if self.input.is_empty_match(at, inst) {
249 ip = inst.goto;
250 } else {
251 return false;
252 }
253 }
254 Char(ref inst) => {
255 if inst.c == at.char() {
256 ip = inst.goto;
257 at = self.input.at(at.next_pos());
258 } else {
259 return false;
260 }
261 }
262 Ranges(ref inst) => {
263 if inst.matches(at.char()) {
264 ip = inst.goto;
265 at = self.input.at(at.next_pos());
266 } else {
267 return false;
268 }
269 }
270 Bytes(ref inst) => {
271 if let Some(b) = at.byte() {
272 if inst.matches(b) {
273 ip = inst.goto;
274 at = self.input.at(at.next_pos());
275 continue;
276 }
277 }
278 return false;
279 }
280 }
281 }
282 }
283
284 fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
285 let k = ip * (self.input.len() + 1) + at.pos();
286 let k1 = k / BIT_SIZE;
287 let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
288 if self.m.visited[k1] & k2 == 0 {
289 self.m.visited[k1] |= k2;
290 false
291 } else {
292 true
293 }
294 }
295 }
296
297 fn usize_to_u32(n: usize) -> u32 {
298 if (n as u64) > (::std::u32::MAX as u64) {
299 panic!("BUG: {} is too big to fit into u32", n)
300 }
301 n as u32
302 }