]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex-0.2.11/src/pikevm.rs
New upstream version 1.29.0+dfsg1
[rustc.git] / src / vendor / regex-0.2.11 / src / pikevm.rs
1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 // This module implements the Pike VM. That is, it guarantees linear time
12 // search of a regex on any text with memory use proportional to the size of
13 // the regex.
14 //
15 // It is equal in power to the backtracking engine in this crate, except the
16 // backtracking engine is typically faster on small regexes/texts at the
17 // expense of a bigger memory footprint.
18 //
19 // It can do more than the DFA can (specifically, record capture locations
20 // and execute Unicode word boundary assertions), but at a slower speed.
21 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
22 // epsilon transitions. That is, the Pike VM engine can be in multiple states
23 // at once where as the DFA is only ever in one state at a time.
24 //
25 // Therefore, the Pike VM is generally treated as the fallback when the other
26 // matching engines either aren't feasible to run or are insufficient.
27
28 use std::mem;
29
30 use exec::ProgramCache;
31 use input::{Input, InputAt};
32 use prog::{Program, InstPtr};
33 use re_trait::Slot;
34 use sparse::SparseSet;
35
36 /// An NFA simulation matching engine.
37 #[derive(Debug)]
38 pub struct Fsm<'r, I> {
39 /// The sequence of opcodes (among other things) that is actually executed.
40 ///
41 /// The program may be byte oriented or Unicode codepoint oriented.
42 prog: &'r Program,
43 /// An explicit stack used for following epsilon transitions. (This is
44 /// borrowed from the cache.)
45 stack: &'r mut Vec<FollowEpsilon>,
46 /// The input to search.
47 input: I,
48 }
49
50 /// A cached allocation that can be reused on each execution.
51 #[derive(Clone, Debug)]
52 pub struct Cache {
53 /// A pair of ordered sets for tracking NFA states.
54 clist: Threads,
55 nlist: Threads,
56 /// An explicit stack used for following epsilon transitions.
57 stack: Vec<FollowEpsilon>,
58 }
59
60 /// An ordered set of NFA states and their captures.
61 #[derive(Clone, Debug)]
62 struct Threads {
63 /// An ordered set of opcodes (each opcode is an NFA state).
64 set: SparseSet,
65 /// Captures for every NFA state.
66 ///
67 /// It is stored in row-major order, where the columns are the capture
68 /// slots and the rows are the states.
69 caps: Vec<Slot>,
70 /// The number of capture slots stored per thread. (Every capture has
71 /// two slots.)
72 slots_per_thread: usize,
73 }
74
75 /// A representation of an explicit stack frame when following epsilon
76 /// transitions. This is used to avoid recursion.
77 #[derive(Clone, Debug)]
78 enum FollowEpsilon {
79 /// Follow transitions at the given instruction pointer.
80 IP(InstPtr),
81 /// Restore the capture slot with the given position in the input.
82 Capture { slot: usize, pos: Slot },
83 }
84
85 impl Cache {
86 /// Create a new allocation used by the NFA machine to record execution
87 /// and captures.
88 pub fn new(_prog: &Program) -> Self {
89 Cache {
90 clist: Threads::new(),
91 nlist: Threads::new(),
92 stack: vec![],
93 }
94 }
95 }
96
97 impl<'r, I: Input> Fsm<'r, I> {
98 /// Execute the NFA matching engine.
99 ///
100 /// If there's a match, `exec` returns `true` and populates the given
101 /// captures accordingly.
102 pub fn exec(
103 prog: &'r Program,
104 cache: &ProgramCache,
105 matches: &mut [bool],
106 slots: &mut [Slot],
107 quit_after_match: bool,
108 input: I,
109 start: usize,
110 ) -> bool {
111 let mut cache = cache.borrow_mut();
112 let cache = &mut cache.pikevm;
113 cache.clist.resize(prog.len(), prog.captures.len());
114 cache.nlist.resize(prog.len(), prog.captures.len());
115 let at = input.at(start);
116 Fsm {
117 prog: prog,
118 stack: &mut cache.stack,
119 input: input,
120 }.exec_(
121 &mut cache.clist,
122 &mut cache.nlist,
123 matches,
124 slots,
125 quit_after_match,
126 at,
127 )
128 }
129
130 fn exec_(
131 &mut self,
132 mut clist: &mut Threads,
133 mut nlist: &mut Threads,
134 matches: &mut [bool],
135 slots: &mut [Slot],
136 quit_after_match: bool,
137 mut at: InputAt,
138 ) -> bool {
139 let mut matched = false;
140 let mut all_matched = false;
141 clist.set.clear();
142 nlist.set.clear();
143 'LOOP: loop {
144 if clist.set.is_empty() {
145 // Three ways to bail out when our current set of threads is
146 // empty.
147 //
148 // 1. We have a match---so we're done exploring any possible
149 // alternatives. Time to quit. (We can't do this if we're
150 // looking for matches for multiple regexes, unless we know
151 // they all matched.)
152 //
153 // 2. If the expression starts with a '^' we can terminate as
154 // soon as the last thread dies.
155 if (matched && matches.len() <= 1)
156 || all_matched
157 || (!at.is_start() && self.prog.is_anchored_start) {
158 break;
159 }
160
161 // 3. If there's a literal prefix for the program, try to
162 // jump ahead quickly. If it can't be found, then we can
163 // bail out early.
164 if !self.prog.prefixes.is_empty() {
165 at = match self.input.prefix_at(&self.prog.prefixes, at) {
166 None => break,
167 Some(at) => at,
168 };
169 }
170 }
171
172 // This simulates a preceding '.*?' for every regex by adding
173 // a state starting at the current position in the input for the
174 // beginning of the program only if we don't already have a match.
175 if clist.set.is_empty()
176 || (!self.prog.is_anchored_start && !all_matched) {
177 self.add(&mut clist, slots, 0, at);
178 }
179 // The previous call to "add" actually inspects the position just
180 // before the current character. For stepping through the machine,
181 // we can to look at the current character, so we advance the
182 // input.
183 let at_next = self.input.at(at.next_pos());
184 for i in 0..clist.set.len() {
185 let ip = clist.set[i];
186 if self.step(
187 &mut nlist,
188 matches,
189 slots,
190 clist.caps(ip),
191 ip,
192 at,
193 at_next,
194 ) {
195 matched = true;
196 all_matched = all_matched || matches.iter().all(|&b| b);
197 if quit_after_match {
198 // If we only care if a match occurs (not its
199 // position), then we can quit right now.
200 break 'LOOP;
201 }
202 if self.prog.matches.len() == 1 {
203 // We don't need to check the rest of the threads
204 // in this set because we've matched something
205 // ("leftmost-first"). However, we still need to check
206 // threads in the next set to support things like
207 // greedy matching.
208 //
209 // This is only true on normal regexes. For regex sets,
210 // we need to mush on to observe other matches.
211 break;
212 }
213 }
214 }
215 if at.is_end() {
216 break;
217 }
218 at = at_next;
219 mem::swap(clist, nlist);
220 nlist.set.clear();
221 }
222 matched
223 }
224
225 /// Step through the input, one token (byte or codepoint) at a time.
226 ///
227 /// nlist is the set of states that will be processed on the next token
228 /// in the input.
229 ///
230 /// caps is the set of captures passed by the caller of the NFA. They are
231 /// written to only when a match state is visited.
232 ///
233 /// thread_caps is the set of captures set for the current NFA state, ip.
234 ///
235 /// at and at_next are the current and next positions in the input. at or
236 /// at_next may be EOF.
237 fn step(
238 &mut self,
239 nlist: &mut Threads,
240 matches: &mut [bool],
241 slots: &mut [Slot],
242 thread_caps: &mut [Option<usize>],
243 ip: usize,
244 at: InputAt,
245 at_next: InputAt,
246 ) -> bool {
247 use prog::Inst::*;
248 match self.prog[ip] {
249 Match(match_slot) => {
250 if match_slot < matches.len() {
251 matches[match_slot] = true;
252 }
253 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
254 *slot = *val;
255 }
256 true
257 }
258 Char(ref inst) => {
259 if inst.c == at.char() {
260 self.add(nlist, thread_caps, inst.goto, at_next);
261 }
262 false
263 }
264 Ranges(ref inst) => {
265 if inst.matches(at.char()) {
266 self.add(nlist, thread_caps, inst.goto, at_next);
267 }
268 false
269 }
270 Bytes(ref inst) => {
271 if let Some(b) = at.byte() {
272 if inst.matches(b) {
273 self.add(nlist, thread_caps, inst.goto, at_next);
274 }
275 }
276 false
277 }
278 EmptyLook(_) | Save(_) | Split(_) => false,
279 }
280 }
281
282 /// Follows epsilon transitions and adds them for processing to nlist,
283 /// starting at and including ip.
284 fn add(
285 &mut self,
286 nlist: &mut Threads,
287 thread_caps: &mut [Option<usize>],
288 ip: usize,
289 at: InputAt,
290 ) {
291 self.stack.push(FollowEpsilon::IP(ip));
292 while let Some(frame) = self.stack.pop() {
293 match frame {
294 FollowEpsilon::IP(ip) => {
295 self.add_step(nlist, thread_caps, ip, at);
296 }
297 FollowEpsilon::Capture { slot, pos } => {
298 thread_caps[slot] = pos;
299 }
300 }
301 }
302 }
303
304 /// A helper function for add that avoids excessive pushing to the stack.
305 fn add_step(
306 &mut self,
307 nlist: &mut Threads,
308 thread_caps: &mut [Option<usize>],
309 mut ip: usize,
310 at: InputAt,
311 ) {
312 // Instead of pushing and popping to the stack, we mutate ip as we
313 // traverse the set of states. We only push to the stack when we
314 // absolutely need recursion (restoring captures or following a
315 // branch).
316 use prog::Inst::*;
317 loop {
318 // Don't visit states we've already added.
319 if nlist.set.contains(ip) {
320 return;
321 }
322 nlist.set.insert(ip);
323 match self.prog[ip] {
324 EmptyLook(ref inst) => {
325 if self.input.is_empty_match(at, inst) {
326 ip = inst.goto;
327 }
328 }
329 Save(ref inst) => {
330 if inst.slot < thread_caps.len() {
331 self.stack.push(FollowEpsilon::Capture {
332 slot: inst.slot,
333 pos: thread_caps[inst.slot],
334 });
335 thread_caps[inst.slot] = Some(at.pos());
336 }
337 ip = inst.goto;
338 }
339 Split(ref inst) => {
340 self.stack.push(FollowEpsilon::IP(inst.goto2));
341 ip = inst.goto1;
342 }
343 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
344 let t = &mut nlist.caps(ip);
345 for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
346 *slot = *val;
347 }
348 return;
349 }
350 }
351 }
352 }
353 }
354
355 impl Threads {
356 fn new() -> Self {
357 Threads {
358 set: SparseSet::new(0),
359 caps: vec![],
360 slots_per_thread: 0,
361 }
362 }
363
364 fn resize(&mut self, num_insts: usize, ncaps: usize) {
365 if num_insts == self.set.capacity() {
366 return;
367 }
368 self.slots_per_thread = ncaps * 2;
369 self.set = SparseSet::new(num_insts);
370 self.caps = vec![None; self.slots_per_thread * num_insts];
371 }
372
373 fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
374 let i = pc * self.slots_per_thread;
375 &mut self.caps[i..i + self.slots_per_thread]
376 }
377 }