1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // This module implements the Pike VM. That is, it guarantees linear time
12 // search of a regex on any text with memory use proportional to the size of
15 // It is equal in power to the backtracking engine in this crate, except the
16 // backtracking engine is typically faster on small regexes/texts at the
17 // expense of a bigger memory footprint.
19 // It can do more than the DFA can (specifically, record capture locations
20 // and execute Unicode word boundary assertions), but at a slower speed.
21 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
22 // epsilon transitions. That is, the Pike VM engine can be in multiple states
23 // at once where as the DFA is only ever in one state at a time.
25 // Therefore, the Pike VM is generally treated as the fallback when the other
26 // matching engines either aren't feasible to run or are insufficient.
30 use exec
::ProgramCache
;
31 use input
::{Input, InputAt}
;
32 use prog
::{Program, InstPtr}
;
34 use sparse
::SparseSet
;
36 /// An NFA simulation matching engine.
38 pub struct Fsm
<'r
, I
> {
39 /// The sequence of opcodes (among other things) that is actually executed.
41 /// The program may be byte oriented or Unicode codepoint oriented.
43 /// An explicit stack used for following epsilon transitions. (This is
44 /// borrowed from the cache.)
45 stack
: &'r
mut Vec
<FollowEpsilon
>,
46 /// The input to search.
50 /// A cached allocation that can be reused on each execution.
51 #[derive(Clone, Debug)]
53 /// A pair of ordered sets for tracking NFA states.
56 /// An explicit stack used for following epsilon transitions.
57 stack
: Vec
<FollowEpsilon
>,
60 /// An ordered set of NFA states and their captures.
61 #[derive(Clone, Debug)]
63 /// An ordered set of opcodes (each opcode is an NFA state).
65 /// Captures for every NFA state.
67 /// It is stored in row-major order, where the columns are the capture
68 /// slots and the rows are the states.
70 /// The number of capture slots stored per thread. (Every capture has
72 slots_per_thread
: usize,
75 /// A representation of an explicit stack frame when following epsilon
76 /// transitions. This is used to avoid recursion.
77 #[derive(Clone, Debug)]
79 /// Follow transitions at the given instruction pointer.
81 /// Restore the capture slot with the given position in the input.
82 Capture { slot: usize, pos: Slot }
,
86 /// Create a new allocation used by the NFA machine to record execution
88 pub fn new(_prog
: &Program
) -> Self {
90 clist
: Threads
::new(),
91 nlist
: Threads
::new(),
97 impl<'r
, I
: Input
> Fsm
<'r
, I
> {
98 /// Execute the NFA matching engine.
100 /// If there's a match, `exec` returns `true` and populates the given
101 /// captures accordingly.
104 cache
: &ProgramCache
,
105 matches
: &mut [bool
],
107 quit_after_match
: bool
,
111 let mut cache
= cache
.borrow_mut();
112 let cache
= &mut cache
.pikevm
;
113 cache
.clist
.resize(prog
.len(), prog
.captures
.len());
114 cache
.nlist
.resize(prog
.len(), prog
.captures
.len());
115 let at
= input
.at(start
);
118 stack
: &mut cache
.stack
,
132 mut clist
: &mut Threads
,
133 mut nlist
: &mut Threads
,
134 matches
: &mut [bool
],
136 quit_after_match
: bool
,
139 let mut matched
= false;
140 let mut all_matched
= false;
144 if clist
.set
.is_empty() {
145 // Three ways to bail out when our current set of threads is
148 // 1. We have a match---so we're done exploring any possible
149 // alternatives. Time to quit. (We can't do this if we're
150 // looking for matches for multiple regexes, unless we know
151 // they all matched.)
153 // 2. If the expression starts with a '^' we can terminate as
154 // soon as the last thread dies.
155 if (matched
&& matches
.len() <= 1)
157 || (!at
.is_start() && self.prog
.is_anchored_start
) {
161 // 3. If there's a literal prefix for the program, try to
162 // jump ahead quickly. If it can't be found, then we can
164 if !self.prog
.prefixes
.is_empty() {
165 at
= match self.input
.prefix_at(&self.prog
.prefixes
, at
) {
172 // This simulates a preceding '.*?' for every regex by adding
173 // a state starting at the current position in the input for the
174 // beginning of the program only if we don't already have a match.
175 if clist
.set
.is_empty()
176 || (!self.prog
.is_anchored_start
&& !all_matched
) {
177 self.add(&mut clist
, slots
, 0, at
);
179 // The previous call to "add" actually inspects the position just
180 // before the current character. For stepping through the machine,
181 // we can to look at the current character, so we advance the
183 let at_next
= self.input
.at(at
.next_pos());
184 for i
in 0..clist
.set
.len() {
185 let ip
= clist
.set
[i
];
196 all_matched
= all_matched
|| matches
.iter().all(|&b
| b
);
197 if quit_after_match
{
198 // If we only care if a match occurs (not its
199 // position), then we can quit right now.
202 if self.prog
.matches
.len() == 1 {
203 // We don't need to check the rest of the threads
204 // in this set because we've matched something
205 // ("leftmost-first"). However, we still need to check
206 // threads in the next set to support things like
209 // This is only true on normal regexes. For regex sets,
210 // we need to mush on to observe other matches.
219 mem
::swap(clist
, nlist
);
225 /// Step through the input, one token (byte or codepoint) at a time.
227 /// nlist is the set of states that will be processed on the next token
230 /// caps is the set of captures passed by the caller of the NFA. They are
231 /// written to only when a match state is visited.
233 /// thread_caps is the set of captures set for the current NFA state, ip.
235 /// at and at_next are the current and next positions in the input. at or
236 /// at_next may be EOF.
240 matches
: &mut [bool
],
242 thread_caps
: &mut [Option
<usize>],
248 match self.prog
[ip
] {
249 Match(match_slot
) => {
250 if match_slot
< matches
.len() {
251 matches
[match_slot
] = true;
253 for (slot
, val
) in slots
.iter_mut().zip(thread_caps
.iter()) {
259 if inst
.c
== at
.char() {
260 self.add(nlist
, thread_caps
, inst
.goto
, at_next
);
264 Ranges(ref inst
) => {
265 if inst
.matches(at
.char()) {
266 self.add(nlist
, thread_caps
, inst
.goto
, at_next
);
271 if let Some(b
) = at
.byte() {
273 self.add(nlist
, thread_caps
, inst
.goto
, at_next
);
278 EmptyLook(_
) | Save(_
) | Split(_
) => false,
282 /// Follows epsilon transitions and adds them for processing to nlist,
283 /// starting at and including ip.
287 thread_caps
: &mut [Option
<usize>],
291 self.stack
.push(FollowEpsilon
::IP(ip
));
292 while let Some(frame
) = self.stack
.pop() {
294 FollowEpsilon
::IP(ip
) => {
295 self.add_step(nlist
, thread_caps
, ip
, at
);
297 FollowEpsilon
::Capture { slot, pos }
=> {
298 thread_caps
[slot
] = pos
;
304 /// A helper function for add that avoids excessive pushing to the stack.
308 thread_caps
: &mut [Option
<usize>],
312 // Instead of pushing and popping to the stack, we mutate ip as we
313 // traverse the set of states. We only push to the stack when we
314 // absolutely need recursion (restoring captures or following a
318 // Don't visit states we've already added.
319 if nlist
.set
.contains(ip
) {
322 nlist
.set
.insert(ip
);
323 match self.prog
[ip
] {
324 EmptyLook(ref inst
) => {
325 if self.input
.is_empty_match(at
, inst
) {
330 if inst
.slot
< thread_caps
.len() {
331 self.stack
.push(FollowEpsilon
::Capture
{
333 pos
: thread_caps
[inst
.slot
],
335 thread_caps
[inst
.slot
] = Some(at
.pos());
340 self.stack
.push(FollowEpsilon
::IP(inst
.goto2
));
343 Match(_
) | Char(_
) | Ranges(_
) | Bytes(_
) => {
344 let t
= &mut nlist
.caps(ip
);
345 for (slot
, val
) in t
.iter_mut().zip(thread_caps
.iter()) {
358 set
: SparseSet
::new(0),
364 fn resize(&mut self, num_insts
: usize, ncaps
: usize) {
365 if num_insts
== self.set
.capacity() {
368 self.slots_per_thread
= ncaps
* 2;
369 self.set
= SparseSet
::new(num_insts
);
370 self.caps
= vec
![None
; self.slots_per_thread
* num_insts
];
373 fn caps(&mut self, pc
: usize) -> &mut [Option
<usize>] {
374 let i
= pc
* self.slots_per_thread
;
375 &mut self.caps
[i
..i
+ self.slots_per_thread
]