]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex-0.2.10/src/dfa.rs
New upstream version 1.28.0+dfsg1
[rustc.git] / src / vendor / regex-0.2.10 / src / dfa.rs
1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 The DFA matching engine.
13
14 A DFA provides faster matching because the engine is in exactly one state at
15 any point in time. In the NFA, there may be multiple active states, and
16 considerable CPU cycles are spent shuffling them around. In finite automata
17 speak, the DFA follows epsilon transitions in the regex far less than the NFA.
18
19 A DFA is a classic trade off between time and space. The NFA is slower, but
20 its memory requirements are typically small and predictable. The DFA is faster,
21 but given the right regex and the right input, the number of states in the
22 DFA can grow exponentially. To mitigate this space problem, we do two things:
23
24 1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
25 during a search. When a new state is computed, it is stored in a cache so
26 that it may be reused. An important consequence of this implementation
27 is that states that are never reached for a particular input are never
28 computed. (This is impossible in an "offline" DFA which needs to compute
29 all possible states up front.)
30 2. If the cache gets too big, we wipe it and continue matching.
31
32 In pathological cases, a new state can be created for every byte of input.
33 (e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
34 In this case, performance regresses to slightly slower than the full NFA
35 simulation, in large part because the cache becomes useless. If the cache
36 is wiped too frequently, the DFA quits and control falls back to one of the
37 NFA simulations.
38
39 Because of the "lazy" nature of this DFA, the inner matching loop is
40 considerably more complex than one might expect out of a DFA. A number of
41 tricks are employed to make it fast. Tread carefully.
42
43 N.B. While this implementation is heavily commented, Russ Cox's series of
44 articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/
45 (As is the DFA implementation in RE2, which heavily influenced this
46 implementation.)
47 */
48
49 use std::collections::HashMap;
50 use std::fmt;
51 use std::iter::repeat;
52 use std::mem;
53
54 use exec::ProgramCache;
55 use prog::{Inst, Program};
56 use sparse::SparseSet;
57
58 /// Return true if and only if the given program can be executed by a DFA.
59 ///
60 /// Generally, a DFA is always possible. A pathological case where it is not
61 /// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
62 /// this function will return false.
63 ///
64 /// This function will also return false if the given program has any Unicode
65 /// instructions (Char or Ranges) since the DFA operates on bytes only.
66 pub fn can_exec(insts: &Program) -> bool {
67 use prog::Inst::*;
68 // If for some reason we manage to allocate a regex program with more
69 // than i32::MAX instructions, then we can't execute the DFA because we
70 // use 32 bit instruction pointer deltas for memory savings.
71 // If i32::MAX is the largest positive delta,
72 // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
73 // and we are OK to use 32 bits.
74 if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
75 return false;
76 }
77 for inst in insts {
78 match *inst {
79 Char(_) | Ranges(_) => return false,
80 EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
81 }
82 }
83 true
84 }
85
/// A reusable cache of DFA states.
///
/// This cache is reused between multiple invocations of the same regex
/// program. (It is not shared simultaneously between threads. If there is
/// contention, then new caches are created.)
#[derive(Clone, Debug)]
pub struct Cache {
    /// Group persistent DFA related cache state together. The sparse sets
    /// listed below are used as scratch space while computing uncached states.
    inner: CacheInner,
    /// qcur and qnext are ordered sets with constant time
    /// addition/membership/clearing-whole-set and linear time iteration. They
    /// are used to manage the sets of NFA states in DFA states when computing
    /// cached DFA states. In particular, the order of the NFA states matters
    /// for leftmost-first style matching. Namely, when computing a cached
    /// state, the set of NFA states stops growing as soon as the first Match
    /// instruction is observed.
    // qcur holds the NFA states of the DFA state currently being processed.
    qcur: SparseSet,
    // qnext receives the NFA states of the DFA state being computed next.
    qnext: SparseSet,
}
106
/// `CacheInner` is logically just a part of Cache, but groups together fields
/// that aren't passed as function parameters throughout search. (This split
/// is mostly an artifact of the borrow checker. It is happily paid.)
#[derive(Clone, Debug)]
struct CacheInner {
    /// A cache of pre-compiled DFA states, keyed by the set of NFA states
    /// and the set of empty-width flags set at the byte in the input when the
    /// state was observed.
    ///
    /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
    /// things, we just pass indexes around manually. The performance impact of
    /// this is probably an instruction or two in the inner loop. However, on
    /// 64 bit, each StatePtr is half the size of a *State.
    compiled: HashMap<State, StatePtr>,
    /// The transition table.
    ///
    /// The transition table is laid out in row-major order, where states are
    /// rows and the transitions for each state are columns. At a high level,
    /// given state `s` and byte `b`, the next state can be found at index
    /// `s * 256 + b`.
    ///
    /// This is, of course, a lie. A StatePtr is actually a pointer to the
    /// *start* of a row in this table. When indexing in the DFA's inner loop,
    /// this removes the need to multiply the StatePtr by the stride. Yes, it
    /// matters. This reduces the number of states we can store, but: the
    /// stride is rarely 256 since we define transitions in terms of
    /// *equivalence classes* of bytes. Each class corresponds to a set of
    /// bytes that never discriminate a distinct path through the DFA from each
    /// other.
    trans: Transitions,
    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
    /// this Vec rather than just a `StatePtr`.
    states: Vec<State>,
    /// A set of cached start states, which are limited to the number of
    /// permutations of flags set just before the initial byte of input. (The
    /// index into this vec is a `EmptyFlags`.)
    ///
    /// N.B. A start state can be "dead" (i.e., no possible match), so we
    /// represent it with a StatePtr.
    start_states: Vec<StatePtr>,
    /// Stack scratch space used to follow epsilon transitions in the NFA.
    /// (This permits us to avoid recursion.)
    ///
    /// The maximum stack size is the number of NFA states.
    stack: Vec<InstPtr>,
    /// The total number of times this cache has been flushed by the DFA
    /// because of space constraints.
    flush_count: u64,
    /// The total heap size of the DFA's cache. We use this to determine when
    /// we should flush the cache.
    // Maintained incrementally; see `reset_size` for the fixed baseline.
    size: usize,
}
159
/// The transition table.
///
/// It is laid out in row-major order, with states as rows and byte class
/// transitions as columns.
///
/// The transition table is responsible for producing valid `StatePtrs`. A
/// `StatePtr` points to the start of a particular row in this table. When
/// indexing to find the next state this allows us to avoid a multiplication
/// when computing an index into the table.
#[derive(Clone)]
struct Transitions {
    /// The table.
    // Length is always a multiple of `num_byte_classes` (one row per state).
    table: Vec<StatePtr>,
    /// The stride.
    // i.e., the number of columns per row, including the EOF class.
    num_byte_classes: usize,
}
176
/// Fsm encapsulates the actual execution of the DFA.
#[derive(Debug)]
pub struct Fsm<'a> {
    /// prog contains the NFA instruction opcodes. DFA execution uses either
    /// the `dfa` instructions or the `dfa_reverse` instructions from
    /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
    /// Unicode opcodes that cannot be executed by the DFA.)
    prog: &'a Program,
    /// The start state. We record it here because the pointer may change
    /// when the cache is wiped.
    start: StatePtr,
    /// The current position in the input.
    at: usize,
    /// Should we quit after seeing the first match? e.g., When the caller
    /// uses `is_match` or `shortest_match`.
    quit_after_match: bool,
    /// The last state that matched.
    ///
    /// When no match has occurred, this is set to STATE_UNKNOWN.
    ///
    /// This is only useful when matching regex sets. The last match state
    /// is useful because it contains all of the match instructions seen,
    /// thereby allowing us to enumerate which regexes in the set matched.
    last_match_si: StatePtr,
    /// The input position of the last cache flush. We use this to determine
    /// if we're thrashing in the cache too often. If so, the DFA quits so
    /// that we can fall back to the NFA algorithm.
    last_cache_flush: usize,
    /// All cached DFA information that is persisted between searches.
    cache: &'a mut CacheInner,
}
208
/// The result of running the DFA.
///
/// Generally, the result is either a match or not a match, but sometimes the
/// DFA runs too slowly because the cache size is too small. In that case, it
/// gives up with the intent of falling back to the NFA algorithm.
///
/// The DFA can also give up if it runs out of room to create new states, or if
/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
#[derive(Clone, Debug)]
pub enum Result<T> {
    /// A match was found; the payload is typically an input position.
    Match(T),
    /// No match was found; carries the position at which the search stopped.
    NoMatch(usize),
    /// The DFA gave up (cache thrashing, state explosion, or input it cannot
    /// handle correctly); the caller should fall back to an NFA engine.
    Quit,
}

impl<T> Result<T> {
    /// Returns true if this result corresponds to a match.
    pub fn is_match(&self) -> bool {
        // Only the `Match` variant counts; both other variants are failures.
        if let Result::Match(_) = *self {
            true
        } else {
            false
        }
    }

    /// Maps the given function onto T and returns the result.
    ///
    /// If this isn't a match, then this is a no-op.
    pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
        match self {
            Result::NoMatch(x) => Result::NoMatch(x),
            Result::Quit => Result::Quit,
            Result::Match(t) => Result::Match(f(t)),
        }
    }

    /// Sets the non-match position.
    ///
    /// If this isn't a non-match, then this is a no-op.
    fn set_non_match(self, at: usize) -> Result<T> {
        if let Result::NoMatch(_) = self {
            Result::NoMatch(at)
        } else {
            self
        }
    }
}
254
/// `State` is a DFA state. It contains an ordered set of NFA states (not
/// necessarily complete) and a smattering of flags.
///
/// The flags are packed into the first byte of data.
///
/// States don't carry their transitions. Instead, transitions are stored in
/// a single row-major table.
///
/// Delta encoding is used to store the instruction pointers.
/// The first instruction pointer is stored directly starting
/// at data[1], and each following pointer is stored as an offset
/// to the previous one. If a delta is in the range -127..127,
/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
/// is coded as a flag, followed by 4 bytes encoding the delta.
#[derive(Clone, Eq, Hash, PartialEq)]
struct State {
    /// Packed representation: one flag byte followed by the delta-encoded
    /// NFA instruction pointers described above.
    data: Box<[u8]>,
}
273
/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
/// an NFA state).
///
/// Throughout this library, this is usually set to `usize`, but we force a
/// `u32` here for the DFA to save on space.
// Safe because `can_exec` rejects programs with more than i32::MAX
// instructions, so every pointer (and every delta between pointers) fits.
type InstPtr = u32;
280
281 /// Adds ip to data using delta encoding with respect to prev.
282 ///
283 /// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
284 fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
285 let delta = (ip as i32) - (*prev as i32);
286 write_vari32(data, delta);
287 *prev = ip;
288 }
289
// Iterator over the delta-encoded instruction pointers stored in a `State`.
// `base` is the running absolute pointer; `data` is the remaining encoded
// byte suffix (see the delta-encoding description on `State`).
struct InstPtrs<'a> {
    base: usize,
    data: &'a [u8],
}
294
295 impl <'a>Iterator for InstPtrs<'a> {
296 type Item = usize;
297
298 fn next(&mut self) -> Option<usize> {
299 if self.data.is_empty() {
300 return None;
301 }
302 let (delta, nread) = read_vari32(self.data);
303 let base = self.base as i32 + delta;
304 debug_assert!(base >= 0);
305 debug_assert!(nread > 0);
306 self.data = &self.data[nread..];
307 self.base = base as usize;
308 Some(self.base)
309 }
310 }
311
312 impl State {
313 fn flags(&self) -> StateFlags {
314 StateFlags(self.data[0])
315 }
316
317 fn inst_ptrs(&self) -> InstPtrs {
318 InstPtrs {
319 base: 0,
320 data: &self.data[1..],
321 }
322 }
323 }
324
/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
/// table.
///
/// It has many special values. There are two types of special values:
/// sentinels and flags.
///
/// Sentinels correspond to special states that carry some kind of
/// significance. There are three such states: unknown, dead and quit states.
///
/// Unknown states are states that haven't been computed yet. They indicate
/// that a transition should be filled in that points to either an existing
/// cached state or a new state altogether. In general, an unknown state means
/// "follow the NFA's epsilon transitions."
///
/// Dead states are states that can never lead to a match, no matter what
/// subsequent input is observed. This means that the DFA should quit
/// immediately and return the longest match it has found thus far.
///
/// Quit states are states that imply the DFA is not capable of matching the
/// regex correctly. Currently, this is only used when a Unicode word boundary
/// exists in the regex *and* a non-ASCII byte is observed.
///
/// The other type of state pointer is a state pointer with special flag bits.
/// There are two flags: a start flag and a match flag. The lower bits of both
/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
/// mask).
///
/// The start flag means that the state is a start state, and therefore may be
/// subject to special prefix scanning optimizations.
///
/// The match flag means that the state is a match state, and therefore the
/// current position in the input (while searching) should be recorded.
///
/// The above exists mostly in the service of making the inner loop fast.
/// In particular, the inner *inner* loop looks something like this:
///
/// ```ignore
/// while state <= STATE_MAX and i < len(text):
///     state = state.next[i]
/// ```
///
/// This is nice because it lets us execute a lazy DFA as if it were an
/// entirely offline DFA (i.e., with very few instructions). The loop will
/// quit only when we need to examine a case that needs special attention.
type StatePtr = u32;
370
/// An unknown state means that the state has not been computed yet, and that
/// the only way to progress is to compute it.
const STATE_UNKNOWN: StatePtr = 1<<31;

/// A dead state means that the state has been computed and it is known that
/// once it is entered, no future match can ever occur.
const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;

/// A quit state means that the DFA came across some input that it doesn't
/// know how to process correctly. The DFA should quit and another matching
/// engine should be run in its place.
const STATE_QUIT: StatePtr = STATE_DEAD + 1;

/// A start state is a state that the DFA can start in.
///
/// Note that start states have their lower bits set to a state pointer.
const STATE_START: StatePtr = 1<<30;

/// A match state means that the regex has successfully matched.
///
/// Note that match states have their lower bits set to a state pointer.
const STATE_MATCH: StatePtr = 1<<29;

/// The maximum state pointer. This is useful to mask out the "valid" state
/// pointer from a state with the "start" or "match" bits set.
///
/// It doesn't make sense to use this with unknown, dead or quit state
/// pointers, since those pointers are sentinels and never have their lower
/// bits set to anything meaningful.
// Note: all three sentinels (and the start/match flag bits) compare greater
// than STATE_MAX, which is what makes `si <= STATE_MAX` the fast-path test.
const STATE_MAX: StatePtr = STATE_MATCH - 1;
401
/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
/// special EOF sentinel value.
// Constructed via `Byte::byte(..)` for real input and `Byte::eof()` for the
// end-of-input sentinel used in the final transition.
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
406
/// A set of flags for zero-width assertions.
// Each flag records whether the corresponding zero-width assertion holds at
// the current input position (e.g., `start` for `^` at the very beginning).
#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
struct EmptyFlags {
    start: bool,
    end: bool,
    start_line: bool,
    end_line: bool,
    word_boundary: bool,
    not_word_boundary: bool,
}
417
/// A set of flags describing various configurations of a DFA state. This is
/// represented by a `u8` so that it is compact.
// Stored as the first byte of `State::data`; see `State::flags`.
#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
struct StateFlags(u8);
422
423 impl Cache {
424 /// Create new empty cache for the DFA engine.
425 pub fn new(prog: &Program) -> Self {
426 // We add 1 to account for the special EOF byte.
427 let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
428 let starts = vec![STATE_UNKNOWN; 256];
429 let mut cache = Cache {
430 inner: CacheInner {
431 compiled: HashMap::new(),
432 trans: Transitions::new(num_byte_classes),
433 states: vec![],
434 start_states: starts,
435 stack: vec![],
436 flush_count: 0,
437 size: 0,
438 },
439 qcur: SparseSet::new(prog.insts.len()),
440 qnext: SparseSet::new(prog.insts.len()),
441 };
442 cache.inner.reset_size();
443 cache
444 }
445 }
446
447 impl CacheInner {
448 /// Resets the cache size to account for fixed costs, such as the program
449 /// and stack sizes.
450 fn reset_size(&mut self) {
451 self.size =
452 (self.start_states.len() * mem::size_of::<StatePtr>())
453 + (self.stack.len() * mem::size_of::<InstPtr>());
454 }
455 }
456
457 impl<'a> Fsm<'a> {
458 #[inline(always)] // reduces constant overhead
459 pub fn forward(
460 prog: &'a Program,
461 cache: &ProgramCache,
462 quit_after_match: bool,
463 text: &[u8],
464 at: usize,
465 ) -> Result<usize> {
466 let mut cache = cache.borrow_mut();
467 let cache = &mut cache.dfa;
468 let mut dfa = Fsm {
469 prog: prog,
470 start: 0, // filled in below
471 at: at,
472 quit_after_match: quit_after_match,
473 last_match_si: STATE_UNKNOWN,
474 last_cache_flush: at,
475 cache: &mut cache.inner,
476 };
477 let (empty_flags, state_flags) = dfa.start_flags(text, at);
478 dfa.start = match dfa.start_state(
479 &mut cache.qcur,
480 empty_flags,
481 state_flags,
482 ) {
483 None => return Result::Quit,
484 Some(STATE_DEAD) => return Result::NoMatch(at),
485 Some(si) => si,
486 };
487 debug_assert!(dfa.start != STATE_UNKNOWN);
488 dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
489 }
490
491 #[inline(always)] // reduces constant overhead
492 pub fn reverse(
493 prog: &'a Program,
494 cache: &ProgramCache,
495 quit_after_match: bool,
496 text: &[u8],
497 at: usize,
498 ) -> Result<usize> {
499 let mut cache = cache.borrow_mut();
500 let cache = &mut cache.dfa_reverse;
501 let mut dfa = Fsm {
502 prog: prog,
503 start: 0, // filled in below
504 at: at,
505 quit_after_match: quit_after_match,
506 last_match_si: STATE_UNKNOWN,
507 last_cache_flush: at,
508 cache: &mut cache.inner,
509 };
510 let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
511 dfa.start = match dfa.start_state(
512 &mut cache.qcur,
513 empty_flags,
514 state_flags,
515 ) {
516 None => return Result::Quit,
517 Some(STATE_DEAD) => return Result::NoMatch(at),
518 Some(si) => si,
519 };
520 debug_assert!(dfa.start != STATE_UNKNOWN);
521 dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
522 }
523
524 #[inline(always)] // reduces constant overhead
525 pub fn forward_many(
526 prog: &'a Program,
527 cache: &ProgramCache,
528 matches: &mut [bool],
529 text: &[u8],
530 at: usize,
531 ) -> Result<usize> {
532 debug_assert!(matches.len() == prog.matches.len());
533 let mut cache = cache.borrow_mut();
534 let cache = &mut cache.dfa;
535 let mut dfa = Fsm {
536 prog: prog,
537 start: 0, // filled in below
538 at: at,
539 quit_after_match: false,
540 last_match_si: STATE_UNKNOWN,
541 last_cache_flush: at,
542 cache: &mut cache.inner,
543 };
544 let (empty_flags, state_flags) = dfa.start_flags(text, at);
545 dfa.start = match dfa.start_state(
546 &mut cache.qcur,
547 empty_flags,
548 state_flags,
549 ) {
550 None => return Result::Quit,
551 Some(STATE_DEAD) => return Result::NoMatch(at),
552 Some(si) => si,
553 };
554 debug_assert!(dfa.start != STATE_UNKNOWN);
555 let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
556 if result.is_match() {
557 if matches.len() == 1 {
558 matches[0] = true;
559 } else {
560 debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
561 debug_assert!(dfa.last_match_si != STATE_DEAD);
562 for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
563 if let Inst::Match(slot) = dfa.prog[ip] {
564 matches[slot] = true;
565 }
566 }
567 }
568 }
569 result
570 }
571
    /// Executes the DFA on a forward NFA.
    ///
    /// {qcur,qnext} are scratch ordered sets which may be non-empty.
    #[inline(always)] // reduces constant overhead
    fn exec_at(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // For the most part, the DFA is basically:
        //
        //   last_match = null
        //   while current_byte != EOF:
        //     si = current_state.next[current_byte]
        //     if si is match
        //       last_match = si
        //   return last_match
        //
        // However, we need to deal with a few things:
        //
        //   1. This is an *online* DFA, so the current state's next list
        //      may not point to anywhere yet, so we must go out and compute
        //      them. (They are then cached into the current state's next list
        //      to avoid re-computation.)
        //   2. If we come across a state that is known to be dead (i.e., never
        //      leads to a match), then we can quit early.
        //   3. If the caller just wants to know if a match occurs, then we
        //      can quit as soon as we know we have a match. (Full leftmost
        //      first semantics require continuing on.)
        //   4. If we're in the start state, then we can use a pre-computed set
        //      of prefix literals to skip quickly along the input.
        //   5. After the input is exhausted, we run the DFA on one symbol
        //      that stands for EOF. This is useful for handling empty width
        //      assertions.
        //   6. We can't actually do state.next[byte]. Instead, we have to do
        //      state.next[byte_classes[byte]], which permits us to keep the
        //      'next' list very small.
        //
        // Since there's a bunch of extra stuff we need to consider, we do some
        // pretty hairy tricks to get the inner loop to run as fast as
        // possible.
        debug_assert!(!self.prog.is_reverse);

        // The last match is the currently known ending match position. It is
        // reported as an index to the most recent byte that resulted in a
        // transition to a match state and is always stored in capture slot `1`
        // when searching forwards. Its maximum value is `text.len()`.
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at < text.len() {
            // This is the real inner loop. We take advantage of special bits
            // set in the state pointer to determine whether a state is in the
            // "common" case or not. Specifically, the common case is a
            // non-match non-start non-dead state that has already been
            // computed. So long as we remain in the common case, this inner
            // loop will chew through the input.
            //
            // We also unroll the loop 4 times to amortize the cost of checking
            // whether we've consumed the entire input. We are also careful
            // to make sure that `prev_si` always represents the previous state
            // and `next_si` always represents the next state after the loop
            // exits, even if it isn't always true inside the loop.
            while next_si <= STATE_MAX && at < text.len() {
                // Argument for safety is in the definition of next_si.
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                // The `at + 2` headroom check lets the remaining three
                // unrolled steps below skip their own length checks.
                if prev_si > STATE_MAX || at + 2 >= text.len() {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
                if next_si > STATE_MAX {
                    break;
                }
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
            }
            if next_si & STATE_MATCH > 0 {
                // A match state is outside of the common case because it needs
                // special case analysis. In particular, we need to record the
                // last position as having matched and possibly quit the DFA if
                // we don't need to keep matching.
                next_si &= !STATE_MATCH;
                result = Result::Match(at - 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
                prev_si = next_si;

                // This permits short-circuiting when matching a regex set.
                // In particular, if this DFA state contains only match states,
                // then it's impossible to extend the set of matches since
                // match states are final. Therefore, we can quit.
                if self.prog.matches.len() > 1 {
                    let state = self.state(next_si);
                    let just_matches = state.inst_ptrs()
                        .all(|ip| self.prog[ip].is_match());
                    if just_matches {
                        return result;
                    }
                }

                // Another inner loop! If the DFA stays in this particular
                // match state, then we can rip through all of the input
                // very quickly, and only recording the match location once
                // we've left this particular state.
                let cur = at;
                while (next_si & !STATE_MATCH) == prev_si
                    && at + 2 < text.len() {
                    // Argument for safety is in the definition of next_si.
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                    at += 1;
                }
                if at > cur {
                    result = Result::Match(at - 2);
                }
            } else if next_si & STATE_START > 0 {
                // A start state isn't in the common case because we may
                // want to do quick prefix scanning. If the program doesn't
                // have a detected prefix, then start states are actually
                // considered common and this case is never reached.
                debug_assert!(self.has_prefix());
                next_si &= !STATE_START;
                prev_si = next_si;
                at = match self.prefix_at(text, at) {
                    None => return Result::NoMatch(text.len()),
                    Some(i) => i,
                };
            } else if next_si >= STATE_UNKNOWN {
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                // Finally, this corresponds to the case where the transition
                // entered a state that can never lead to a match or a state
                // that hasn't been computed yet. The latter being the "slow"
                // path.
                let byte = Byte::byte(text[at - 1]);
                // We no longer care about the special bits in the state
                // pointer.
                prev_si &= STATE_MAX;
                // Record where we are. This is used to track progress for
                // determining whether we should quit if we've flushed the
                // cache too much.
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at - 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        // We don't care about the special bits in the state pointer any more,
        // so get rid of them.
        prev_si &= STATE_MAX;
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(text.len()),
            Some(si) => si & !STATE_START,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(text.len());
        }
        result
    }
765
    /// Executes the DFA on a reverse NFA.
    #[inline(always)] // reduces constant overhead
    fn exec_at_reverse(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // The comments in `exec_at` above mostly apply here too. The main
        // difference is that we move backwards over the input and we look for
        // the longest possible match instead of the leftmost-first match.
        //
        // N.B. The code duplication here is regrettable. Efforts to improve
        // it without sacrificing performance are welcome. ---AG
        debug_assert!(self.prog.is_reverse);
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at > 0 {
            // The 4x-unrolled common-case loop, moving backwards this time.
            while next_si <= STATE_MAX && at > 0 {
                // Argument for safety is in the definition of next_si.
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                // The `at <= 4` headroom check leaves room for the remaining
                // three unrolled decrements below.
                if prev_si > STATE_MAX || at <= 4 {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
                if next_si > STATE_MAX {
                    break;
                }
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
            }
            if next_si & STATE_MATCH > 0 {
                // Record the match and possibly quit early, mirroring the
                // forward search (but positions move in the other direction).
                next_si &= !STATE_MATCH;
                result = Result::Match(at + 1);
                if self.quit_after_match {
                    return result
                }
                self.last_match_si = next_si;
                prev_si = next_si;
                // Rip backwards through input while we stay in this exact
                // match state; only record the position once we leave it.
                let cur = at;
                while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
                    // Argument for safety is in the definition of next_si.
                    at -= 1;
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                }
                if at < cur {
                    result = Result::Match(at + 2);
                }
            } else if next_si >= STATE_UNKNOWN {
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                // Dead or not-yet-computed state: take the slow path and
                // compute the next state from the NFA.
                let byte = Byte::byte(text[at]);
                prev_si &= STATE_MAX;
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at + 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(0),
            Some(si) => si,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(0);
        }
        result
    }
867
    /// next_si transitions to the next state, where the transition input
    /// corresponds to text[i].
    ///
    /// This elides bounds checks, and is therefore unsafe.
    ///
    /// Safety: callers must guarantee `i < text.len()` and that `si` was
    /// obtained from the transition table (see the argument below).
    #[inline(always)]
    unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
        // What is the argument for safety here?
        // We have three unchecked accesses that could possibly violate safety:
        //
        // 1. The given byte of input (`text[i]`).
        // 2. The class of the byte of input (`classes[text[i]]`).
        // 3. The transition for the class (`trans[si + cls]`).
        //
        // (1) is only safe when calling next_si is guarded by
        // `i < text.len()`.
        //
        // (2) is the easiest case to guarantee since `text[i]` is always a
        // `u8` and `self.prog.byte_classes` always has length `u8::MAX`.
        // (See `ByteClassSet.byte_classes` in `compile.rs`.)
        //
        // (3) is only safe if (1)+(2) are safe. Namely, the transitions
        // of every state are defined to have length equal to the number of
        // byte classes in the program. Therefore, a valid class leads to a
        // valid transition. (All possible transitions are valid lookups, even
        // if it points to a state that hasn't been computed yet.) (3) also
        // relies on `si` being correct, but StatePtrs should only ever be
        // retrieved from the transition table, which ensures they are correct.
        debug_assert!(i < text.len());
        let b = *text.get_unchecked(i);
        debug_assert!((b as usize) < self.prog.byte_classes.len());
        let cls = *self.prog.byte_classes.get_unchecked(b as usize);
        self.cache.trans.next_unchecked(si, cls as usize)
    }
901
    /// Computes the next state given the current state and the current input
    /// byte (which may be EOF).
    ///
    /// If STATE_DEAD is returned, then there is no valid state transition.
    /// This implies that no permutation of future input can lead to a match
    /// state.
    ///
    /// STATE_UNKNOWN can never be returned.
    fn exec_byte(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        mut si: StatePtr,
        b: Byte,
    ) -> Option<StatePtr> {
        use prog::Inst::*;

        // Initialize a queue with the current DFA state's NFA states.
        qcur.clear();
        for ip in self.state(si).inst_ptrs() {
            qcur.insert(ip);
        }

        // Before inspecting the current byte, we may need to also inspect
        // whether the position immediately preceding the current byte
        // satisfies the empty assertions found in the current state.
        //
        // We only need to do this step if there are any empty assertions in
        // the current state.
        let is_word_last = self.state(si).flags().is_word();
        let is_word = b.is_ascii_word();
        if self.state(si).flags().has_empty() {
            // Compute the flags immediately preceding the current byte.
            // This means we only care about the "end" or "end line" flags.
            // (The "start" flags are computed immediately following the
            // current byte and are handled below.)
            let mut flags = EmptyFlags::default();
            if b.is_eof() {
                flags.end = true;
                flags.end_line = true;
            } else if b.as_byte().map_or(false, |b| b == b'\n') {
                flags.end_line = true;
            }
            if is_word_last == is_word {
                flags.not_word_boundary = true;
            } else {
                flags.word_boundary = true;
            }
            // Now follow epsilon transitions from every NFA state, but make
            // sure we only follow transitions that satisfy our flags.
            qnext.clear();
            for &ip in &*qcur {
                self.follow_epsilons(usize_to_u32(ip), qnext, flags);
            }
            mem::swap(qcur, qnext);
        }

        // Now we set flags for immediately after the current byte. Since start
        // states are processed separately, and are the only states that can
        // have the StartText flag set, we therefore only need to worry about
        // the StartLine flag here.
        //
        // We do also keep track of whether this DFA state contains an NFA
        // state that is a matching state. This is precisely how we delay the
        // DFA matching by one byte in order to process the special EOF
        // sentinel byte. Namely, if this DFA state contains a matching NFA
        // state, then it is the *next* DFA state that is marked as a match.
        let mut empty_flags = EmptyFlags::default();
        let mut state_flags = StateFlags::default();
        empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
        if b.is_ascii_word() {
            state_flags.set_word();
        }
        // Now follow all epsilon transitions again, but only after consuming
        // the current byte.
        qnext.clear();
        for &ip in &*qcur {
            match self.prog[ip as usize] {
                // These states never happen in a byte-based program.
                Char(_) | Ranges(_) => unreachable!(),
                // These states are handled when following epsilon transitions.
                Save(_) | Split(_) | EmptyLook(_) => {}
                Match(_) => {
                    state_flags.set_match();
                    if !self.continue_past_first_match() {
                        break;
                    } else if self.prog.matches.len() > 1
                            && !qnext.contains(ip as usize) {
                        // If we are continuing on to find other matches,
                        // then keep a record of the match states we've seen.
                        qnext.insert(ip);
                    }
                }
                Bytes(ref inst) => {
                    if b.as_byte().map_or(false, |b| inst.matches(b)) {
                        self.follow_epsilons(
                            inst.goto as InstPtr, qnext, empty_flags);
                    }
                }
            }
        }

        let cache =
            if b.is_eof() && self.prog.matches.len() > 1 {
                // If we're processing the last byte of the input and we're
                // matching a regex set, then make the next state contain the
                // previous states' transitions. We do this so that the main
                // matching loop can extract all of the match instructions.
                mem::swap(qcur, qnext);
                // And don't cache this state because it's totally bunk.
                false
            } else {
                true
            };

        // We've now built up the set of NFA states that ought to comprise the
        // next DFA state, so try to find it in the cache, and if it doesn't
        // exist, cache it.
        //
        // N.B. We pass `&mut si` here because the cache may clear itself if
        // it has gotten too full. When that happens, the location of the
        // current state may change.
        let mut next = match self.cached_state(
            qnext,
            state_flags,
            Some(&mut si),
        ) {
            None => return None,
            Some(next) => next,
        };
        if (self.start & !STATE_START) == next {
            // Start states can never be match states since all matches are
            // delayed by one byte.
            debug_assert!(!self.state(next).flags().is_match());
            next = self.start_ptr(next);
        }
        if next <= STATE_MAX && self.state(next).flags().is_match() {
            next |= STATE_MATCH;
        }
        debug_assert!(next != STATE_UNKNOWN);
        // And now store our state in the current state's next list.
        if cache {
            let cls = self.byte_class(b);
            self.cache.trans.set_next(si, cls, next);
        }
        Some(next)
    }
1049
    /// Follows the epsilon transitions starting at (and including) `ip`. The
    /// resulting states are inserted into the ordered set `q`.
    ///
    /// Conditional epsilon transitions (i.e., empty width assertions) are only
    /// followed if they are satisfied by the given flags, which should
    /// represent the flags set at the current location in the input.
    ///
    /// If the current location corresponds to the empty string, then only the
    /// end line and/or end text flags may be set. If the current location
    /// corresponds to a real byte in the input, then only the start line
    /// and/or start text flags may be set.
    ///
    /// As an exception to the above, when finding the initial state, any of
    /// the above flags may be set:
    ///
    /// If matching starts at the beginning of the input, then start text and
    /// start line should be set. If the input is empty, then end text and end
    /// line should also be set.
    ///
    /// If matching starts after the beginning of the input, then only start
    /// line should be set if the preceding byte is `\n`. End line should never
    /// be set in this case. (Even if the proceeding byte is a `\n`, it will
    /// be handled in a subsequent DFA state.)
    fn follow_epsilons(
        &mut self,
        ip: InstPtr,
        q: &mut SparseSet,
        flags: EmptyFlags,
    ) {
        use prog::Inst::*;
        use prog::EmptyLook::*;

        // We need to traverse the NFA to follow epsilon transitions, so avoid
        // recursion with an explicit stack.
        self.cache.stack.push(ip);
        while let Some(mut ip) = self.cache.stack.pop() {
            // Try to munch through as many states as possible without
            // pushes/pops to the stack.
            loop {
                // Don't visit states we've already added.
                if q.contains(ip as usize) {
                    break;
                }
                q.insert(ip as usize);
                match self.prog[ip as usize] {
                    Char(_) | Ranges(_) => unreachable!(),
                    Match(_) | Bytes(_) => {
                        break;
                    }
                    EmptyLook(ref inst) => {
                        // Only follow empty assertion states if our flags
                        // satisfy the assertion.
                        match inst.look {
                            StartLine if flags.start_line => {
                                ip = inst.goto as InstPtr;
                            }
                            EndLine if flags.end_line => {
                                ip = inst.goto as InstPtr;
                            }
                            StartText if flags.start => {
                                ip = inst.goto as InstPtr;
                            }
                            EndText if flags.end => {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundaryAscii if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundaryAscii if flags.not_word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundary if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundary if flags.not_word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            // An unsatisfied assertion terminates this path.
                            StartLine | EndLine | StartText | EndText
                            | WordBoundaryAscii | NotWordBoundaryAscii
                            | WordBoundary | NotWordBoundary => {
                                break;
                            }
                        }
                    }
                    Save(ref inst) => {
                        ip = inst.goto as InstPtr;
                    }
                    Split(ref inst) => {
                        // Queue one branch and continue down the other.
                        self.cache.stack.push(inst.goto2 as InstPtr);
                        ip = inst.goto1 as InstPtr;
                    }
                }
            }
        }
    }
1145
1146 /// Find a previously computed state matching the given set of instructions
1147 /// and is_match bool.
1148 ///
1149 /// The given set of instructions should represent a single state in the
1150 /// NFA along with all states reachable without consuming any input.
1151 ///
1152 /// The is_match bool should be true if and only if the preceding DFA state
1153 /// contains an NFA matching state. The cached state produced here will
1154 /// then signify a match. (This enables us to delay a match by one byte,
1155 /// in order to account for the EOF sentinel byte.)
1156 ///
1157 /// If the cache is full, then it is wiped before caching a new state.
1158 ///
1159 /// The current state should be specified if it exists, since it will need
1160 /// to be preserved if the cache clears itself. (Start states are
1161 /// always saved, so they should not be passed here.) It takes a mutable
1162 /// pointer to the index because if the cache is cleared, the state's
1163 /// location may change.
1164 fn cached_state(
1165 &mut self,
1166 q: &SparseSet,
1167 mut state_flags: StateFlags,
1168 current_state: Option<&mut StatePtr>,
1169 ) -> Option<StatePtr> {
1170 // If we couldn't come up with a non-empty key to represent this state,
1171 // then it is dead and can never lead to a match.
1172 //
1173 // Note that inst_flags represent the set of empty width assertions
1174 // in q. We use this as an optimization in exec_byte to determine when
1175 // we should follow epsilon transitions at the empty string preceding
1176 // the current byte.
1177 let key = match self.cached_state_key(q, &mut state_flags) {
1178 None => return Some(STATE_DEAD),
1179 Some(v) => v,
1180 };
1181 // In the cache? Cool. Done.
1182 if let Some(&si) = self.cache.compiled.get(&key) {
1183 return Some(si);
1184 }
1185 // If the cache has gotten too big, wipe it.
1186 if self.approximate_size() > self.prog.dfa_size_limit
1187 && !self.clear_cache_and_save(current_state)
1188 {
1189 // Ooops. DFA is giving up.
1190 return None;
1191 }
1192 // Allocate room for our state and add it.
1193 self.add_state(key)
1194 }
1195
1196 /// Produces a key suitable for describing a state in the DFA cache.
1197 ///
1198 /// The key invariant here is that equivalent keys are produced for any two
1199 /// sets of ordered NFA states (and toggling of whether the previous NFA
1200 /// states contain a match state) that do not discriminate a match for any
1201 /// input.
1202 ///
1203 /// Specifically, q should be an ordered set of NFA states and is_match
1204 /// should be true if and only if the previous NFA states contained a match
1205 /// state.
1206 fn cached_state_key(
1207 &mut self,
1208 q: &SparseSet,
1209 state_flags: &mut StateFlags,
1210 ) -> Option<State> {
1211 use prog::Inst::*;
1212
1213 // We need to build up enough information to recognize pre-built states
1214 // in the DFA. Generally speaking, this includes every instruction
1215 // except for those which are purely epsilon transitions, e.g., the
1216 // Save and Split instructions.
1217 //
1218 // Empty width assertions are also epsilon transitions, but since they
1219 // are conditional, we need to make them part of a state's key in the
1220 // cache.
1221
1222 // Reserve 1 byte for flags.
1223 let mut insts = vec![0];
1224 let mut prev = 0;
1225 for &ip in q {
1226 let ip = usize_to_u32(ip);
1227 match self.prog[ip as usize] {
1228 Char(_) | Ranges(_) => unreachable!(),
1229 Save(_) | Split(_) => {}
1230 Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
1231 EmptyLook(_) => {
1232 state_flags.set_empty();
1233 push_inst_ptr(&mut insts, &mut prev, ip)
1234 }
1235 Match(_) => {
1236 push_inst_ptr(&mut insts, &mut prev, ip);
1237 if !self.continue_past_first_match() {
1238 break;
1239 }
1240 }
1241 }
1242 }
1243 // If we couldn't transition to any other instructions and we didn't
1244 // see a match when expanding NFA states previously, then this is a
1245 // dead state and no amount of additional input can transition out
1246 // of this state.
1247 if insts.len() == 1 && !state_flags.is_match() {
1248 None
1249 } else {
1250 let StateFlags(f) = *state_flags;
1251 insts[0] = f;
1252 Some(State { data: insts.into_boxed_slice() })
1253 }
1254 }
1255
1256 /// Clears the cache, but saves and restores current_state if it is not
1257 /// none.
1258 ///
1259 /// The current state must be provided here in case its location in the
1260 /// cache changes.
1261 ///
1262 /// This returns false if the cache is not cleared and the DFA should
1263 /// give up.
1264 fn clear_cache_and_save(
1265 &mut self,
1266 current_state: Option<&mut StatePtr>,
1267 ) -> bool {
1268 if self.cache.states.is_empty() {
1269 // Nothing to clear...
1270 return true;
1271 }
1272 match current_state {
1273 None => self.clear_cache(),
1274 Some(si) => {
1275 let cur = self.state(*si).clone();
1276 if !self.clear_cache() {
1277 return false;
1278 }
1279 // The unwrap is OK because we just cleared the cache and
1280 // therefore know that the next state pointer won't exceed
1281 // STATE_MAX.
1282 *si = self.restore_state(cur).unwrap();
1283 true
1284 }
1285 }
1286 }
1287
    /// Wipes the state cache, but saves and restores the current start state.
    ///
    /// This returns false if the cache is not cleared and the DFA should
    /// give up.
    fn clear_cache(&mut self) -> bool {
        // Bail out of the DFA if we're moving too "slowly."
        // A heuristic from RE2: assume the DFA is too slow if it is processing
        // 10 or fewer bytes per state.
        // Additionally, we permit the cache to be flushed a few times before
        // calling it quits.
        let nstates = self.cache.states.len();
        if self.cache.flush_count >= 3
            && self.at >= self.last_cache_flush
            && (self.at - self.last_cache_flush) <= 10 * nstates {
            return false;
        }
        // Update statistics tracking cache flushes.
        self.last_cache_flush = self.at;
        self.cache.flush_count += 1;

        // OK, actually flush the cache. Save clones of the start state and
        // (if live) the last-match state first, since both pointers must
        // remain valid after the wipe.
        let start = self.state(self.start & !STATE_START).clone();
        let last_match = if self.last_match_si <= STATE_MAX {
            Some(self.state(self.last_match_si).clone())
        } else {
            None
        };
        self.cache.reset_size();
        self.cache.trans.clear();
        self.cache.states.clear();
        self.cache.compiled.clear();
        for s in &mut self.cache.start_states {
            *s = STATE_UNKNOWN;
        }
        // The unwraps are OK because we just cleared the cache and therefore
        // know that the next state pointer won't exceed STATE_MAX.
        let start_ptr = self.restore_state(start).unwrap();
        self.start = self.start_ptr(start_ptr);
        if let Some(last_match) = last_match {
            self.last_match_si = self.restore_state(last_match).unwrap();
        }
        true
    }
1331
1332 /// Restores the given state back into the cache, and returns a pointer
1333 /// to it.
1334 fn restore_state(&mut self, state: State) -> Option<StatePtr> {
1335 // If we've already stored this state, just return a pointer to it.
1336 // None will be the wiser.
1337 if let Some(&si) = self.cache.compiled.get(&state) {
1338 return Some(si);
1339 }
1340 self.add_state(state)
1341 }
1342
1343 /// Returns the next state given the current state si and current byte
1344 /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
1345 /// states.
1346 ///
1347 /// This tries to fetch the next state from the cache, but if that fails,
1348 /// it computes the next state, caches it and returns a pointer to it.
1349 ///
1350 /// The pointer can be to a real state, or it can be STATE_DEAD.
1351 /// STATE_UNKNOWN cannot be returned.
1352 ///
1353 /// None is returned if a new state could not be allocated (i.e., the DFA
1354 /// ran out of space and thinks it's running too slowly).
1355 fn next_state(
1356 &mut self,
1357 qcur: &mut SparseSet,
1358 qnext: &mut SparseSet,
1359 si: StatePtr,
1360 b: Byte,
1361 ) -> Option<StatePtr> {
1362 if si == STATE_DEAD {
1363 return Some(STATE_DEAD);
1364 }
1365 match self.cache.trans.next(si, self.byte_class(b)) {
1366 STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
1367 STATE_QUIT => None,
1368 STATE_DEAD => Some(STATE_DEAD),
1369 nsi => Some(nsi),
1370 }
1371 }
1372
    /// Computes and returns the start state, where searching begins at
    /// position `at` in `text`. If the state has already been computed,
    /// then it is pulled from the cache. If the state hasn't been cached,
    /// then it is computed, cached and a pointer to it is returned.
    ///
    /// This may return STATE_DEAD but never STATE_UNKNOWN.
    #[inline(always)] // reduces constant overhead
    fn start_state(
        &mut self,
        q: &mut SparseSet,
        empty_flags: EmptyFlags,
        state_flags: StateFlags,
    ) -> Option<StatePtr> {
        // Compute an index into our cache of start states based on the set
        // of empty/state flags set at the current position in the input. We
        // don't use every flag since not all flags matter. For example, since
        // matches are delayed by one byte, start states can never be match
        // states. (The 7 bits used here imply a start-state cache of 128
        // entries.)
        let flagi = {
            (((empty_flags.start as u8) << 0) |
             ((empty_flags.end as u8) << 1) |
             ((empty_flags.start_line as u8) << 2) |
             ((empty_flags.end_line as u8) << 3) |
             ((empty_flags.word_boundary as u8) << 4) |
             ((empty_flags.not_word_boundary as u8) << 5) |
             ((state_flags.is_word() as u8) << 6))
            as usize
        };
        match self.cache.start_states[flagi] {
            STATE_UNKNOWN => {}
            STATE_DEAD => return Some(STATE_DEAD),
            si => return Some(si),
        }
        // Cache miss: build the start state from the program's entry point.
        q.clear();
        let start = usize_to_u32(self.prog.start);
        self.follow_epsilons(start, q, empty_flags);
        // Start states can never be match states because we delay every match
        // by one byte. Given an empty string and an empty match, the match
        // won't actually occur until the DFA processes the special EOF
        // sentinel byte.
        let sp = match self.cached_state(q, state_flags, None) {
            None => return None,
            Some(sp) => self.start_ptr(sp),
        };
        self.cache.start_states[flagi] = sp;
        Some(sp)
    }
1420
1421 /// Computes the set of starting flags for the given position in text.
1422 ///
1423 /// This should only be used when executing the DFA forwards over the
1424 /// input.
1425 fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
1426 let mut empty_flags = EmptyFlags::default();
1427 let mut state_flags = StateFlags::default();
1428 empty_flags.start = at == 0;
1429 empty_flags.end = text.is_empty();
1430 empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
1431 empty_flags.end_line = text.is_empty();
1432
1433 let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1434 let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
1435 if is_word_last {
1436 state_flags.set_word();
1437 }
1438 if is_word == is_word_last {
1439 empty_flags.not_word_boundary = true;
1440 } else {
1441 empty_flags.word_boundary = true;
1442 }
1443 (empty_flags, state_flags)
1444 }
1445
1446 /// Computes the set of starting flags for the given position in text.
1447 ///
1448 /// This should only be used when executing the DFA in reverse over the
1449 /// input.
1450 fn start_flags_reverse(
1451 &self,
1452 text: &[u8],
1453 at: usize,
1454 ) -> (EmptyFlags, StateFlags) {
1455 let mut empty_flags = EmptyFlags::default();
1456 let mut state_flags = StateFlags::default();
1457 empty_flags.start = at == text.len();
1458 empty_flags.end = text.is_empty();
1459 empty_flags.start_line = at == text.len() || text[at] == b'\n';
1460 empty_flags.end_line = text.is_empty();
1461
1462 let is_word_last =
1463 at < text.len() && Byte::byte(text[at]).is_ascii_word();
1464 let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1465 if is_word_last {
1466 state_flags.set_word();
1467 }
1468 if is_word == is_word_last {
1469 empty_flags.not_word_boundary = true;
1470 } else {
1471 empty_flags.word_boundary = true;
1472 }
1473 (empty_flags, state_flags)
1474 }
1475
1476 /// Returns a reference to a State given a pointer to it.
1477 fn state(&self, si: StatePtr) -> &State {
1478 &self.cache.states[si as usize / self.num_byte_classes()]
1479 }
1480
    /// Adds the given state to the DFA.
    ///
    /// This allocates room for transitions out of this state in
    /// self.cache.trans. The transitions can be set with the returned
    /// StatePtr.
    ///
    /// If None is returned, then the state limit was reached and the DFA
    /// should quit.
    fn add_state(&mut self, state: State) -> Option<StatePtr> {
        // This will fail if the next state pointer exceeds STATE_PTR. In
        // practice, the cache limit will prevent us from ever getting here,
        // but maybe callers will set the cache size to something ridiculous...
        let si = match self.cache.trans.add() {
            None => return None,
            Some(si) => si,
        };
        // If the program has a Unicode word boundary, then set any transitions
        // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
        // transition, then it will quit and an alternative matching engine
        // will take over.
        if self.prog.has_unicode_word_boundary {
            for b in 128..256 {
                let cls = self.byte_class(Byte::byte(b as u8));
                self.cache.trans.set_next(si, cls, STATE_QUIT);
            }
        }
        // Finally, put our actual state on to our heap of states and index it
        // so we can find it later. The size accounting approximates the heap
        // cost of the transition row, both copies of the state (vector and
        // map key), and the map's value.
        self.cache.size +=
            self.cache.trans.state_heap_size()
            + (2 * state.data.len())
            + (2 * mem::size_of::<State>())
            + mem::size_of::<StatePtr>();
        self.cache.states.push(state.clone());
        self.cache.compiled.insert(state, si);
        // Transition table and set of states and map should all be in sync.
        debug_assert!(self.cache.states.len()
                      == self.cache.trans.num_states());
        debug_assert!(self.cache.states.len()
                      == self.cache.compiled.len());
        Some(si)
    }
1523
1524 /// Quickly finds the next occurrence of any literal prefixes in the regex.
1525 /// If there are no literal prefixes, then the current position is
1526 /// returned. If there are literal prefixes and one could not be found,
1527 /// then None is returned.
1528 ///
1529 /// This should only be called when the DFA is in a start state.
1530 fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
1531 self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
1532 }
1533
1534 /// Returns the number of byte classes required to discriminate transitions
1535 /// in each state.
1536 ///
1537 /// invariant: num_byte_classes() == len(State.next)
1538 fn num_byte_classes(&self) -> usize {
1539 // We add 1 to account for the special EOF byte.
1540 (self.prog.byte_classes[255] as usize + 1) + 1
1541 }
1542
1543 /// Given an input byte or the special EOF sentinel, return its
1544 /// corresponding byte class.
1545 #[inline(always)]
1546 fn byte_class(&self, b: Byte) -> usize {
1547 match b.as_byte() {
1548 None => self.num_byte_classes() - 1,
1549 Some(b) => self.u8_class(b),
1550 }
1551 }
1552
1553 /// Like byte_class, but explicitly for u8s.
1554 #[inline(always)]
1555 fn u8_class(&self, b: u8) -> usize {
1556 self.prog.byte_classes[b as usize] as usize
1557 }
1558
1559 /// Returns true if the DFA should continue searching past the first match.
1560 ///
1561 /// Leftmost first semantics in the DFA are preserved by not following NFA
1562 /// transitions after the first match is seen.
1563 ///
1564 /// On occasion, we want to avoid leftmost first semantics to find either
1565 /// the longest match (for reverse search) or all possible matches (for
1566 /// regex sets).
1567 fn continue_past_first_match(&self) -> bool {
1568 self.prog.is_reverse || self.prog.matches.len() > 1
1569 }
1570
1571 /// Returns true if there is a prefix we can quickly search for.
1572 fn has_prefix(&self) -> bool {
1573 !self.prog.is_reverse
1574 && !self.prog.prefixes.is_empty()
1575 && !self.prog.is_anchored_start
1576 }
1577
1578 /// Sets the STATE_START bit in the given state pointer if and only if
1579 /// we have a prefix to scan for.
1580 ///
1581 /// If there's no prefix, then it's a waste to treat the start state
1582 /// specially.
1583 fn start_ptr(&self, si: StatePtr) -> StatePtr {
1584 if self.has_prefix() {
1585 si | STATE_START
1586 } else {
1587 si
1588 }
1589 }
1590
1591 /// Approximate size returns the approximate heap space currently used by
1592 /// the DFA. It is used to determine whether the DFA's state cache needs to
1593 /// be wiped. Namely, it is possible that for certain regexes on certain
1594 /// inputs, a new state could be created for every byte of input. (This is
1595 /// bad for memory use, so we bound it with a cache.)
1596 fn approximate_size(&self) -> usize {
1597 self.cache.size + self.prog.approximate_size()
1598 }
1599 }
1600
1601 impl Transitions {
1602 /// Create a new transition table.
1603 ///
1604 /// The number of byte classes corresponds to the stride. Every state will
1605 /// have `num_byte_classes` slots for transitions.
1606 fn new(num_byte_classes: usize) -> Transitions {
1607 Transitions {
1608 table: vec![],
1609 num_byte_classes: num_byte_classes,
1610 }
1611 }
1612
1613 /// Returns the total number of states currently in this table.
1614 fn num_states(&self) -> usize {
1615 self.table.len() / self.num_byte_classes
1616 }
1617
1618 /// Allocates room for one additional state and returns a pointer to it.
1619 ///
1620 /// If there's no more room, None is returned.
1621 fn add(&mut self) -> Option<StatePtr> {
1622 let si = self.table.len();
1623 if si > STATE_MAX as usize {
1624 return None;
1625 }
1626 self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
1627 Some(usize_to_u32(si))
1628 }
1629
1630 /// Clears the table of all states.
1631 fn clear(&mut self) {
1632 self.table.clear();
1633 }
1634
1635 /// Sets the transition from (si, cls) to next.
1636 fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
1637 self.table[si as usize + cls] = next;
1638 }
1639
1640 /// Returns the transition corresponding to (si, cls).
1641 fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
1642 self.table[si as usize + cls]
1643 }
1644
1645 /// The heap size, in bytes, of a single state in the transition table.
1646 fn state_heap_size(&self) -> usize {
1647 self.num_byte_classes * mem::size_of::<StatePtr>()
1648 }
1649
1650 /// Like `next`, but uses unchecked access and is therefore unsafe.
1651 unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
1652 debug_assert!((si as usize) < self.table.len());
1653 debug_assert!(cls < self.num_byte_classes);
1654 *self.table.get_unchecked(si as usize + cls)
1655 }
1656 }
1657
1658 impl StateFlags {
1659 fn is_match(&self) -> bool {
1660 self.0 & 0b0000000_1 > 0
1661 }
1662
1663 fn set_match(&mut self) {
1664 self.0 |= 0b0000000_1;
1665 }
1666
1667 fn is_word(&self) -> bool {
1668 self.0 & 0b000000_1_0 > 0
1669 }
1670
1671 fn set_word(&mut self) {
1672 self.0 |= 0b000000_1_0;
1673 }
1674
1675 fn has_empty(&self) -> bool {
1676 self.0 & 0b00000_1_00 > 0
1677 }
1678
1679 fn set_empty(&mut self) {
1680 self.0 |= 0b00000_1_00;
1681 }
1682 }
1683
1684 impl Byte {
1685 fn byte(b: u8) -> Self { Byte(b as u16) }
1686 fn eof() -> Self { Byte(256) }
1687 fn is_eof(&self) -> bool { self.0 == 256 }
1688
1689 fn is_ascii_word(&self) -> bool {
1690 let b = match self.as_byte() {
1691 None => return false,
1692 Some(b) => b,
1693 };
1694 match b {
1695 b'A'...b'Z' | b'a'...b'z' | b'0'...b'9' | b'_' => true,
1696 _ => false,
1697 }
1698 }
1699
1700 fn as_byte(&self) -> Option<u8> {
1701 if self.is_eof() {
1702 None
1703 } else {
1704 Some(self.0 as u8)
1705 }
1706 }
1707 }
1708
// Debug rendering of a cached DFA state: its flags plus the decoded list of
// NFA instruction pointers it represents.
impl fmt::Debug for State {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // inst_ptrs() decodes the delta-encoded pointers stored in `data`.
        let ips: Vec<usize> = self.inst_ptrs().collect();
        f.debug_struct("State")
            .field("flags", &self.flags())
            .field("insts", &ips)
            .finish()
    }
}
1718
1719 impl fmt::Debug for Transitions {
1720 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1721 let mut fmtd = f.debug_map();
1722 for si in 0..self.num_states() {
1723 let s = si * self.num_byte_classes;
1724 let e = s + self.num_byte_classes;
1725 fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
1726 }
1727 fmtd.finish()
1728 }
1729 }
1730
/// Adapter for rendering one state's row of the transition table in
/// `Transitions`'s `Debug` output.
struct TransitionsRow<'a>(&'a [StatePtr]);
1732
1733 impl<'a> fmt::Debug for TransitionsRow<'a> {
1734 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1735 let mut fmtd = f.debug_map();
1736 for (b, si) in self.0.iter().enumerate() {
1737 match *si {
1738 STATE_UNKNOWN => {}
1739 STATE_DEAD => {
1740 fmtd.entry(&vb(b as usize), &"DEAD");
1741 }
1742 si => {
1743 fmtd.entry(&vb(b as usize), &si.to_string());
1744 }
1745 }
1746 }
1747 fmtd.finish()
1748 }
1749 }
1750
1751 impl fmt::Debug for StateFlags {
1752 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1753 f.debug_struct("StateFlags")
1754 .field("is_match", &self.is_match())
1755 .field("is_word", &self.is_word())
1756 .field("has_empty", &self.has_empty())
1757 .finish()
1758 }
1759 }
1760
/// Helper function for formatting a byte as a nice-to-read escaped string.
///
/// Values 0-255 are rendered with their default ASCII escape; anything
/// larger is the EOF sentinel column.
fn vb(b: usize) -> String {
    use std::ascii::escape_default;

    if b <= ::std::u8::MAX as usize {
        let escaped = escape_default(b as u8).collect::<Vec<u8>>();
        String::from_utf8_lossy(&escaped).into_owned()
    } else {
        "EOF".to_owned()
    }
}
1772
/// Converts `n` to a `u32`, panicking if it does not fit.
///
/// Overflow here indicates a bug: instruction and state pointers are
/// always expected to fit in 32 bits.
fn usize_to_u32(n: usize) -> u32 {
    assert!(
        (n as u64) <= (::std::u32::MAX as u64),
        "BUG: {} is too big to fit into u32", n
    );
    n as u32
}
1779
1780 #[allow(dead_code)] // useful for debugging
1781 fn show_state_ptr(si: StatePtr) -> String {
1782 let mut s = format!("{:?}", si & STATE_MAX);
1783 if si == STATE_UNKNOWN {
1784 s = format!("{} (unknown)", s);
1785 }
1786 if si == STATE_DEAD {
1787 s = format!("{} (dead)", s);
1788 }
1789 if si == STATE_QUIT {
1790 s = format!("{} (quit)", s);
1791 }
1792 if si & STATE_START > 0 {
1793 s = format!("{} (start)", s);
1794 }
1795 if si & STATE_MATCH > 0 {
1796 s = format!("{} (match)", s);
1797 }
1798 s
1799 }
1800
1801 /// https://developers.google.com/protocol-buffers/docs/encoding#varints
1802 fn write_vari32(data: &mut Vec<u8>, n: i32) {
1803 let mut un = (n as u32) << 1;
1804 if n < 0 {
1805 un = !un;
1806 }
1807 write_varu32(data, un)
1808 }
1809
1810 /// https://developers.google.com/protocol-buffers/docs/encoding#varints
1811 fn read_vari32(data: &[u8]) -> (i32, usize) {
1812 let (un, i) = read_varu32(data);
1813 let mut n = (un >> 1) as i32;
1814 if un & 1 != 0 {
1815 n = !n;
1816 }
1817 (n, i)
1818 }
1819
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// Writes `n` as a little-endian base-128 varint: seven payload bits per
/// byte, with the high bit set on every byte except the last.
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    loop {
        if n < 0b1000_0000 {
            // Fits in seven bits: final byte, continuation bit clear.
            data.push(n as u8);
            return;
        }
        data.push(0b1000_0000 | (n as u8));
        n >>= 7;
    }
}
1828
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// Reads a little-endian base-128 varint, returning the decoded value and
/// the number of bytes consumed. Incomplete input (empty, or every byte has
/// its continuation bit set) yields `(0, 0)`.
fn read_varu32(data: &[u8]) -> (u32, usize) {
    let mut acc: u32 = 0;
    let mut shift: u32 = 0;
    let mut i = 0;
    while i < data.len() {
        let b = data[i];
        i += 1;
        if b & 0b1000_0000 == 0 {
            // Continuation bit clear: this is the last byte.
            return (acc | ((b as u32) << shift), i);
        }
        acc |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}
1842
#[cfg(test)]
mod tests {
    extern crate rand;

    use quickcheck::{QuickCheck, StdGen, quickcheck};
    use super::{
        StateFlags, State, push_inst_ptr,
        write_varu32, read_varu32, write_vari32, read_vari32,
    };

    // Property: encoding a flag byte plus a sequence of instruction
    // pointers into a `State` and decoding them back is lossless.
    #[test]
    fn prop_state_encode_decode() {
        fn p(ips: Vec<u32>, flags: u8) -> bool {
            // The first byte of a state's data is its flags; the
            // instruction pointers follow, delta-encoded via `prev`.
            let mut data = vec![flags];
            let mut prev = 0;
            for &ip in ips.iter() {
                push_inst_ptr(&mut data, &mut prev, ip);
            }
            let state = State { data: data.into_boxed_slice() };

            let expected: Vec<usize> =
                ips.into_iter().map(|ip| ip as usize).collect();
            let got: Vec<usize> = state.inst_ptrs().collect();
            expected == got && state.flags() == StateFlags(flags)
        }
        // Use a larger-than-default generator size so instruction
        // pointers exercise multi-byte varint encodings.
        QuickCheck::new()
            .gen(StdGen::new(self::rand::thread_rng(), 10_000))
            .quickcheck(p as fn(Vec<u32>, u8) -> bool);
    }

    // Property: unsigned varint round-trips exactly and the reader
    // consumes every byte the writer produced.
    #[test]
    fn prop_read_write_u32() {
        fn p(n: u32) -> bool {
            let mut buf = vec![];
            write_varu32(&mut buf, n);
            let (got, nread) = read_varu32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(u32) -> bool);
    }

    // Property: signed (zig-zag) varint round-trips exactly, including
    // negative values.
    #[test]
    fn prop_read_write_i32() {
        fn p(n: i32) -> bool {
            let mut buf = vec![];
            write_vari32(&mut buf, n);
            let (got, nread) = read_vari32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(i32) -> bool);
    }
}