1 use std
::collections
::HashMap
;
8 use sparse_set
::SparseSet
;
9 use state_id
::{dead_id, StateID}
;
11 type DFARepr
<S
> = dense
::Repr
<Vec
<S
>, S
>;
13 /// A determinizer converts an NFA to a DFA.
15 /// This determinizer follows the typical powerset construction, where each
16 /// DFA state is comprised of one or more NFA states. In the worst case, there
17 /// is one DFA state for every possible combination of NFA states. In practice,
18 /// this only happens in certain conditions, typically when there are bounded
21 /// The type variable `S` refers to the chosen state identifier representation
24 /// The lifetime variable `'a` refers to the lifetime of the NFA being
25 /// converted to a DFA.
27 pub(crate) struct Determinizer
<'a
, S
: StateID
> {
28 /// The NFA we're converting into a DFA.
30 /// The DFA we're building.
32 /// Each DFA state being built is defined as an *ordered* set of NFA
33 /// states, along with a flag indicating whether the state is a match
36 /// This is never empty. The first state is always a dummy state such that
37 /// a state id == 0 corresponds to a dead state.
38 builder_states
: Vec
<Rc
<State
>>,
39 /// A cache of DFA states that already exist and can be easily looked up
40 /// via ordered sets of NFA states.
41 cache
: HashMap
<Rc
<State
>, S
>,
42 /// Scratch space for a stack of NFA states to visit, for depth first
43 /// visiting without recursion.
44 stack
: Vec
<nfa
::StateID
>,
45 /// Scratch space for storing an ordered sequence of NFA states, for
46 /// amortizing allocation.
47 scratch_nfa_states
: Vec
<nfa
::StateID
>,
48 /// Whether to build a DFA that finds the longest possible match.
52 /// An intermediate representation for a DFA state during determinization.
53 #[derive(Debug, Eq, Hash, PartialEq)]
55 /// Whether this state is a match state or not.
57 /// An ordered sequence of NFA states that make up this DFA state.
58 nfa_states
: Vec
<nfa
::StateID
>,
61 impl<'a
, S
: StateID
> Determinizer
<'a
, S
> {
62 /// Create a new determinizer for converting the given NFA to a DFA.
63 pub fn new(nfa
: &'a NFA
) -> Determinizer
<'a
, S
> {
64 let dead
= Rc
::new(State
::dead());
65 let mut cache
= HashMap
::default();
66 cache
.insert(dead
.clone(), dead_id());
70 dfa
: DFARepr
::empty().anchored(nfa
.is_anchored()),
71 builder_states
: vec
![dead
],
74 scratch_nfa_states
: vec
![],
79 /// Instruct the determinizer to use equivalence classes as the transition
80 /// alphabet instead of all possible byte values.
81 pub fn with_byte_classes(mut self) -> Determinizer
<'a
, S
> {
82 let byte_classes
= self.nfa
.byte_classes().clone();
83 self.dfa
= DFARepr
::empty_with_byte_classes(byte_classes
)
84 .anchored(self.nfa
.is_anchored());
88 /// Instruct the determinizer to build a DFA that recognizes the longest
89 /// possible match instead of the leftmost first match. This is useful when
90 /// constructing reverse DFAs for finding the start of a match.
91 pub fn longest_match(mut self, yes
: bool
) -> Determinizer
<'a
, S
> {
92 self.longest_match
= yes
;
96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97 /// the chosen state identifier representation is too small), then an error
99 pub fn build(mut self) -> Result
<DFARepr
<S
>> {
100 let representative_bytes
: Vec
<u8> =
101 self.dfa
.byte_classes().representatives().collect();
102 let mut sparse
= self.new_sparse_set();
103 let mut uncompiled
= vec
![self.add_start(&mut sparse
)?
];
104 while let Some(dfa_id
) = uncompiled
.pop() {
105 for &b
in &representative_bytes
{
106 let (next_dfa_id
, is_new
) =
107 self.cached_state(dfa_id
, b
, &mut sparse
)?
;
108 self.dfa
.add_transition(dfa_id
, b
, next_dfa_id
);
110 uncompiled
.push(next_dfa_id
);
115 // At this point, we shuffle the matching states in the final DFA to
116 // the beginning. This permits a DFA's match loop to detect a match
117 // condition by merely inspecting the current state's identifier, and
118 // avoids the need for any additional auxiliary storage.
119 let is_match
: Vec
<bool
> =
120 self.builder_states
.iter().map(|s
| s
.is_match
).collect();
121 self.dfa
.shuffle_match_states(&is_match
);
125 /// Return the identifier for the next DFA state given an existing DFA
126 /// state and an input byte. If the next DFA state already exists, then
127 /// return its identifier from the cache. Otherwise, build the state, cache
128 /// it and return its identifier.
130 /// The given sparse set is used for scratch space. It must have a capacity
131 /// equivalent to the total number of NFA states, but its contents are
132 /// otherwise unspecified.
134 /// This routine returns a boolean indicating whether a new state was
135 /// built. If a new state is built, then the caller needs to add it to its
136 /// frontier of uncompiled DFA states to compute transitions for.
141 sparse
: &mut SparseSet
,
142 ) -> Result
<(S
, bool
)> {
144 // Compute the set of all reachable NFA states, including epsilons.
145 self.next(dfa_id
, b
, sparse
);
146 // Build a candidate state and check if it has already been built.
147 let state
= self.new_state(sparse
);
148 if let Some(&cached_id
) = self.cache
.get(&state
) {
149 // Since we have a cached state, put the constructed state's
150 // memory back into our scratch space, so that it can be reused.
152 mem
::replace(&mut self.scratch_nfa_states
, state
.nfa_states
);
153 return Ok((cached_id
, false));
155 // Nothing was in the cache, so add this state to the cache.
156 self.add_state(state
).map(|s
| (s
, true))
159 /// Compute the set of all eachable NFA states, including the full epsilon
160 /// closure, from a DFA state for a single byte of input.
161 fn next(&mut self, dfa_id
: S
, b
: u8, next_nfa_states
: &mut SparseSet
) {
162 next_nfa_states
.clear();
163 for i
in 0..self.builder_states
[dfa_id
.to_usize()].nfa_states
.len() {
164 let nfa_id
= self.builder_states
[dfa_id
.to_usize()].nfa_states
[i
];
165 match *self.nfa
.state(nfa_id
) {
166 nfa
::State
::Union { .. }
168 | nfa
::State
::Match
=> {}
169 nfa
::State
::Range { range: ref r }
=> {
170 if r
.start
<= b
&& b
<= r
.end
{
171 self.epsilon_closure(r
.next
, next_nfa_states
);
174 nfa
::State
::Sparse { ref ranges }
=> {
175 for r
in ranges
.iter() {
178 } else if r
.start
<= b
&& b
<= r
.end
{
179 self.epsilon_closure(r
.next
, next_nfa_states
);
188 /// Compute the epsilon closure for the given NFA state.
189 fn epsilon_closure(&mut self, start
: nfa
::StateID
, set
: &mut SparseSet
) {
190 if !self.nfa
.state(start
).is_epsilon() {
195 self.stack
.push(start
);
196 while let Some(mut id
) = self.stack
.pop() {
198 if set
.contains(id
) {
202 match *self.nfa
.state(id
) {
203 nfa
::State
::Range { .. }
204 | nfa
::State
::Sparse { .. }
206 | nfa
::State
::Match
=> break,
207 nfa
::State
::Union { ref alternates }
=> {
208 id
= match alternates
.get(0) {
212 self.stack
.extend(alternates
[1..].iter().rev());
219 /// Compute the initial DFA state and return its identifier.
221 /// The sparse set given is used for scratch space, and must have capacity
222 /// equal to the total number of NFA states. Its contents are unspecified.
223 fn add_start(&mut self, sparse
: &mut SparseSet
) -> Result
<S
> {
225 self.epsilon_closure(self.nfa
.start(), sparse
);
226 let state
= self.new_state(&sparse
);
227 let id
= self.add_state(state
)?
;
228 self.dfa
.set_start_state(id
);
232 /// Add the given state to the DFA and make it available in the cache.
234 /// The state initially has no transitions. That is, it transitions to the
235 /// dead state for all possible inputs.
236 fn add_state(&mut self, state
: State
) -> Result
<S
> {
237 let id
= self.dfa
.add_empty_state()?
;
238 let rstate
= Rc
::new(state
);
239 self.builder_states
.push(rstate
.clone());
240 self.cache
.insert(rstate
, id
);
244 /// Convert the given set of ordered NFA states to a DFA state.
245 fn new_state(&mut self, set
: &SparseSet
) -> State
{
246 let mut state
= State
{
248 nfa_states
: mem
::replace(&mut self.scratch_nfa_states
, vec
![]),
250 state
.nfa_states
.clear();
253 match *self.nfa
.state(id
) {
254 nfa
::State
::Range { .. }
=> {
255 state
.nfa_states
.push(id
);
257 nfa
::State
::Sparse { .. }
=> {
258 state
.nfa_states
.push(id
);
260 nfa
::State
::Fail
=> {
263 nfa
::State
::Match
=> {
264 state
.is_match
= true;
265 if !self.longest_match
{
269 nfa
::State
::Union { .. }
=> {}
275 /// Create a new sparse set with enough capacity to hold all NFA states.
276 fn new_sparse_set(&self) -> SparseSet
{
277 SparseSet
::new(self.nfa
.len())
282 /// Create a new empty dead state.
284 State { nfa_states: vec![], is_match: false }