1 use std
::cell
::RefCell
;
7 use state_id
::{dead_id, StateID}
;
9 type DFARepr
<S
> = dense
::Repr
<Vec
<S
>, S
>;
11 /// An implementation of Hopcroft's algorithm for minimizing DFAs.
13 /// The algorithm implemented here is mostly taken from Wikipedia:
14 /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
16 /// This code has had some light optimization attention paid to it,
17 /// particularly in the form of reducing allocation as much as possible.
18 /// However, it is still generally slow. Future optimization work should
19 /// probably focus on the bigger picture rather than micro-optimizations. For
22 /// 1. Figure out how to more intelligently create initial partitions. That is,
23 /// Hopcroft's algorithm starts by creating two partitions of DFA states
24 /// that are known to NOT be equivalent: match states and non-match states.
25 /// The algorithm proceeds by progressively refining these partitions into
26 /// smaller partitions. If we could start with more partitions, then we
27 /// could reduce the amount of work that Hopcroft's algorithm needs to do.
28 /// 2. For every partition that we visit, we find all incoming transitions to
29 /// every state in the partition for *every* element in the alphabet. (This
30 /// is why using byte classes can significantly decrease minimization times,
31 /// since byte classes shrink the alphabet.) This is quite costly and there
32 /// is perhaps some redundant work being performed depending on the specific
33 /// states in the set. For example, we might be able to only visit some
34 /// elements of the alphabet based on the transitions.
35 /// 3. Move parts of minimization into determinization. If minimization has
36 /// fewer states to deal with, then it should run faster. A prime example
37 /// of this might be large Unicode classes, which are generated in way that
38 /// can create a lot of redundant states. (Some work has been done on this
39 /// point during NFA compilation via the algorithm described in the
40 /// "Incremental Construction of MinimalAcyclic Finite-State Automata"
42 pub(crate) struct Minimizer
<'a
, S
: 'a
> {
43 dfa
: &'a
mut DFARepr
<S
>,
44 in_transitions
: Vec
<Vec
<Vec
<S
>>>,
45 partitions
: Vec
<StateSet
<S
>>,
46 waiting
: Vec
<StateSet
<S
>>,
49 impl<'a
, S
: StateID
> fmt
::Debug
for Minimizer
<'a
, S
> {
50 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
51 f
.debug_struct("Minimizer")
52 .field("dfa", &self.dfa
)
53 .field("in_transitions", &self.in_transitions
)
54 .field("partitions", &self.partitions
)
55 .field("waiting", &self.waiting
)
60 /// A set of states. A state set makes up a single partition in Hopcroft's
63 /// It is represented by an ordered set of state identifiers. We use shared
64 /// ownership so that a single state set can be in both the set of partitions
65 /// and in the set of waiting sets simultaneously without an additional
66 /// allocation. Generally, once a state set is built, it becomes immutable.
68 /// We use this representation because it avoids the overhead of more
69 /// traditional set data structures (HashSet/BTreeSet), and also because
70 /// computing intersection/subtraction on this representation is especially
72 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
73 struct StateSet
<S
>(Rc
<RefCell
<Vec
<S
>>>);
75 impl<'a
, S
: StateID
> Minimizer
<'a
, S
> {
76 pub fn new(dfa
: &'a
mut DFARepr
<S
>) -> Minimizer
<'a
, S
> {
77 let in_transitions
= Minimizer
::incoming_transitions(dfa
);
78 let partitions
= Minimizer
::initial_partitions(dfa
);
79 let waiting
= vec
![partitions
[0].clone()];
81 Minimizer { dfa, in_transitions, partitions, waiting }
84 pub fn run(mut self) {
85 let mut incoming
= StateSet
::empty();
86 let mut scratch1
= StateSet
::empty();
87 let mut scratch2
= StateSet
::empty();
88 let mut newparts
= vec
![];
90 while let Some(set
) = self.waiting
.pop() {
91 for b
in (0..self.dfa
.alphabet_len()).map(|b
| b
as u8) {
92 self.find_incoming_to(b
, &set
, &mut incoming
);
94 for p
in 0..self.partitions
.len() {
95 self.partitions
[p
].intersection(&incoming
, &mut scratch1
);
96 if scratch1
.is_empty() {
97 newparts
.push(self.partitions
[p
].clone());
101 self.partitions
[p
].subtract(&incoming
, &mut scratch2
);
102 if scratch2
.is_empty() {
103 newparts
.push(self.partitions
[p
].clone());
108 (scratch1
.deep_clone(), scratch2
.deep_clone());
109 newparts
.push(x
.clone());
110 newparts
.push(y
.clone());
111 match self.find_waiting(&self.partitions
[p
]) {
114 self.waiting
.push(y
);
117 if x
.len() <= y
.len() {
118 self.waiting
.push(x
);
120 self.waiting
.push(y
);
125 newparts
= mem
::replace(&mut self.partitions
, newparts
);
130 // At this point, we now have a minimal partitioning of states, where
131 // each partition is an equivalence class of DFA states. Now we need to
132 // use this partioning to update the DFA to only contain one state for
135 // Create a map from DFA state ID to the representative ID of the
136 // equivalence class to which it belongs. The representative ID of an
137 // equivalence class of states is the minimum ID in that class.
138 let mut state_to_part
= vec
![dead_id(); self.dfa
.state_count()];
139 for p
in &self.partitions
{
140 p
.iter(|id
| state_to_part
[id
.to_usize()] = p
.min());
143 // Generate a new contiguous sequence of IDs for minimal states, and
144 // create a map from equivalence IDs to the new IDs. Thus, the new
145 // minimal ID of *any* state in the unminimized DFA can be obtained
146 // with minimals_ids[state_to_part[old_id]].
147 let mut minimal_ids
= vec
![dead_id(); self.dfa
.state_count()];
148 let mut new_id
= S
::from_usize(0);
149 for (id
, _
) in self.dfa
.states() {
150 if state_to_part
[id
.to_usize()] == id
{
151 minimal_ids
[id
.to_usize()] = new_id
;
152 new_id
= S
::from_usize(new_id
.to_usize() + 1);
155 // The total number of states in the minimal DFA.
156 let minimal_count
= new_id
.to_usize();
158 // Re-map this DFA in place such that the only states remaining
159 // correspond to the representative states of every equivalence class.
160 for id
in (0..self.dfa
.state_count()).map(S
::from_usize
) {
161 // If this state isn't a representative for an equivalence class,
162 // then we skip it since it won't appear in the minimal DFA.
163 if state_to_part
[id
.to_usize()] != id
{
166 for (_
, next
) in self.dfa
.get_state_mut(id
).iter_mut() {
167 *next
= minimal_ids
[state_to_part
[next
.to_usize()].to_usize()];
169 self.dfa
.swap_states(id
, minimal_ids
[id
.to_usize()]);
171 // Trim off all unused states from the pre-minimized DFA. This
172 // represents all states that were merged into a non-singleton
173 // equivalence class of states, and appeared after the first state
174 // in each such class. (Because the state with the smallest ID in each
175 // equivalence class is its representative ID.)
176 self.dfa
.truncate_states(minimal_count
);
178 // Update the new start state, which is now just the minimal ID of
179 // whatever state the old start state was collapsed into.
180 let old_start
= self.dfa
.start_state();
181 self.dfa
.set_start_state(
182 minimal_ids
[state_to_part
[old_start
.to_usize()].to_usize()],
185 // In order to update the ID of the maximum match state, we need to
186 // find the maximum ID among all of the match states in the minimized
187 // DFA. This is not necessarily the new ID of the unminimized maximum
188 // match state, since that could have been collapsed with a much
189 // earlier match state. Therefore, to find the new max match state,
190 // we iterate over all previous match states, find their corresponding
191 // new minimal ID, and take the maximum of those.
192 let old_max
= self.dfa
.max_match_state();
193 self.dfa
.set_max_match_state(dead_id());
194 for id
in (0..(old_max
.to_usize() + 1)).map(S
::from_usize
) {
195 let part
= state_to_part
[id
.to_usize()];
196 let new_id
= minimal_ids
[part
.to_usize()];
197 if new_id
> self.dfa
.max_match_state() {
198 self.dfa
.set_max_match_state(new_id
);
203 fn find_waiting(&self, set
: &StateSet
<S
>) -> Option
<usize> {
204 self.waiting
.iter().position(|s
| s
== set
)
211 incoming
: &mut StateSet
<S
>,
215 for &inid
in &self.in_transitions
[id
.to_usize()][b
as usize] {
219 incoming
.canonicalize();
222 fn initial_partitions(dfa
: &DFARepr
<S
>) -> Vec
<StateSet
<S
>> {
223 let mut is_match
= StateSet
::empty();
224 let mut no_match
= StateSet
::empty();
225 for (id
, _
) in dfa
.states() {
226 if dfa
.is_match_state(id
) {
233 let mut sets
= vec
![is_match
];
234 if !no_match
.is_empty() {
237 sets
.sort_by_key(|s
| s
.len());
241 fn incoming_transitions(dfa
: &DFARepr
<S
>) -> Vec
<Vec
<Vec
<S
>>> {
242 let mut incoming
= vec
![];
243 for _
in dfa
.states() {
244 incoming
.push(vec
![vec
![]; dfa
.alphabet_len()]);
246 for (id
, state
) in dfa
.states() {
247 for (b
, next
) in state
.transitions() {
248 incoming
[next
.to_usize()][b
as usize].push(id
);
255 impl<S
: StateID
> StateSet
<S
> {
256 fn empty() -> StateSet
<S
> {
257 StateSet(Rc
::new(RefCell
::new(vec
![])))
260 fn add(&mut self, id
: S
) {
261 self.0.borrow_mut().push(id
);
268 fn canonicalize(&mut self) {
269 self.0.borrow_mut().sort();
270 self.0.borrow_mut().dedup();
273 fn clear(&mut self) {
274 self.0.borrow_mut().clear();
277 fn len(&self) -> usize {
278 self.0.borrow().len()
281 fn is_empty(&self) -> bool
{
285 fn deep_clone(&self) -> StateSet
<S
> {
286 let ids
= self.0.borrow().iter().cloned().collect();
287 StateSet(Rc
::new(RefCell
::new(ids
)))
290 fn iter
<F
: FnMut(S
)>(&self, mut f
: F
) {
291 for &id
in self.0.borrow().iter() {
296 fn intersection(&self, other
: &StateSet
<S
>, dest
: &mut StateSet
<S
>) {
298 if self.is_empty() || other
.is_empty() {
302 let (seta
, setb
) = (self.0.borrow(), other
.0.borrow());
303 let (mut ita
, mut itb
) = (seta
.iter().cloned(), setb
.iter().cloned());
304 let (mut a
, mut b
) = (ita
.next().unwrap(), itb
.next().unwrap());
308 a
= match ita
.next() {
312 b
= match itb
.next() {
317 a
= match ita
.next() {
322 b
= match itb
.next() {
330 fn subtract(&self, other
: &StateSet
<S
>, dest
: &mut StateSet
<S
>) {
332 if self.is_empty() || other
.is_empty() {
333 self.iter(|s
| dest
.add(s
));
337 let (seta
, setb
) = (self.0.borrow(), other
.0.borrow());
338 let (mut ita
, mut itb
) = (seta
.iter().cloned(), setb
.iter().cloned());
339 let (mut a
, mut b
) = (ita
.next().unwrap(), itb
.next().unwrap());
342 a
= match ita
.next() {
346 b
= match itb
.next() {
355 a
= match ita
.next() {
360 b
= match itb
.next() {