7 dfa
::{dense, Error, DEAD}
,
11 alphabet
::{self, ByteSet}
,
12 determinize
::{State, StateBuilderEmpty, StateBuilderNFA}
,
13 id
::{PatternID, StateID}
,
14 matchtypes
::MatchKind
,
15 sparse_set
::{SparseSet, SparseSets}
,
20 /// A builder for configuring and running a DFA determinizer.
21 #[derive(Clone, Debug)]
22 pub(crate) struct Config
{
24 match_kind
: MatchKind
,
26 dfa_size_limit
: Option
<usize>,
27 determinize_size_limit
: Option
<usize>,
31 /// Create a new default config for a determinizer. The determinizer may be
32 /// configured before calling `run`.
33 pub fn new() -> Config
{
36 match_kind
: MatchKind
::LeftmostFirst
,
37 quit
: ByteSet
::empty(),
39 determinize_size_limit
: None
,
43 /// Run determinization on the given NFA and write the resulting DFA into
44 /// the one given. The DFA given should be initialized but otherwise empty.
45 /// "Initialized" means that it is setup to handle the NFA's byte classes,
46 /// number of patterns and whether to build start states for each pattern.
50 dfa
: &mut dense
::OwnedDFA
,
51 ) -> Result
<(), Error
> {
52 let dead
= State
::dead();
53 let quit
= State
::dead();
54 let mut cache
= StateMap
::default();
55 // We only insert the dead state here since its representation is
56 // identical to the quit state. And we never want anything pointing
57 // to the quit state other than specific transitions derived from the
58 // determinizer's configured "quit" bytes.
60 // We do put the quit state into 'builder_states' below. This ensures
61 // that a proper DFA state ID is allocated for it, and that no other
62 // DFA state uses the "location after the DEAD state." That is, it
63 // is assumed that the quit state is always the state immediately
64 // following the DEAD state.
65 cache
.insert(dead
.clone(), DEAD
);
71 builder_states
: alloc
::vec
![dead
, quit
],
73 memory_usage_state
: 0,
74 sparses
: SparseSets
::new(nfa
.len()),
76 scratch_state_builder
: StateBuilderEmpty
::new(),
81 /// Whether to build an anchored DFA or not. When disabled (the default),
82 /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
83 /// the anchored start state of the NFA is used to start the DFA.
84 pub fn anchored(&mut self, yes
: bool
) -> &mut Config
{
89 /// The match semantics to use for determinization.
91 /// MatchKind::All corresponds to the standard textbook construction.
92 /// All possible match states are represented in the DFA.
93 /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
94 /// simulate the match semantics of backtracking regex engines. Namely,
95 /// only a subset of match states are built, and dead states are used to
96 /// stop searches with an unanchored prefix.
98 /// The default is MatchKind::LeftmostFirst.
99 pub fn match_kind(&mut self, kind
: MatchKind
) -> &mut Config
{
100 self.match_kind
= kind
;
104 /// The set of bytes to use that will cause the DFA to enter a quit state,
105 /// stop searching and return an error. By default, this is empty.
106 pub fn quit(&mut self, set
: ByteSet
) -> &mut Config
{
111 /// The limit, in bytes of the heap, that the DFA is permitted to use. This
112 /// does not include the auxiliary heap storage used by determinization.
113 pub fn dfa_size_limit(&mut self, bytes
: Option
<usize>) -> &mut Config
{
114 self.dfa_size_limit
= bytes
;
118 /// The limit, in bytes of the heap, that determinization itself is allowed
119 /// to use. This does not include the size of the DFA being built.
120 pub fn determinize_size_limit(
122 bytes
: Option
<usize>,
124 self.determinize_size_limit
= bytes
;
129 /// The actual implementation of determinization that converts an NFA to a DFA
130 /// through powerset construction.
132 /// This determinizer roughly follows the typical powerset construction, where
133 /// each DFA state is comprised of one or more NFA states. In the worst case,
134 /// there is one DFA state for every possible combination of NFA states. In
135 /// practice, this only happens in certain conditions, typically when there are
136 /// bounded repetitions.
138 /// The main differences between this implementation and typical deteminization
139 /// are that this implementation delays matches by one state and hackily makes
140 /// look-around work. Comments below attempt to explain this.
142 /// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
143 /// whichever is shorter.
146 /// The configuration used to initialize determinization.
148 /// The NFA we're converting into a DFA.
149 nfa
: &'a thompson
::NFA
,
150 /// The DFA we're building.
151 dfa
: &'a
mut dense
::OwnedDFA
,
152 /// Each DFA state being built is defined as an *ordered* set of NFA
153 /// states, along with some meta facts about the ordered set of NFA states.
155 /// This is never empty. The first state is always a dummy state such that
156 /// a state id == 0 corresponds to a dead state. The second state is always
159 /// Why do we have states in both a `Vec` and in a cache map below?
160 /// Well, they serve two different roles based on access patterns.
161 /// `builder_states` is the canonical home of each state, and provides
162 /// constant random access by a DFA state's ID. The cache map below, on
163 /// the other hand, provides a quick way of searching for identical DFA
164 /// states by using the DFA state as a key in the map. Of course, we use
165 /// reference counting to avoid actually duplicating the state's data
166 /// itself. (Although this has never been benchmarked.) Note that the cache
167 /// map does not give us full minimization; it just lets us avoid some very
168 /// obvious redundant states.
170 /// Note that the index into this Vec isn't quite the DFA's state ID.
171 /// Rather, it's just an index. To get the state ID, you have to multiply
172 /// it by the DFA's stride. That's done by self.dfa.from_index. And the
173 /// inverse is self.dfa.to_index.
175 /// Moreover, DFA states don't usually retain the IDs assigned to them
176 /// by their position in this Vec. After determinization completes,
177 /// states are shuffled around to support other optimizations. See the
178 /// sibling 'special' module for more details on that. (The reason for
179 /// mentioning this is that if you print out the DFA for debugging during
180 /// determinization, and then print out the final DFA after it is fully
181 /// built, then the state IDs likely won't match up.)
182 builder_states
: Vec
<State
>,
183 /// A cache of DFA states that already exist and can be easily looked up
184 /// via ordered sets of NFA states.
186 /// See `builder_states` docs for why we store states in two different
189 /// The memory usage, in bytes, used by builder_states and cache. We track
190 /// this as new states are added since states use a variable amount of
191 /// heap. Tracking this as we add states makes it possible to compute the
192 /// total amount of memory used by the determinizer in constant time.
193 memory_usage_state
: usize,
194 /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
195 /// These are reused throughout determinization. A bounded sparse set
196 /// gives us constant time insertion, membership testing and clearing.
198 /// Scratch space for a stack of NFA states to visit, for depth first
199 /// visiting without recursion.
201 /// Scratch space for storing an ordered sequence of NFA states, for
202 /// amortizing allocation. This is principally useful for when we avoid
203 /// adding a new DFA state since it already exists. In order to detect this
204 /// case though, we still need an ordered set of NFA state IDs. So we use
205 /// this space to stage that ordered set before we know whether we need to
206 /// create a new DFA state or not.
207 scratch_state_builder
: StateBuilderEmpty
,
210 /// A map from states to state identifiers. When using std, we use a standard
211 /// hashmap, since it's a bit faster for this use case. (Other maps, like
212 /// one's based on FNV, have not yet been benchmarked.)
214 /// The main purpose of this map is to reuse states where possible. This won't
215 /// fully minimize the DFA, but it works well in a lot of cases.
216 #[cfg(feature = "std")]
217 type StateMap
= std
::collections
::HashMap
<State
, StateID
>;
218 #[cfg(not(feature = "std"))]
219 type StateMap
= BTreeMap
<State
, StateID
>;
221 impl<'a
> Runner
<'a
> {
222 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
223 /// the chosen state identifier representation is too small), then an error
225 fn run(mut self) -> Result
<(), Error
> {
226 if self.nfa
.has_word_boundary_unicode()
227 && !self.config
.quit
.contains_range(0x80, 0xFF)
229 return Err(Error
::unsupported_dfa_word_boundary_unicode());
232 // A sequence of "representative" bytes drawn from each equivalence
233 // class. These representative bytes are fed to the NFA to compute
234 // state transitions. This allows us to avoid re-computing state
235 // transitions for bytes that are guaranteed to produce identical
237 let representatives
: Vec
<alphabet
::Unit
> =
238 self.dfa
.byte_classes().representatives().collect();
239 // The set of all DFA state IDs that still need to have their
240 // transitions set. We start by seeding this with all starting states.
241 let mut uncompiled
= alloc
::vec
![];
242 self.add_all_starts(&mut uncompiled
)?
;
243 while let Some(dfa_id
) = uncompiled
.pop() {
244 for &unit
in &representatives
{
245 if unit
.as_u8().map_or(false, |b
| self.config
.quit
.contains(b
))
249 // In many cases, the state we transition to has already been
250 // computed. 'cached_state' will do the minimal amount of work
251 // to check this, and if it exists, immediately return an
252 // already existing state ID.
253 let (next_dfa_id
, is_new
) = self.cached_state(dfa_id
, unit
)?
;
254 self.dfa
.set_transition(dfa_id
, unit
, next_dfa_id
);
255 // If the state ID we got back is newly created, then we need
256 // to compile it, so add it to our uncompiled frontier.
258 uncompiled
.push(next_dfa_id
);
263 "determinization complete, memory usage: {}, dense DFA size: {}",
265 self.dfa
.memory_usage(),
268 // A map from DFA state ID to one or more NFA match IDs. Each NFA match
269 // ID corresponds to a distinct regex pattern that matches in the state
270 // corresponding to the key.
271 let mut matches
: BTreeMap
<StateID
, Vec
<PatternID
>> = BTreeMap
::new();
273 #[allow(unused_variables)]
274 let mut total_pat_count
= 0;
275 for (i
, state
) in self.builder_states
.into_iter().enumerate() {
276 if let Some(pat_ids
) = state
.match_pattern_ids() {
277 let id
= self.dfa
.from_index(i
);
278 total_pat_count
+= pat_ids
.len();
279 matches
.insert(id
, pat_ids
);
283 use core
::mem
::size_of
;
284 let per_elem
= size_of
::<StateID
>() + size_of
::<Vec
<PatternID
>>();
285 let pats
= total_pat_count
* size_of
::<PatternID
>();
286 let mem
= (matches
.len() * per_elem
) + pats
;
287 log
::trace
!("matches map built, memory usage: {}", mem
);
289 // At this point, we shuffle the "special" states in the final DFA.
290 // This permits a DFA's match loop to detect a match condition (among
291 // other things) by merely inspecting the current state's identifier,
292 // and avoids the need for any additional auxiliary storage.
293 self.dfa
.shuffle(matches
)?
;
297 /// Return the identifier for the next DFA state given an existing DFA
298 /// state and an input byte. If the next DFA state already exists, then
299 /// return its identifier from the cache. Otherwise, build the state, cache
300 /// it and return its identifier.
302 /// This routine returns a boolean indicating whether a new state was
303 /// built. If a new state is built, then the caller needs to add it to its
304 /// frontier of uncompiled DFA states to compute transitions for.
308 unit
: alphabet
::Unit
,
309 ) -> Result
<(StateID
, bool
), Error
> {
310 // Compute the set of all reachable NFA states, including epsilons.
311 let empty_builder
= self.get_state_builder();
312 let builder
= util
::determinize
::next(
314 self.config
.match_kind
,
317 &self.builder_states
[self.dfa
.to_index(dfa_id
)],
321 self.maybe_add_state(builder
)
324 /// Compute the set of DFA start states and add their identifiers in
325 /// 'dfa_state_ids' (no duplicates are added).
328 dfa_state_ids
: &mut Vec
<StateID
>,
329 ) -> Result
<(), Error
> {
330 // Always add the (possibly unanchored) start states for matching any
331 // of the patterns in this DFA.
332 self.add_start_group(None
, dfa_state_ids
)?
;
333 // We only need to compute anchored start states for each pattern if it
334 // was requested to do so.
335 if self.dfa
.has_starts_for_each_pattern() {
336 for pid
in PatternID
::iter(self.dfa
.pattern_count()) {
337 self.add_start_group(Some(pid
), dfa_state_ids
)?
;
343 /// Add a group of start states for the given match pattern ID. Any new
344 /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
347 /// When pattern_id is None, then this will compile a group of unanchored
348 /// start states (if the DFA is unanchored). When the pattern_id is
349 /// present, then this will compile a group of anchored start states that
350 /// only match the given pattern.
353 pattern_id
: Option
<PatternID
>,
354 dfa_state_ids
: &mut Vec
<StateID
>,
355 ) -> Result
<(), Error
> {
356 let nfa_start
= match pattern_id
{
357 Some(pid
) => self.nfa
.start_pattern(pid
),
358 None
if self.config
.anchored
=> self.nfa
.start_anchored(),
359 None
=> self.nfa
.start_unanchored(),
362 // When compiling start states, we're careful not to build additional
363 // states that aren't necessary. For example, if the NFA has no word
364 // boundary assertion, then there's no reason to have distinct start
365 // states for 'NonWordByte' and 'WordByte' starting configurations.
366 // Instead, the 'WordByte' starting configuration can just point
367 // directly to the start state for the 'NonWordByte' config.
370 self.add_one_start(nfa_start
, Start
::NonWordByte
)?
;
371 self.dfa
.set_start_state(Start
::NonWordByte
, pattern_id
, id
);
373 dfa_state_ids
.push(id
);
376 if !self.nfa
.has_word_boundary() {
377 self.dfa
.set_start_state(Start
::WordByte
, pattern_id
, id
);
380 self.add_one_start(nfa_start
, Start
::WordByte
)?
;
381 self.dfa
.set_start_state(Start
::WordByte
, pattern_id
, id
);
383 dfa_state_ids
.push(id
);
386 if !self.nfa
.has_any_anchor() {
387 self.dfa
.set_start_state(Start
::Text
, pattern_id
, id
);
388 self.dfa
.set_start_state(Start
::Line
, pattern_id
, id
);
390 let (id
, is_new
) = self.add_one_start(nfa_start
, Start
::Text
)?
;
391 self.dfa
.set_start_state(Start
::Text
, pattern_id
, id
);
393 dfa_state_ids
.push(id
);
396 let (id
, is_new
) = self.add_one_start(nfa_start
, Start
::Line
)?
;
397 self.dfa
.set_start_state(Start
::Line
, pattern_id
, id
);
399 dfa_state_ids
.push(id
);
406 /// Add a new DFA start state corresponding to the given starting NFA
407 /// state, and the starting search configuration. (The starting search
408 /// configuration essentially tells us which look-behind assertions are
409 /// true for this particular state.)
411 /// The boolean returned indicates whether the state ID returned is a newly
412 /// created state, or a previously cached state.
417 ) -> Result
<(StateID
, bool
), Error
> {
418 // Compute the look-behind assertions that are true in this starting
419 // configuration, and the determine the epsilon closure. While
420 // computing the epsilon closure, we only follow condiional epsilon
421 // transitions that satisfy the look-behind assertions in 'facts'.
422 let mut builder_matches
= self.get_state_builder().into_matches();
423 util
::determinize
::set_lookbehind_from_start(
425 &mut builder_matches
,
427 self.sparses
.set1
.clear();
428 util
::determinize
::epsilon_closure(
431 *builder_matches
.look_have(),
433 &mut self.sparses
.set1
,
435 let mut builder
= builder_matches
.into_nfa();
436 util
::determinize
::add_nfa_states(
441 self.maybe_add_state(builder
)
444 /// Adds the given state to the DFA being built depending on whether it
445 /// already exists in this determinizer's cache.
447 /// If it does exist, then the memory used by 'state' is put back into the
448 /// determinizer and the previously created state's ID is returned. (Along
449 /// with 'false', indicating that no new state was added.)
451 /// If it does not exist, then the state is added to the DFA being built
452 /// and a fresh ID is allocated (if ID allocation fails, then an error is
453 /// returned) and returned. (Along with 'true', indicating that a new state
457 builder
: StateBuilderNFA
,
458 ) -> Result
<(StateID
, bool
), Error
> {
459 if let Some(&cached_id
) = self.cache
.get(builder
.as_bytes()) {
460 // Since we have a cached state, put the constructed state's
461 // memory back into our scratch space, so that it can be reused.
462 self.put_state_builder(builder
);
463 return Ok((cached_id
, false));
465 self.add_state(builder
).map(|sid
| (sid
, true))
468 /// Add the given state to the DFA and make it available in the cache.
470 /// The state initially has no transitions. That is, it transitions to the
471 /// dead state for all possible inputs, and transitions to the quit state
472 /// for all quit bytes.
474 /// If adding the state would exceed the maximum value for StateID, then an
475 /// error is returned.
478 builder
: StateBuilderNFA
,
479 ) -> Result
<StateID
, Error
> {
480 let id
= self.dfa
.add_empty_state()?
;
481 if !self.config
.quit
.is_empty() {
482 for b
in self.config
.quit
.iter() {
483 self.dfa
.set_transition(
485 alphabet
::Unit
::u8(b
),
490 let state
= builder
.to_state();
491 // States use reference counting internally, so we only need to count
492 // their memroy usage once.
493 self.memory_usage_state
+= state
.memory_usage();
494 self.builder_states
.push(state
.clone());
495 self.cache
.insert(state
, id
);
496 self.put_state_builder(builder
);
497 if let Some(limit
) = self.config
.dfa_size_limit
{
498 if self.dfa
.memory_usage() > limit
{
499 return Err(Error
::dfa_exceeded_size_limit(limit
));
502 if let Some(limit
) = self.config
.determinize_size_limit
{
503 if self.memory_usage() > limit
{
504 return Err(Error
::determinize_exceeded_size_limit(limit
));
510 /// Returns a state builder from this determinizer that might have existing
511 /// capacity. This helps avoid allocs in cases where a state is built that
512 /// turns out to already be cached.
514 /// Callers must put the state builder back with 'put_state_builder',
515 /// otherwise the allocation reuse won't work.
516 fn get_state_builder(&mut self) -> StateBuilderEmpty
{
518 &mut self.scratch_state_builder
,
519 StateBuilderEmpty
::new(),
523 /// Puts the given state builder back into this determinizer for reuse.
525 /// Note that building a 'State' from a builder always creates a new
526 /// alloc, so callers should always put the builder back.
527 fn put_state_builder(&mut self, builder
: StateBuilderNFA
) {
528 let _
= core
::mem
::replace(
529 &mut self.scratch_state_builder
,
534 /// Return the memory usage, in bytes, of this determinizer at the current
535 /// point in time. This does not include memory used by the NFA or the
536 /// dense DFA itself.
537 fn memory_usage(&self) -> usize {
538 use core
::mem
::size_of
;
540 self.builder_states
.len() * size_of
::<State
>()
541 // Maps likely use more memory than this, but it's probably close.
542 + self.cache
.len() * (size_of
::<State
>() + size_of
::<StateID
>())
543 + self.memory_usage_state
544 + self.stack
.capacity() * size_of
::<StateID
>()
545 + self.scratch_state_builder
.capacity()