/*!
This module contains types and routines for implementing determinization.

In this crate, there are at least two places where we implement
determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
lazily compiled DFAs in the `hybrid` module. The stuff in this module
corresponds to the things that are in common between these implementations.

There are three broad things that our implementations of determinization have
in common, as defined by this module:

* The classification of start states. That is, whether we're dealing with
word boundaries, line boundaries, etc., is all the same. This also includes
the look-behind assertions that are satisfied by each starting state
classification.

* The representation of DFA states as sets of NFA states, including
convenience types for building these DFA states that are amenable to reusing
allocations.

* Routines for the "classical" parts of determinization: computing the
epsilon closure, tracking match states (with corresponding pattern IDs, since
we support multi-pattern finite automata) and, of course, computing the
transition function between states for units of input.

I did consider a couple of alternatives to this particular form of code reuse:

1. Don't do any code reuse. The problem here is that we *really* want both
forms of determinization to do exactly identical things when it comes to
their handling of NFA states. While our tests generally ensure this, the code
is tricky and large enough that not reusing code is a pretty big bummer.

2. Implement all of determinization once and make it generic over fully
compiled DFAs and lazily compiled DFAs. While I didn't actually try this
approach, my instinct is that it would be more complex than is needed here.
And the interface required would be pretty hairy. Instead, I think splitting
it into logical sub-components works better.
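
The "classical" part above can be pictured with a deliberately tiny sketch.
To be clear, this is *not* this crate's representation or API (the real code
below uses `SparseSet`s, an explicit stack instead of recursion, look-around
conditions and multi-pattern match tracking); the toy `ToyState` type and the
two functions are hypothetical and only show the shape of subset construction:

```ignore
// A toy NFA: states are indices into a slice.
enum ToyState {
    // Consume exactly `byte`, then go to `next`.
    Byte { byte: u8, next: usize },
    // Epsilon-split to every state in `alternates`.
    Union { alternates: Vec<usize> },
    // A match state.
    Match,
}

// Every state reachable from `id` without consuming input (including `id`).
fn epsilon_closure(nfa: &[ToyState], id: usize, set: &mut Vec<usize>) {
    if set.contains(&id) {
        return;
    }
    set.push(id);
    if let ToyState::Union { alternates } = &nfa[id] {
        for &alt in alternates {
            epsilon_closure(nfa, alt, set);
        }
    }
}

// One determinization step: from a DFA state (an ordered set of NFA state
// IDs), compute the DFA state reached by consuming the byte `unit`.
fn next(nfa: &[ToyState], dfa_state: &[usize], unit: u8) -> Vec<usize> {
    let mut next_state = vec![];
    for &id in dfa_state {
        if let ToyState::Byte { byte, next } = &nfa[id] {
            if *byte == unit {
                epsilon_closure(nfa, *next, &mut next_state);
            }
        }
    }
    next_state
}
```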
*/

use alloc::vec::Vec;

pub(crate) use self::state::{
    State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
};

use crate::{
    nfa::thompson::{self, Look, LookSet},
    util::{
        alphabet,
        id::StateID,
        matchtypes::MatchKind,
        sparse_set::{SparseSet, SparseSets},
        start::Start,
    },
};

mod state;

/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
/// to whether it is a match state. For multi-pattern DFAs, the builder will
/// also include the pattern IDs that match (in the order seen).
///
/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
/// must have capacity equivalent to `nfa.len()`.
///
/// `match_kind` should correspond to the match semantics implemented by the
/// DFA being built. Generally speaking, for leftmost-first match semantics,
/// states that appear after the first NFA match state will not be included in
/// the `StateBuilderNFA` returned since they are impossible to visit.
///
/// `sparses` is used as scratch space for NFA traversal. Other than their
/// capacity requirements (detailed above), there are no requirements on what's
/// contained within them (if anything). Similarly, what's inside of them once
/// this routine returns is unspecified.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// `state` corresponds to the current DFA state on which one wants to compute
/// the transition for the input `unit`.
///
/// `empty_builder` corresponds to the builder allocation to use to produce a
/// complete `StateBuilderNFA` state. If the state is not needed (or is already
/// cached), then it can be cleared and reused without needing to create a new
/// `State`. The `StateBuilderNFA` state returned is final and ready to be
/// turned into a `State` if necessary.
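///
/// As a rough sketch (a summary of the code below, not a separate API), the
/// computation proceeds as follows:
///
/// ```text
/// seed sparses.set1 with the NFA state IDs in `state`
/// if `unit` newly satisfies look-around assertions that `state` depends on:
///     recompute the epsilon closure of set1 under those assertions
/// for each NFA state ID in set1:
///     Match        => record its pattern ID (matches are delayed by one byte)
///     Range/Sparse => if it matches `unit`, add the epsilon closure of its
///                     successor to set2
///     other states => skip (epsilon states were already handled)
/// build and return the new state from set2
/// ```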
pub(crate) fn next(
    nfa: &thompson::NFA,
    match_kind: MatchKind,
    sparses: &mut SparseSets,
    stack: &mut Vec<StateID>,
    state: &State,
    unit: alphabet::Unit,
    empty_builder: StateBuilderEmpty,
) -> StateBuilderNFA {
    sparses.clear();

    // Put the NFA state IDs into a sparse set in case we need to
    // re-compute their epsilon closure.
    //
    // Doing this state shuffling is technically not necessary unless some
    // kind of look-around is used in the DFA. Some ad hoc experiments
    // suggested that avoiding this didn't lead to much of an improvement,
    // but perhaps more rigorous experimentation should be done. And in
    // particular, avoiding this check requires some light refactoring of
    // the code below.
    state.iter_nfa_state_ids(|nfa_id| {
        sparses.set1.insert(nfa_id);
    });

    // Compute look-ahead assertions originating from the current state.
    // Based on the input unit we're transitioning over, some additional
    // set of assertions may be true. Thus, we re-compute this state's
    // epsilon closure (but only if necessary).
    if !state.look_need().is_empty() {
        // Add look-ahead assertions that are now true based on the current
        // input unit.
        let mut look_have = state.look_have().clone();
        match unit.as_u8() {
            Some(b'\n') => {
                look_have.insert(Look::EndLine);
            }
            Some(_) => {}
            None => {
                look_have.insert(Look::EndText);
                look_have.insert(Look::EndLine);
            }
        }
        if state.is_from_word() == unit.is_word_byte() {
            look_have.insert(Look::WordBoundaryUnicodeNegate);
            look_have.insert(Look::WordBoundaryAsciiNegate);
        } else {
            look_have.insert(Look::WordBoundaryUnicode);
            look_have.insert(Look::WordBoundaryAscii);
        }
        // If we have new assertions satisfied that are among the set of
        // assertions that exist in this state (that is, just because
        // we added an EndLine assertion above doesn't mean there is an
        // EndLine conditional epsilon transition in this state), then we
        // re-compute this state's epsilon closure using the updated set of
        // assertions.
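        //
        // For example: if this state has a conditional epsilon transition on
        // EndLine (so EndLine is in look_need), EndLine was not previously in
        // look_have and `unit` is b'\n', then EndLine is both newly satisfied
        // and relevant, so the closure must be re-computed.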
        if !look_have
            .subtract(state.look_have())
            .intersect(state.look_need())
            .is_empty()
        {
            for nfa_id in &sparses.set1 {
                epsilon_closure(
                    nfa,
                    nfa_id,
                    look_have,
                    stack,
                    &mut sparses.set2,
                );
            }
            sparses.swap();
            sparses.set2.clear();
        }
    }

    // Convert our empty builder into one that can record assertions and match
    // pattern IDs.
    let mut builder = empty_builder.into_matches();
    // Set whether the StartLine look-behind assertion is true for this
    // transition or not. The look-behind assertion for ASCII word boundaries
    // is handled below.
    if nfa.has_any_anchor() {
        if unit.as_u8().map_or(false, |b| b == b'\n') {
            // Why only handle StartLine here and not StartText? That's
            // because StartText can only impact the starting state, which
            // is special cased in start state handling.
            builder.look_have().insert(Look::StartLine);
        }
    }
    for nfa_id in &sparses.set1 {
        match *nfa.state(nfa_id) {
            thompson::State::Union { .. }
            | thompson::State::Fail
            | thompson::State::Look { .. }
            | thompson::State::Capture { .. } => {}
            thompson::State::Match { id } => {
                // Notice here that we are calling the NEW state a match
                // state if the OLD state we are transitioning from
                // contains an NFA match state. This is precisely how we
                // delay all matches by one byte and also what therefore
                // guarantees that starting states cannot be match states.
                //
                // If we didn't delay matches by one byte, then whether
                // a DFA is a matching state or not would be determined
                // by whether one of its own constituent NFA states
                // was a match state. (And that would be done in
                // 'add_nfa_states'.)
                //
                // Also, 'add_match_pattern_id' requires that callers never
                // pass duplicative pattern IDs. We do in fact uphold that
                // guarantee here, but it's subtle. In particular, a Thompson
                // NFA guarantees that each pattern has exactly one match
                // state. Moreover, since we're iterating over the NFA state
                // IDs in a set, we are guaranteed not to have any duplicative
                // match states. Thus, it is impossible to add the same pattern
                // ID more than once.
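                //
                // As an illustration of the delay: for the pattern 'a'
                // searching the haystack 'ab', the DFA state entered after
                // consuming 'a' contains the NFA match state, so it is the
                // state built *here* (the one entered after 'b' or EOI) that
                // gets flagged as matching; the search routines account for
                // this one byte delay when reporting the match offset.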
                builder.add_match_pattern_id(id);
                if !match_kind.continue_past_first_match() {
                    break;
                }
            }
            thompson::State::Range { range: ref r } => {
                if r.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        r.next,
                        *builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Sparse(ref sparse) => {
                if let Some(next) = sparse.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        *builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
        }
    }
    // We only set the "from word" flag if there's a word boundary look-around
    // anywhere in this regex. Otherwise, there's no point in bloating the
    // number of states if we don't have one.
    //
    // We also only set it when the state has a non-zero number of NFA states.
    // Otherwise, we could wind up with states that *should* be DEAD states
    // but are otherwise distinct from DEAD states because of this look-behind
    // assertion being set. While this can't technically impact correctness *in
    // theory*, it can create pathological DFAs that consume input until EOI or
    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
    // but a (serious) perf problem. Hitting a quit byte, however, could be a
    // correctness problem since it could cause search routines to report an
    // error instead of a detected match once the quit state is entered. (The
    // search routine could be made to be a bit smarter by reporting a match
    // if one was detected once it enters a quit state (and indeed, the search
    // routines in this crate do just that), but it seems better to prevent
    // these things by construction if possible.)
    if nfa.has_word_boundary()
        && unit.is_word_byte()
        && !sparses.set2.is_empty()
    {
        builder.set_is_from_word();
    }
    let mut builder_nfa = builder.into_nfa();
    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
    builder_nfa
}

/// Compute the epsilon closure for the given NFA state. The epsilon closure
/// consists of all NFA state IDs, including `start_nfa_id`, that can be
/// reached from `start_nfa_id` without consuming any input. These state IDs
/// are written to `set` in the order they are visited, but only if they are
/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
/// given.
///
/// `look_have` consists of the satisfied assertions at the current
/// position. For conditional look-around epsilon transitions, these are
/// only followed if they are satisfied by `look_have`.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
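///
/// As a rough illustration (with made-up state IDs, not the result of
/// compiling any particular pattern in this crate): if state `0` is a
/// `Union` with alternates `[1, 2]` and states `1` and `2` are `Range`
/// states, then:
///
/// ```text
/// epsilon_closure(0) = {0, 1, 2}   // the union itself and both alternates
/// epsilon_closure(1) = {1}         // a Range state is not an epsilon state
/// ```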
pub(crate) fn epsilon_closure(
    nfa: &thompson::NFA,
    start_nfa_id: StateID,
    look_have: LookSet,
    stack: &mut Vec<StateID>,
    set: &mut SparseSet,
) {
    assert!(stack.is_empty());
    // If this isn't an epsilon state, then the epsilon closure is always just
    // itself, so there's no need to spin up the machinery below to handle it.
    if !nfa.state(start_nfa_id).is_epsilon() {
        set.insert(start_nfa_id);
        return;
    }

    stack.push(start_nfa_id);
    while let Some(mut id) = stack.pop() {
        // In many cases, we can avoid stack operations when an NFA state only
        // adds one new state to visit. In that case, we just set our ID to
        // that state and mush on. We only use the stack when an NFA state
        // introduces multiple new states to visit.
        loop {
            // Insert this NFA state, and if it's already in the set and thus
            // already visited, then we can move on to the next one.
            if !set.insert(id) {
                break;
            }
            match *nfa.state(id) {
                thompson::State::Range { .. }
                | thompson::State::Sparse { .. }
                | thompson::State::Fail
                | thompson::State::Match { .. } => break,
                thompson::State::Look { look, next } => {
                    if !look_have.contains(look) {
                        break;
                    }
                    id = next;
                }
                thompson::State::Union { ref alternates } => {
                    id = match alternates.get(0) {
                        None => break,
                        Some(&id) => id,
                    };
                    // We need to process our alternates in order to preserve
                    // match preferences, so put the earliest alternates closer
                    // to the top of the stack.
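                    // (For example, with alternates [s1, s2, s3], we continue
                    // the inner loop with s1 and push [s3, s2], so that s2 is
                    // popped before s3.)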
                    stack.extend(alternates[1..].iter().rev());
                }
                thompson::State::Capture { next, .. } => {
                    id = next;
                }
            }
        }
    }
}

/// Add the NFA state IDs in the given `set` to the given DFA builder state.
/// The order in which states are added corresponds to the order in which they
/// were added to `set`.
///
/// The DFA builder state given should already have its complete set of match
/// pattern IDs added (if any) and any look-behind assertions (StartLine,
/// StartText and whether this state is being generated for a transition over a
/// word byte when applicable) that are true immediately prior to transitioning
/// into this state (via `builder.look_have()`). The match pattern IDs should
/// correspond to matches that occurred on the previous transition, since all
/// matches are delayed by one byte. The things that should _not_ be set are
/// look-ahead assertions (EndLine, EndText and whether the next byte is a
/// word byte or not). The builder state should also not have anything in
/// `look_need` set, as this routine will compute that for you.
///
/// The given NFA should be able to resolve all identifiers in `set` to a
/// particular NFA state. Additionally, `set` must have capacity equivalent
/// to `nfa.len()`.
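///
/// For reference, this is how each NFA state kind is treated below:
///
/// ```text
/// Range, Sparse  => added to the builder
/// Look           => added, and its assertion is recorded in look_need
/// Match          => added (needed to implement the one byte match delay)
/// Union, Capture => skipped (unconditional epsilon states carry no
///                   distinguishing information)
/// Fail           => stop adding any further states
/// ```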
pub(crate) fn add_nfa_states(
    nfa: &thompson::NFA,
    set: &SparseSet,
    builder: &mut StateBuilderNFA,
) {
    for nfa_id in set {
        match *nfa.state(nfa_id) {
            thompson::State::Range { .. } => {
                builder.add_nfa_state_id(nfa_id);
            }
            thompson::State::Sparse { .. } => {
                builder.add_nfa_state_id(nfa_id);
            }
            thompson::State::Look { look, .. } => {
                builder.add_nfa_state_id(nfa_id);
                builder.look_need().insert(look);
            }
            thompson::State::Union { .. }
            | thompson::State::Capture { .. } => {
                // Pure epsilon transitions don't need to be tracked
                // as part of the DFA state. Tracking them is actually
                // superfluous; they won't cause any harm other than making
                // determinization slower.
                //
                // Why aren't these needed? Well, in an NFA, epsilon
                // transitions are really just jumping points to other
                // states. So once you hit an epsilon transition, the same
                // set of resulting states always appears. Therefore,
                // putting them in a DFA's set of ordered NFA states is
                // strictly redundant.
                //
                // Look-around states are also epsilon transitions, but
                // they are *conditional*. So their presence could be
                // discriminatory, and thus, they are tracked above.
                //
                // But wait... why are epsilon states in our `set` in the
                // first place? Why not just leave them out? They're in
                // our `set` because it was generated by computing an
                // epsilon closure, and we want to keep track of all states
                // we visited to avoid re-visiting them. In exchange, we
                // have to do this second iteration over our collected
                // states to finalize our DFA state.
                //
                // Note that this optimization requires that we re-compute
                // the epsilon closure to account for look-ahead in 'next'
                // *only when necessary*. Namely, only when the set of
                // look-around assertions changes and only when those
                // changes are within the set of assertions that are
                // needed in order to step through the closure correctly.
                // Otherwise, if we re-do the epsilon closure needlessly,
                // it could change based on the fact that we are omitting
                // epsilon states here.
            }
            thompson::State::Fail => {
                break;
            }
            thompson::State::Match { .. } => {
                // Normally, the NFA match state doesn't actually need to
                // be inside the DFA state. But since we delay matches by
                // one byte, the matching DFA state corresponds to states
                // that transition from the one we're building here. And
                // the way we detect those cases is by looking for an NFA
                // match state. See 'next' for how this is handled.
                builder.add_nfa_state_id(nfa_id);
            }
        }
    }
    // If we know this state contains no look-around assertions, then
    // there's no reason to track which look-around assertions were
    // satisfied when this state was created.
    if builder.look_need().is_empty() {
        builder.look_have().clear();
    }
}

/// Sets the appropriate look-behind assertions on the given state based on
/// this starting configuration.
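///
/// Concretely, this is the mapping applied below:
///
/// ```text
/// Start::NonWordByte => (nothing)
/// Start::WordByte    => the "from word" flag
/// Start::Text        => StartText and StartLine
/// Start::Line        => StartLine
/// ```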
pub(crate) fn set_lookbehind_from_start(
    start: &Start,
    builder: &mut StateBuilderMatches,
) {
    match *start {
        Start::NonWordByte => {}
        Start::WordByte => {
            builder.set_is_from_word();
        }
        Start::Text => {
            builder.look_have().insert(Look::StartText);
            builder.look_have().insert(Look::StartLine);
        }
        Start::Line => {
            builder.look_have().insert(Look::StartLine);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::Start;

    #[test]
    #[should_panic]
    fn start_fwd_bad_range() {
        Start::from_position_fwd(&[], 0, 1);
    }

    #[test]
    #[should_panic]
    fn start_rev_bad_range() {
        Start::from_position_rev(&[], 0, 1);
    }

    #[test]
    fn start_fwd() {
        let f = Start::from_position_fwd;

        assert_eq!(Start::Text, f(&[], 0, 0));
        assert_eq!(Start::Text, f(b"abc", 0, 3));
        assert_eq!(Start::Text, f(b"\nabc", 0, 3));

        assert_eq!(Start::Line, f(b"\nabc", 1, 3));

        assert_eq!(Start::WordByte, f(b"abc", 1, 3));

        assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
    }

    #[test]
    fn start_rev() {
        let f = Start::from_position_rev;

        assert_eq!(Start::Text, f(&[], 0, 0));
        assert_eq!(Start::Text, f(b"abc", 0, 3));
        assert_eq!(Start::Text, f(b"abc\n", 0, 4));

        assert_eq!(Start::Line, f(b"abc\nz", 0, 3));

        assert_eq!(Start::WordByte, f(b"abc", 0, 2));

        assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
    }
}