2 This module provides an NFA compiler using Thompson's construction
3 algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
4 graph as output. The NFA graph is structured in a way that permits it to be
5 executed by a virtual machine and also used to efficiently build a DFA.
7 The compiler deals with a slightly expanded set of NFA states that notably
8 includes an empty node that has exactly one epsilon transition to the next
9 state. In other words, it's a "goto" instruction if one views Thompson's NFA
10 as a set of bytecode instructions. These goto instructions are removed in
11 a subsequent phase before returning the NFA to the caller. The purpose of
12 these empty nodes is that they make the construction algorithm substantially
13 simpler to implement. We remove them before returning to the caller because
14 they can represent substantial overhead when traversing the NFA graph
15 (either while searching using the NFA directly or while building a DFA).
17 In the future, it would be nice to provide a Glushkov compiler as well,
18 as it would work well as a bit-parallel NFA for smaller regexes. But
19 the Thompson construction is one I'm more familiar with and seems more
20 straight-forward to deal with when it comes to large Unicode character
23 Internally, the compiler uses interior mutability to improve composition
24 in the face of the borrow checker. In particular, we'd really like to be
25 able to write things like this:
27 self.c_concat(exprs.iter().map(|e| self.c(e)))
29 Which elegantly uses iterators to build up a sequence of compiled regex
30 sub-expressions and then hands it off to the concatenating compiler
31 routine. Without interior mutability, the borrow checker won't let us
32 borrow `self` mutably both inside and outside the closure at the same
38 cell
::{Cell, RefCell}
,
42 use alloc
::{sync::Arc, vec, vec::Vec}
;
45 hir
::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary}
,
46 utf8
::{Utf8Range, Utf8Sequences}
,
53 map
::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}
,
54 range_trie
::RangeTrie
,
55 Look
, SparseTransitions
, State
, Transition
, NFA
,
58 alphabet
::ByteClassSet
,
59 id
::{IteratorIDExt, PatternID, StateID}
,
63 /// The configuration used for compiling a Thompson NFA from a regex pattern.
64 #[derive(Clone, Copy, Debug, Default)]
66 reverse
: Option
<bool
>,
68 nfa_size_limit
: Option
<Option
<usize>>,
70 captures
: Option
<bool
>,
72 unanchored_prefix
: Option
<bool
>,
76 /// Return a new default Thompson NFA compiler configuration.
77 pub fn new() -> Config
{
83 /// A NFA reversal is performed by reversing all of the concatenated
84 /// sub-expressions in the original pattern, recursively. The resulting
85 /// NFA can be used to match the pattern starting from the end of a string
86 /// instead of the beginning of a string.
88 /// Reversing the NFA is useful for building a reverse DFA, which is most
89 /// useful for finding the start of a match after its ending position has
92 /// This is disabled by default.
93 pub fn reverse(mut self, yes
: bool
) -> Config
{
94 self.reverse
= Some(yes
);
98 /// Whether to enable UTF-8 mode or not.
100 /// When UTF-8 mode is enabled (which is the default), unanchored searches
101 /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
102 /// an unanchored search will stop at that point. This is equivalent to
103 /// putting a `(?s:.)*?` at the start of the regex.
105 /// When UTF-8 mode is disabled, then unanchored searches will match
106 /// through any arbitrary byte. This is equivalent to putting a
107 /// `(?s-u:.)*?` at the start of the regex.
109 /// Generally speaking, UTF-8 mode should only be used when you know you
110 /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
111 /// on input that is not valid UTF-8, then the regex is not likely to work
114 /// This is enabled by default.
115 pub fn utf8(mut self, yes
: bool
) -> Config
{
116 self.utf8
= Some(yes
);
120 /// Sets an approximate size limit on the total heap used by the NFA being
123 /// This permits imposing constraints on the size of a compiled NFA. This
124 /// may be useful in contexts where the regex pattern is untrusted and one
125 /// wants to avoid using too much memory.
127 /// This size limit does not apply to auxiliary heap used during
128 /// compilation that is not part of the built NFA.
130 /// Note that this size limit is applied during compilation in order for
131 /// the limit to prevent too much heap from being used. However, the
132 /// implementation may use an intermediate NFA representation that is
133 /// otherwise slightly bigger than the final public form. Since the size
134 /// limit may be applied to an intermediate representation, there is not
135 /// necessarily a precise correspondence between the configured size limit
136 /// and the heap usage of the final NFA.
138 /// There is no size limit by default.
142 /// This example demonstrates how Unicode mode can greatly increase the
146 /// use regex_automata::nfa::thompson::NFA;
148 /// // 300KB isn't enough!
150 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
151 /// .build(r"\w{20}")
154 /// // ... but 400KB probably is.
155 /// let nfa = NFA::builder()
156 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
157 /// .build(r"\w{20}")?;
159 /// assert_eq!(nfa.pattern_len(), 1);
161 /// # Ok::<(), Box<dyn std::error::Error>>(())
163 pub fn nfa_size_limit(mut self, bytes
: Option
<usize>) -> Config
{
164 self.nfa_size_limit
= Some(bytes
);
168 /// Apply best effort heuristics to shrink the NFA at the expense of more
171 /// This is enabled by default. Generally speaking, if one is using an NFA
172 /// to compile a DFA, then the extra time used to shrink the NFA will be
173 /// more than made up for during DFA construction (potentially by a lot).
174 /// In other words, enabling this can substantially decrease the overall
175 /// amount of time it takes to build a DFA.
177 /// The only reason to disable this if you want to compile an NFA and start
178 /// using it as quickly as possible without needing to build a DFA. e.g.,
179 /// for an NFA simulation or for a lazy DFA.
181 /// This is enabled by default.
182 pub fn shrink(mut self, yes
: bool
) -> Config
{
183 self.shrink
= Some(yes
);
187 /// Whether to include 'Capture' states in the NFA.
189 /// This can only be enabled when compiling a forward NFA. This is
190 /// always disabled---with no way to override it---when the `reverse`
191 /// configuration is enabled.
193 /// This is enabled by default.
194 pub fn captures(mut self, yes
: bool
) -> Config
{
195 self.captures
= Some(yes
);
199 /// Whether to compile an unanchored prefix into this NFA.
201 /// This is enabled by default. It is made available for tests only to make
202 /// it easier to unit test the output of the compiler.
204 fn unanchored_prefix(mut self, yes
: bool
) -> Config
{
205 self.unanchored_prefix
= Some(yes
);
209 pub fn get_reverse(&self) -> bool
{
210 self.reverse
.unwrap_or(false)
213 pub fn get_utf8(&self) -> bool
{
214 self.utf8
.unwrap_or(true)
217 pub fn get_nfa_size_limit(&self) -> Option
<usize> {
218 self.nfa_size_limit
.unwrap_or(None
)
221 pub fn get_shrink(&self) -> bool
{
222 self.shrink
.unwrap_or(true)
225 pub fn get_captures(&self) -> bool
{
226 !self.get_reverse() && self.captures
.unwrap_or(true)
229 fn get_unanchored_prefix(&self) -> bool
{
232 self.unanchored_prefix
.unwrap_or(true)
240 pub(crate) fn overwrite(self, o
: Config
) -> Config
{
242 reverse
: o
.reverse
.or(self.reverse
),
243 utf8
: o
.utf8
.or(self.utf8
),
244 nfa_size_limit
: o
.nfa_size_limit
.or(self.nfa_size_limit
),
245 shrink
: o
.shrink
.or(self.shrink
),
246 captures
: o
.captures
.or(self.captures
),
248 unanchored_prefix
: o
.unanchored_prefix
.or(self.unanchored_prefix
),
253 /// A builder for compiling an NFA.
254 #[derive(Clone, Debug)]
257 parser
: ParserBuilder
,
261 /// Create a new NFA builder with its default configuration.
262 pub fn new() -> Builder
{
263 Builder { config: Config::default(), parser: ParserBuilder::new() }
266 /// Compile the given regular expression into an NFA.
268 /// If there was a problem parsing the regex, then that error is returned.
270 /// Otherwise, if there was a problem building the NFA, then an error is
271 /// returned. The only error that can occur is if the compiled regex would
272 /// exceed the size limits configured on this builder.
273 pub fn build(&self, pattern
: &str) -> Result
<NFA
, Error
> {
274 self.build_many(&[pattern
])
277 pub fn build_many
<P
: AsRef
<str>>(
280 ) -> Result
<NFA
, Error
> {
281 let mut hirs
= vec
![];
287 .map_err(Error
::syntax
)?
,
289 log
!(log
::trace
!("parsed: {:?}", p
.as_ref()));
291 self.build_many_from_hir(&hirs
)
294 /// Compile the given high level intermediate representation of a regular
295 /// expression into an NFA.
297 /// If there was a problem building the NFA, then an error is returned. The
298 /// only error that can occur is if the compiled regex would exceed the
299 /// size limits configured on this builder.
300 pub fn build_from_hir(&self, expr
: &Hir
) -> Result
<NFA
, Error
> {
301 self.build_from_hir_with(&mut Compiler
::new(), expr
)
304 pub fn build_many_from_hir
<H
: Borrow
<Hir
>>(
307 ) -> Result
<NFA
, Error
> {
308 self.build_many_from_hir_with(&mut Compiler
::new(), exprs
)
311 /// Compile the given high level intermediate representation of a regular
312 /// expression into the NFA given using the given compiler. Callers may
313 /// prefer this over `build` if they would like to reuse allocations while
314 /// compiling many regular expressions.
316 /// On success, the given NFA is completely overwritten with the NFA
317 /// produced by the compiler.
319 /// If there was a problem building the NFA, then an error is returned.
320 /// The only error that can occur is if the compiled regex would exceed
321 /// the size limits configured on this builder. When an error is returned,
322 /// the contents of `nfa` are unspecified and should not be relied upon.
323 /// However, it can still be reused in subsequent calls to this method.
324 fn build_from_hir_with(
326 compiler
: &mut Compiler
,
328 ) -> Result
<NFA
, Error
> {
329 self.build_many_from_hir_with(compiler
, &[expr
])
332 fn build_many_from_hir_with
<H
: Borrow
<Hir
>>(
334 compiler
: &mut Compiler
,
336 ) -> Result
<NFA
, Error
> {
337 compiler
.configure(self.config
);
338 compiler
.compile(exprs
)
341 /// Apply the given NFA configuration options to this builder.
342 pub fn configure(&mut self, config
: Config
) -> &mut Builder
{
343 self.config
= self.config
.overwrite(config
);
347 /// Set the syntax configuration for this builder using
348 /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
350 /// This permits setting things like case insensitivity, Unicode and multi
353 /// This syntax configuration generally only applies when an NFA is built
354 /// directly from a pattern string. If an NFA is built from an HIR, then
355 /// all syntax settings are ignored.
358 config
: crate::util
::syntax
::SyntaxConfig
,
360 config
.apply(&mut self.parser
);
365 /// A compiler that converts a regex abstract syntax to an NFA via Thompson's
366 /// construction. Namely, this compiler permits epsilon transitions between
368 #[derive(Clone, Debug)]
369 pub struct Compiler
{
370 /// The configuration from the builder.
372 /// The final NFA that is built.
374 /// Parts of this NFA are constructed during compilation, but the actual
375 /// states aren't added until a final "finish" step. This is because the
376 /// states constructed during compilation have unconditional epsilon
377 /// transitions, which makes the logic of compilation much simpler. The
378 /// "finish" step removes these unconditional epsilon transitions and must
379 /// therefore remap all of the transition state IDs.
381 /// The set of compiled NFA states. Once a state is compiled, it is
382 /// assigned a state ID equivalent to its index in this list. Subsequent
383 /// compilation can modify previous states by adding new transitions.
384 states
: RefCell
<Vec
<CState
>>,
385 /// State used for compiling character classes to UTF-8 byte automata.
386 /// State is not retained between character class compilations. This just
387 /// serves to amortize allocation to the extent possible.
388 utf8_state
: RefCell
<Utf8State
>,
389 /// State used for arranging character classes in reverse into a trie.
390 trie_state
: RefCell
<RangeTrie
>,
391 /// State used for caching common suffixes when compiling reverse UTF-8
392 /// automata (for Unicode character classes).
393 utf8_suffix
: RefCell
<Utf8SuffixMap
>,
394 /// A map used to re-map state IDs when translating the compiler's internal
395 /// NFA state representation to the external NFA representation.
396 remap
: RefCell
<Vec
<StateID
>>,
397 /// A set of compiler internal state IDs that correspond to states that are
398 /// exclusively epsilon transitions, i.e., goto instructions, combined with
399 /// the state that they point to. This is used to record said states while
400 /// transforming the compiler's internal NFA representation to the external
402 empties
: RefCell
<Vec
<(StateID
, StateID
)>>,
403 /// The total memory used by each of the 'CState's in 'states'. This only
404 /// includes heap usage by each state, and not the size of the state
406 memory_cstates
: Cell
<usize>,
409 /// A compiler intermediate state representation for an NFA that is only used
410 /// during compilation. Once compilation is done, `CState`s are converted
411 /// to `State`s (defined in the parent module), which have a much simpler
413 #[derive(Clone, Debug, Eq, PartialEq)]
415 /// An empty state whose only purpose is to forward the automaton to
416 /// another state via en epsilon transition. These are useful during
417 /// compilation but are otherwise removed at the end.
421 /// An empty state that records a capture location.
423 /// From the perspective of finite automata, this is precisely equivalent
424 /// to 'Empty', but serves the purpose of instructing NFA simulations to
425 /// record additional state when the finite state machine passes through
426 /// this epsilon transition.
428 /// These transitions are treated as epsilon transitions with no additional
431 /// 'slot' in this context refers to the specific capture group offset that
432 /// is being recorded. Each capturing group has two slots corresponding to
433 /// the start and end of the matching portion of that group.
437 name
: Option
<Arc
<str>>,
443 /// A state that only transitions to `next` if the current input byte is
444 /// in the range `[start, end]` (inclusive on both ends).
448 /// A state with possibly many transitions, represented in a sparse
449 /// fashion. Transitions are ordered lexicographically by input range.
450 /// As such, this may only be used when every transition has equal
451 /// priority. (In practice, this is only used for encoding large UTF-8
452 /// automata.) In contrast, a `Union` state has each alternate in order
453 /// of priority. Priority is used to implement greedy matching and also
454 /// alternations themselves, e.g., `abc|a` where `abc` has priority over
457 /// To clarify, it is possible to remove `Sparse` and represent all things
458 /// that `Sparse` is used for via `Union`. But this creates a more bloated
459 /// NFA with more epsilon transitions than is necessary in the special case
460 /// of character classes.
462 ranges
: Vec
<Transition
>,
464 /// A conditional epsilon transition satisfied via some sort of
470 /// An alternation such that there exists an epsilon transition to all
471 /// states in `alternates`, where matches found via earlier transitions
472 /// are preferred over later transitions.
474 alternates
: Vec
<StateID
>,
476 /// An alternation such that there exists an epsilon transition to all
477 /// states in `alternates`, where matches found via later transitions are
478 /// preferred over earlier transitions.
480 /// This "reverse" state exists for convenience during compilation that
481 /// permits easy construction of non-greedy combinations of NFA states. At
482 /// the end of compilation, Union and UnionReverse states are merged into
483 /// one Union type of state, where the latter has its epsilon transitions
484 /// reversed to reflect the priority inversion.
486 /// The "convenience" here arises from the fact that as new states are
487 /// added to the list of `alternates`, we would like that add operation
488 /// to be amortized constant time. But if we used a `Union`, we'd need to
489 /// prepend the state, which takes O(n) time. There are other approaches we
490 /// could use to solve this, but this seems simple enough.
492 alternates
: Vec
<StateID
>,
494 /// A match state. There is at most one such occurrence of this state in
495 /// an NFA for each pattern compiled into the NFA. At time of writing, a
496 /// match state is always produced for every pattern given, but in theory,
497 /// if a pattern can never lead to a match, then the match state could be
500 /// `id` refers to the ID of the pattern itself, which corresponds to the
501 /// pattern's index (starting at 0). `start_id` refers to the anchored
502 /// NFA starting state corresponding to this pattern.
504 pattern_id
: PatternID
,
509 /// A value that represents the result of compiling a sub-expression of a
510 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
511 /// has an initial state at `start` and a final state at `end`.
512 #[derive(Clone, Copy, Debug)]
513 pub struct ThompsonRef
{
519 /// Create a new compiler.
520 pub fn new() -> Compiler
{
522 config
: Config
::default(),
523 nfa
: RefCell
::new(NFA
::empty()),
524 states
: RefCell
::new(vec
![]),
525 utf8_state
: RefCell
::new(Utf8State
::new()),
526 trie_state
: RefCell
::new(RangeTrie
::new()),
527 utf8_suffix
: RefCell
::new(Utf8SuffixMap
::new(1000)),
528 remap
: RefCell
::new(vec
![]),
529 empties
: RefCell
::new(vec
![]),
530 memory_cstates
: Cell
::new(0),
534 /// Configure and prepare this compiler from the builder's knobs.
536 /// The compiler is must always reconfigured by the builder before using it
537 /// to build an NFA. Namely, this will also clear any latent state in the
538 /// compiler used during previous compilations.
539 fn configure(&mut self, config
: Config
) {
540 self.config
= config
;
541 self.nfa
.borrow_mut().clear();
542 self.states
.borrow_mut().clear();
543 self.memory_cstates
.set(0);
544 // We don't need to clear anything else since they are cleared on
545 // their own and only when they are used.
548 /// Convert the current intermediate NFA to its final compiled form.
549 fn compile
<H
: Borrow
<Hir
>>(&self, exprs
: &[H
]) -> Result
<NFA
, Error
> {
550 if exprs
.is_empty() {
551 return Ok(NFA
::never_match());
553 if exprs
.len() > PatternID
::LIMIT
{
554 return Err(Error
::too_many_patterns(exprs
.len()));
557 // We always add an unanchored prefix unless we were specifically told
558 // not to (for tests only), or if we know that the regex is anchored
559 // for all matches. When an unanchored prefix is not added, then the
560 // NFA's anchored and unanchored start states are equivalent.
562 exprs
.iter().all(|e
| e
.borrow().is_anchored_start());
563 let anchored
= !self.config
.get_unanchored_prefix() || all_anchored
;
564 let unanchored_prefix
= if anchored
{
567 if self.config
.get_utf8() {
568 self.c_unanchored_prefix_valid_utf8()?
570 self.c_unanchored_prefix_invalid_utf8()?
574 let compiled
= self.c_alternation(
575 exprs
.iter().with_pattern_ids().map(|(pid
, e
)| {
576 let group_kind
= hir
::GroupKind
::CaptureIndex(0);
577 let one
= self.c_group(&group_kind
, e
.borrow())?
;
578 let match_state_id
= self.add_match(pid
, one
.start
)?
;
579 self.patch(one
.end
, match_state_id
)?
;
580 Ok(ThompsonRef { start: one.start, end: match_state_id }
)
583 self.patch(unanchored_prefix
.end
, compiled
.start
)?
;
584 self.finish(compiled
.start
, unanchored_prefix
.start
)?
;
585 Ok(self.nfa
.replace(NFA
::empty()))
588 /// Finishes the compilation process and populates the NFA attached to this
589 /// compiler with the final graph.
592 start_anchored
: StateID
,
593 start_unanchored
: StateID
,
594 ) -> Result
<(), Error
> {
596 "intermediate NFA compilation complete, \
597 intermediate NFA size: {} states, {} bytes on heap",
598 self.states
.borrow().len(),
599 self.nfa_memory_usage(),
601 let mut nfa
= self.nfa
.borrow_mut();
602 let mut bstates
= self.states
.borrow_mut();
603 let mut remap
= self.remap
.borrow_mut();
604 let mut empties
= self.empties
.borrow_mut();
605 remap
.resize(bstates
.len(), StateID
::ZERO
);
608 // The idea here is to convert our intermediate states to their final
609 // form. The only real complexity here is the process of converting
610 // transitions, which are expressed in terms of state IDs. The new
611 // set of states will be smaller because of partial epsilon removal,
612 // so the state IDs will not be the same.
613 for (sid
, bstate
) in bstates
.iter_mut().with_state_ids() {
615 CState
::Empty { next }
=> {
616 // Since we're removing empty states, we need to handle
617 // them later since we don't yet know which new state this
618 // empty state will be mapped to.
619 empties
.push((sid
, next
));
621 CState
::CaptureStart { next, capture_index, ref name }
=> {
622 // We can't remove this empty state because of the side
623 // effect of capturing an offset for this capture slot.
624 remap
[sid
] = nfa
.add_capture_start(
630 CState
::CaptureEnd { next, capture_index }
=> {
631 // We can't remove this empty state because of the side
632 // effect of capturing an offset for this capture slot.
633 remap
[sid
] = nfa
.add_capture_end(next
, capture_index
)?
;
635 CState
::Range { range }
=> {
636 remap
[sid
] = nfa
.add_range(range
)?
;
638 CState
::Sparse { ref mut ranges }
=> {
640 mem
::replace(ranges
, vec
![]).into_boxed_slice();
642 nfa
.add_sparse(SparseTransitions { ranges }
)?
;
644 CState
::Look { look, next }
=> {
645 remap
[sid
] = nfa
.add_look(next
, look
)?
;
647 CState
::Union { ref mut alternates }
=> {
649 mem
::replace(alternates
, vec
![]).into_boxed_slice();
650 remap
[sid
] = nfa
.add_union(alternates
)?
;
652 CState
::UnionReverse { ref mut alternates }
=> {
654 mem
::replace(alternates
, vec
![]).into_boxed_slice();
655 alternates
.reverse();
656 remap
[sid
] = nfa
.add_union(alternates
)?
;
658 CState
::Match { start_id, .. }
=> {
659 remap
[sid
] = nfa
.add_match()?
;
660 nfa
.finish_pattern(start_id
)?
;
664 for &(empty_id
, mut empty_next
) in empties
.iter() {
665 // empty states can point to other empty states, forming a chain.
666 // So we must follow the chain until the end, which must end at
667 // a non-empty state, and therefore, a state that is correctly
668 // remapped. We are guaranteed to terminate because our compiler
669 // never builds a loop among only empty states.
670 while let CState
::Empty { next }
= bstates
[empty_next
] {
673 remap
[empty_id
] = remap
[empty_next
];
675 nfa
.set_start_anchored(start_anchored
);
676 nfa
.set_start_unanchored(start_unanchored
);
679 "final NFA (reverse? {:?}) compilation complete, \
680 final NFA size: {} states, {} bytes on heap",
681 self.config
.get_reverse(),
688 fn c(&self, expr
: &Hir
) -> Result
<ThompsonRef
, Error
> {
690 HirKind
::Empty
=> self.c_empty(),
691 HirKind
::Literal(Literal
::Unicode(ch
)) => self.c_char(ch
),
692 HirKind
::Literal(Literal
::Byte(b
)) => self.c_range(b
, b
),
693 HirKind
::Class(Class
::Bytes(ref c
)) => self.c_byte_class(c
),
694 HirKind
::Class(Class
::Unicode(ref c
)) => self.c_unicode_class(c
),
695 HirKind
::Anchor(ref anchor
) => self.c_anchor(anchor
),
696 HirKind
::WordBoundary(ref wb
) => self.c_word_boundary(wb
),
697 HirKind
::Repetition(ref rep
) => self.c_repetition(rep
),
698 HirKind
::Group(ref group
) => self.c_group(&group
.kind
, &group
.hir
),
699 HirKind
::Concat(ref es
) => {
700 self.c_concat(es
.iter().map(|e
| self.c(e
)))
702 HirKind
::Alternation(ref es
) => {
703 self.c_alternation(es
.iter().map(|e
| self.c(e
)))
708 fn c_concat
<I
>(&self, mut it
: I
) -> Result
<ThompsonRef
, Error
>
710 I
: DoubleEndedIterator
<Item
= Result
<ThompsonRef
, Error
>>,
712 let first
= if self.is_reverse() { it.next_back() }
else { it.next() }
;
713 let ThompsonRef { start, mut end }
= match first
{
714 Some(result
) => result?
,
715 None
=> return self.c_empty(),
719 if self.is_reverse() { it.next_back() }
else { it.next() }
;
720 let compiled
= match next
{
721 Some(result
) => result?
,
724 self.patch(end
, compiled
.start
)?
;
727 Ok(ThompsonRef { start, end }
)
730 fn c_alternation
<I
>(&self, mut it
: I
) -> Result
<ThompsonRef
, Error
>
732 I
: Iterator
<Item
= Result
<ThompsonRef
, Error
>>,
734 let first
= it
.next().expect("alternations must be non-empty")?
;
735 let second
= match it
.next() {
736 None
=> return Ok(first
),
737 Some(result
) => result?
,
740 let union = self.add_union()?
;
741 let end
= self.add_empty()?
;
742 self.patch(union, first
.start
)?
;
743 self.patch(first
.end
, end
)?
;
744 self.patch(union, second
.start
)?
;
745 self.patch(second
.end
, end
)?
;
747 let compiled
= result?
;
748 self.patch(union, compiled
.start
)?
;
749 self.patch(compiled
.end
, end
)?
;
751 Ok(ThompsonRef { start: union, end }
)
756 kind
: &hir
::GroupKind
,
758 ) -> Result
<ThompsonRef
, Error
> {
759 if !self.config
.get_captures() {
762 let (capi
, name
) = match *kind
{
763 hir
::GroupKind
::NonCapturing
=> return self.c(expr
),
764 hir
::GroupKind
::CaptureIndex(index
) => (index
, None
),
765 hir
::GroupKind
::CaptureName { ref name, index }
=> {
766 (index
, Some(Arc
::from(&**name
)))
770 let start
= self.add_capture_start(capi
, name
)?
;
771 let inner
= self.c(expr
)?
;
772 let end
= self.add_capture_end(capi
)?
;
774 self.patch(start
, inner
.start
)?
;
775 self.patch(inner
.end
, end
)?
;
776 Ok(ThompsonRef { start, end }
)
781 rep
: &hir
::Repetition
,
782 ) -> Result
<ThompsonRef
, Error
> {
784 hir
::RepetitionKind
::ZeroOrOne
=> {
785 self.c_zero_or_one(&rep
.hir
, rep
.greedy
)
787 hir
::RepetitionKind
::ZeroOrMore
=> {
788 self.c_at_least(&rep
.hir
, rep
.greedy
, 0)
790 hir
::RepetitionKind
::OneOrMore
=> {
791 self.c_at_least(&rep
.hir
, rep
.greedy
, 1)
793 hir
::RepetitionKind
::Range(ref rng
) => match *rng
{
794 hir
::RepetitionRange
::Exactly(count
) => {
795 self.c_exactly(&rep
.hir
, count
)
797 hir
::RepetitionRange
::AtLeast(m
) => {
798 self.c_at_least(&rep
.hir
, rep
.greedy
, m
)
800 hir
::RepetitionRange
::Bounded(min
, max
) => {
801 self.c_bounded(&rep
.hir
, rep
.greedy
, min
, max
)
813 ) -> Result
<ThompsonRef
, Error
> {
814 let prefix
= self.c_exactly(expr
, min
)?
;
819 // It is tempting here to compile the rest here as a concatenation
820 // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
821 // were `aaa?a?a?`. The problem here is that it leads to this program:
825 // 000002: union(03, 04)
827 // 000004: union(05, 06)
829 // 000006: union(07, 08)
833 // And effectively, once you hit state 2, the epsilon closure will
834 // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
835 // to instead compile it like so:
839 // 000002: union(03, 08)
841 // 000004: union(05, 08)
843 // 000006: union(07, 08)
847 // So that the epsilon closure of state 2 is now just 3 and 8.
848 let empty
= self.add_empty()?
;
849 let mut prev_end
= prefix
.end
;
851 let union = if greedy
{
854 self.add_reverse_union()
856 let compiled
= self.c(expr
)?
;
857 self.patch(prev_end
, union)?
;
858 self.patch(union, compiled
.start
)?
;
859 self.patch(union, empty
)?
;
860 prev_end
= compiled
.end
;
862 self.patch(prev_end
, empty
)?
;
863 Ok(ThompsonRef { start: prefix.start, end: empty }
)
871 ) -> Result
<ThompsonRef
, Error
> {
873 // When the expression cannot match the empty string, then we
874 // can get away with something much simpler: just one 'alt'
875 // instruction that optionally repeats itself. But if the expr
876 // can match the empty string... see below.
877 if !expr
.is_match_empty() {
878 let union = if greedy
{
881 self.add_reverse_union()
883 let compiled
= self.c(expr
)?
;
884 self.patch(union, compiled
.start
)?
;
885 self.patch(compiled
.end
, union)?
;
886 return Ok(ThompsonRef { start: union, end: union }
);
889 // What's going on here? Shouldn't x* be simpler than this? It
890 // turns out that when implementing leftmost-first (Perl-like)
891 // match semantics, x* results in an incorrect preference order
892 // when computing the transitive closure of states if and only if
893 // 'x' can match the empty string. So instead, we compile x* as
894 // (x+)?, which preserves the correct preference order.
896 // See: https://github.com/rust-lang/regex/issues/779
897 let compiled
= self.c(expr
)?
;
898 let plus
= if greedy
{
901 self.add_reverse_union()
903 self.patch(compiled
.end
, plus
)?
;
904 self.patch(plus
, compiled
.start
)?
;
906 let question
= if greedy
{
909 self.add_reverse_union()
911 let empty
= self.add_empty()?
;
912 self.patch(question
, compiled
.start
)?
;
913 self.patch(question
, empty
)?
;
914 self.patch(plus
, empty
)?
;
915 Ok(ThompsonRef { start: question, end: empty }
)
917 let compiled
= self.c(expr
)?
;
918 let union = if greedy
{
921 self.add_reverse_union()
923 self.patch(compiled
.end
, union)?
;
924 self.patch(union, compiled
.start
)?
;
925 Ok(ThompsonRef { start: compiled.start, end: union }
)
927 let prefix
= self.c_exactly(expr
, n
- 1)?
;
928 let last
= self.c(expr
)?
;
929 let union = if greedy
{
932 self.add_reverse_union()
934 self.patch(prefix
.end
, last
.start
)?
;
935 self.patch(last
.end
, union)?
;
936 self.patch(union, last
.start
)?
;
937 Ok(ThompsonRef { start: prefix.start, end: union }
)
945 ) -> Result
<ThompsonRef
, Error
> {
947 if greedy { self.add_union() }
else { self.add_reverse_union() }?
;
948 let compiled
= self.c(expr
)?
;
949 let empty
= self.add_empty()?
;
950 self.patch(union, compiled
.start
)?
;
951 self.patch(union, empty
)?
;
952 self.patch(compiled
.end
, empty
)?
;
953 Ok(ThompsonRef { start: union, end: empty }
)
956 fn c_exactly(&self, expr
: &Hir
, n
: u32) -> Result
<ThompsonRef
, Error
> {
957 let it
= (0..n
).map(|_
| self.c(expr
));
963 cls
: &hir
::ClassBytes
,
964 ) -> Result
<ThompsonRef
, Error
> {
965 let end
= self.add_empty()?
;
966 let mut trans
= Vec
::with_capacity(cls
.ranges().len());
967 for r
in cls
.iter() {
968 trans
.push(Transition
{
974 Ok(ThompsonRef { start: self.add_sparse(trans)?, end }
)
979 cls
: &hir
::ClassUnicode
,
980 ) -> Result
<ThompsonRef
, Error
> {
981 // If all we have are ASCII ranges wrapped in a Unicode package, then
982 // there is zero reason to bring out the big guns. We can fit all ASCII
983 // ranges within a single sparse state.
984 if cls
.is_all_ascii() {
985 let end
= self.add_empty()?
;
986 let mut trans
= Vec
::with_capacity(cls
.ranges().len());
987 for r
in cls
.iter() {
988 assert
!(r
.start() <= '
\x7F'
);
989 assert
!(r
.end() <= '
\x7F'
);
990 trans
.push(Transition
{
991 start
: r
.start() as u8,
996 Ok(ThompsonRef { start: self.add_sparse(trans)?, end }
)
997 } else if self.is_reverse() {
998 if !self.config
.get_shrink() {
999 // When we don't want to spend the extra time shrinking, we
1000 // compile the UTF-8 automaton in reverse using something like
1001 // the "naive" approach, but will attempt to re-use common
1003 self.c_unicode_class_reverse_with_suffix(cls
)
1005 // When we want to shrink our NFA for reverse UTF-8 automata,
1006 // we cannot feed UTF-8 sequences directly to the UTF-8
1007 // compiler, since the UTF-8 compiler requires all sequences
1008 // to be lexicographically sorted. Instead, we organize our
1009 // sequences into a range trie, which can then output our
1010 // sequences in the correct order. Unfortunately, building the
1011 // range trie is fairly expensive (but not nearly as expensive
1012 // as building a DFA). Hence the reason why the 'shrink' option
1013 // exists, so that this path can be toggled off. For example,
1014 // we might want to turn this off if we know we won't be
1016 let mut trie
= self.trie_state
.borrow_mut();
1019 for rng
in cls
.iter() {
1020 for mut seq
in Utf8Sequences
::new(rng
.start(), rng
.end()) {
1022 trie
.insert(seq
.as_slice());
1025 let mut utf8_state
= self.utf8_state
.borrow_mut();
1026 let mut utf8c
= Utf8Compiler
::new(self, &mut *utf8_state
)?
;
1034 // In the forward direction, we always shrink our UTF-8 automata
1035 // because we can stream it right into the UTF-8 compiler. There
1036 // is almost no downside (in either memory or time) to using this
1038 let mut utf8_state
= self.utf8_state
.borrow_mut();
1039 let mut utf8c
= Utf8Compiler
::new(self, &mut *utf8_state
)?
;
1040 for rng
in cls
.iter() {
1041 for seq
in Utf8Sequences
::new(rng
.start(), rng
.end()) {
1042 utf8c
.add(seq
.as_slice())?
;
1048 // For reference, the code below is the "naive" version of compiling a
1049 // UTF-8 automaton. It is deliciously simple (and works for both the
1050 // forward and reverse cases), but will unfortunately produce very
1051 // large NFAs. When compiling a forward automaton, the size difference
1052 // can sometimes be an order of magnitude. For example, the '\w' regex
1053 // will generate about ~3000 NFA states using the naive approach below,
1054 // but only 283 states when using the approach above. This is because
1055 // the approach above actually compiles a *minimal* (or near minimal,
1056 // because of the bounded hashmap for reusing equivalent states) UTF-8
1059 // The code below is kept as a reference point in order to make it
1060 // easier to understand the higher level goal here. Although, it will
1061 // almost certainly bit-rot, so keep that in mind.
1065 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1070 .map(|rng| self.c_range(rng.start, rng.end));
1073 self.c_alternation(it)
1077 fn c_unicode_class_reverse_with_suffix(
1079 cls
: &hir
::ClassUnicode
,
1080 ) -> Result
<ThompsonRef
, Error
> {
1081 // N.B. It would likely be better to cache common *prefixes* in the
1082 // reverse direction, but it's not quite clear how to do that. The
1083 // advantage of caching suffixes is that it does give us a win, and
1084 // has a very small additional overhead.
1085 let mut cache
= self.utf8_suffix
.borrow_mut();
1088 let union = self.add_union()?
;
1089 let alt_end
= self.add_empty()?
;
1090 for urng
in cls
.iter() {
1091 for seq
in Utf8Sequences
::new(urng
.start(), urng
.end()) {
1092 let mut end
= alt_end
;
1093 for brng
in seq
.as_slice() {
1094 let key
= Utf8SuffixKey
{
1099 let hash
= cache
.hash(&key
);
1100 if let Some(id
) = cache
.get(&key
, hash
) {
1105 let compiled
= self.c_range(brng
.start
, brng
.end
)?
;
1106 self.patch(compiled
.end
, end
)?
;
1107 end
= compiled
.start
;
1108 cache
.set(key
, hash
, end
);
1110 self.patch(union, end
)?
;
1113 Ok(ThompsonRef { start: union, end: alt_end }
)
1116 fn c_anchor(&self, anchor
: &Anchor
) -> Result
<ThompsonRef
, Error
> {
1117 let look
= match *anchor
{
1118 Anchor
::StartLine
=> Look
::StartLine
,
1119 Anchor
::EndLine
=> Look
::EndLine
,
1120 Anchor
::StartText
=> Look
::StartText
,
1121 Anchor
::EndText
=> Look
::EndText
,
1123 let id
= self.add_look(look
)?
;
1124 Ok(ThompsonRef { start: id, end: id }
)
1130 ) -> Result
<ThompsonRef
, Error
> {
1131 let look
= match *wb
{
1132 WordBoundary
::Unicode
=> Look
::WordBoundaryUnicode
,
1133 WordBoundary
::UnicodeNegate
=> Look
::WordBoundaryUnicodeNegate
,
1134 WordBoundary
::Ascii
=> Look
::WordBoundaryAscii
,
1135 WordBoundary
::AsciiNegate
=> Look
::WordBoundaryAsciiNegate
,
1137 let id
= self.add_look(look
)?
;
1138 Ok(ThompsonRef { start: id, end: id }
)
1141 fn c_char(&self, ch
: char) -> Result
<ThompsonRef
, Error
> {
1142 let mut buf
= [0; 4];
1144 .encode_utf8(&mut buf
)
1147 .map(|&b
| self.c_range(b
, b
));
1151 fn c_range(&self, start
: u8, end
: u8) -> Result
<ThompsonRef
, Error
> {
1152 let id
= self.add_range(start
, end
)?
;
1153 Ok(ThompsonRef { start: id, end: id }
)
1156 fn c_empty(&self) -> Result
<ThompsonRef
, Error
> {
1157 let id
= self.add_empty()?
;
1158 Ok(ThompsonRef { start: id, end: id }
)
1161 fn c_unanchored_prefix_valid_utf8(&self) -> Result
<ThompsonRef
, Error
> {
1162 self.c_at_least(&Hir
::any(false), false, 0)
1165 fn c_unanchored_prefix_invalid_utf8(&self) -> Result
<ThompsonRef
, Error
> {
1166 self.c_at_least(&Hir
::any(true), false, 0)
1169 fn patch(&self, from
: StateID
, to
: StateID
) -> Result
<(), Error
> {
1170 let old_memory_cstates
= self.memory_cstates
.get();
1171 match self.states
.borrow_mut()[from
] {
1172 CState
::Empty { ref mut next }
=> {
1175 CState
::Range { ref mut range }
=> {
1178 CState
::Sparse { .. }
=> {
1179 panic
!("cannot patch from a sparse NFA state")
1181 CState
::Look { ref mut next, .. }
=> {
1184 CState
::Union { ref mut alternates }
=> {
1185 alternates
.push(to
);
1187 .set(old_memory_cstates
+ mem
::size_of
::<StateID
>());
1189 CState
::UnionReverse { ref mut alternates }
=> {
1190 alternates
.push(to
);
1192 .set(old_memory_cstates
+ mem
::size_of
::<StateID
>());
1194 CState
::CaptureStart { ref mut next, .. }
=> {
1197 CState
::CaptureEnd { ref mut next, .. }
=> {
1200 CState
::Match { .. }
=> {}
1202 if old_memory_cstates
!= self.memory_cstates
.get() {
1203 self.check_nfa_size_limit()?
;
1208 fn add_empty(&self) -> Result
<StateID
, Error
> {
1209 self.add_state(CState
::Empty { next: StateID::ZERO }
)
1212 fn add_capture_start(
1215 name
: Option
<Arc
<str>>,
1216 ) -> Result
<StateID
, Error
> {
1217 self.add_state(CState
::CaptureStart
{
1218 next
: StateID
::ZERO
,
1224 fn add_capture_end(&self, capture_index
: u32) -> Result
<StateID
, Error
> {
1225 self.add_state(CState
::CaptureEnd
{
1226 next
: StateID
::ZERO
,
1231 fn add_range(&self, start
: u8, end
: u8) -> Result
<StateID
, Error
> {
1232 let trans
= Transition { start, end, next: StateID::ZERO }
;
1233 self.add_state(CState
::Range { range: trans }
)
1236 fn add_sparse(&self, ranges
: Vec
<Transition
>) -> Result
<StateID
, Error
> {
1237 if ranges
.len() == 1 {
1238 self.add_state(CState
::Range { range: ranges[0] }
)
1240 self.add_state(CState
::Sparse { ranges }
)
1244 fn add_look(&self, mut look
: Look
) -> Result
<StateID
, Error
> {
1245 if self.is_reverse() {
1246 look
= look
.reversed();
1248 self.add_state(CState
::Look { look, next: StateID::ZERO }
)
1251 fn add_union(&self) -> Result
<StateID
, Error
> {
1252 self.add_state(CState
::Union { alternates: vec![] }
)
1255 fn add_reverse_union(&self) -> Result
<StateID
, Error
> {
1256 self.add_state(CState
::UnionReverse { alternates: vec![] }
)
1261 pattern_id
: PatternID
,
1263 ) -> Result
<StateID
, Error
> {
1264 self.add_state(CState
::Match { pattern_id, start_id }
)
1267 fn add_state(&self, state
: CState
) -> Result
<StateID
, Error
> {
1268 let mut states
= self.states
.borrow_mut();
1269 let id
= StateID
::new(states
.len())
1270 .map_err(|_
| Error
::too_many_states(states
.len()))?
;
1272 .set(self.memory_cstates
.get() + state
.memory_usage());
1274 // If we don't explicitly drop this, then 'nfa_memory_usage' will also
1275 // try to borrow it when we check the size limit and hit an error.
1277 self.check_nfa_size_limit()?
;
1281 fn is_reverse(&self) -> bool
{
1282 self.config
.get_reverse()
1285 /// If an NFA size limit was set, this checks that the NFA compiled so far
1286 /// fits within that limit. If so, then nothing is returned. Otherwise, an
1287 /// error is returned.
1289 /// This should be called after increasing the heap usage of the
1290 /// intermediate NFA.
1292 /// Note that this borrows 'self.states', so callers should ensure there is
1293 /// no mutable borrow of it outstanding.
1294 fn check_nfa_size_limit(&self) -> Result
<(), Error
> {
1295 if let Some(limit
) = self.config
.get_nfa_size_limit() {
1296 if self.nfa_memory_usage() > limit
{
1297 return Err(Error
::exceeded_size_limit(limit
));
1303 /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
1305 /// Note that this is an approximation of how big the final NFA will be.
1306 /// In practice, the final NFA will likely be a bit smaller since it uses
1307 /// things like `Box<[T]>` instead of `Vec<T>`.
1308 fn nfa_memory_usage(&self) -> usize {
1309 self.states
.borrow().len() * mem
::size_of
::<CState
>()
1310 + self.memory_cstates
.get()
1315 fn memory_usage(&self) -> usize {
1317 CState
::Empty { .. }
1318 | CState
::Range { .. }
1319 | CState
::Look { .. }
1320 | CState
::CaptureStart { .. }
1321 | CState
::CaptureEnd { .. }
1322 | CState
::Match { .. }
=> 0,
1323 CState
::Sparse { ref ranges }
=> {
1324 ranges
.len() * mem
::size_of
::<Transition
>()
1326 CState
::Union { ref alternates }
=> {
1327 alternates
.len() * mem
::size_of
::<StateID
>()
1329 CState
::UnionReverse { ref alternates }
=> {
1330 alternates
.len() * mem
::size_of
::<StateID
>()
1337 struct Utf8Compiler
<'a
> {
1339 state
: &'a
mut Utf8State
,
1343 #[derive(Clone, Debug)]
1345 compiled
: Utf8BoundedMap
,
1346 uncompiled
: Vec
<Utf8Node
>,
1349 #[derive(Clone, Debug)]
1351 trans
: Vec
<Transition
>,
1352 last
: Option
<Utf8LastTransition
>,
1355 #[derive(Clone, Debug)]
1356 struct Utf8LastTransition
{
1362 fn new() -> Utf8State
{
1363 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1366 fn clear(&mut self) {
1367 self.compiled
.clear();
1368 self.uncompiled
.clear();
1372 impl<'a
> Utf8Compiler
<'a
> {
1375 state
: &'a
mut Utf8State
,
1376 ) -> Result
<Utf8Compiler
<'a
>, Error
> {
1377 let target
= nfac
.add_empty()?
;
1379 let mut utf8c
= Utf8Compiler { nfac, state, target }
;
1384 fn finish(&mut self) -> Result
<ThompsonRef
, Error
> {
1385 self.compile_from(0)?
;
1386 let node
= self.pop_root();
1387 let start
= self.compile(node
)?
;
1388 Ok(ThompsonRef { start, end: self.target }
)
1391 fn add(&mut self, ranges
: &[Utf8Range
]) -> Result
<(), Error
> {
1392 let prefix_len
= ranges
1394 .zip(&self.state
.uncompiled
)
1395 .take_while(|&(range
, node
)| {
1396 node
.last
.as_ref().map_or(false, |t
| {
1397 (t
.start
, t
.end
) == (range
.start
, range
.end
)
1401 assert
!(prefix_len
< ranges
.len());
1402 self.compile_from(prefix_len
)?
;
1403 self.add_suffix(&ranges
[prefix_len
..]);
1407 fn compile_from(&mut self, from
: usize) -> Result
<(), Error
> {
1408 let mut next
= self.target
;
1409 while from
+ 1 < self.state
.uncompiled
.len() {
1410 let node
= self.pop_freeze(next
);
1411 next
= self.compile(node
)?
;
1413 self.top_last_freeze(next
);
1417 fn compile(&mut self, node
: Vec
<Transition
>) -> Result
<StateID
, Error
> {
1418 let hash
= self.state
.compiled
.hash(&node
);
1419 if let Some(id
) = self.state
.compiled
.get(&node
, hash
) {
1422 let id
= self.nfac
.add_sparse(node
.clone())?
;
1423 self.state
.compiled
.set(node
, hash
, id
);
1427 fn add_suffix(&mut self, ranges
: &[Utf8Range
]) {
1428 assert
!(!ranges
.is_empty());
1434 .expect("non-empty nodes");
1435 assert
!(self.state
.uncompiled
[last
].last
.is_none());
1436 self.state
.uncompiled
[last
].last
= Some(Utf8LastTransition
{
1437 start
: ranges
[0].start
,
1440 for r
in &ranges
[1..] {
1441 self.state
.uncompiled
.push(Utf8Node
{
1443 last
: Some(Utf8LastTransition { start: r.start, end: r.end }
),
1448 fn add_empty(&mut self) {
1449 self.state
.uncompiled
.push(Utf8Node { trans: vec![], last: None }
);
1452 fn pop_freeze(&mut self, next
: StateID
) -> Vec
<Transition
> {
1453 let mut uncompiled
= self.state
.uncompiled
.pop().unwrap();
1454 uncompiled
.set_last_transition(next
);
1458 fn pop_root(&mut self) -> Vec
<Transition
> {
1459 assert_eq
!(self.state
.uncompiled
.len(), 1);
1460 assert
!(self.state
.uncompiled
[0].last
.is_none());
1461 self.state
.uncompiled
.pop().expect("non-empty nodes").trans
1464 fn top_last_freeze(&mut self, next
: StateID
) {
1470 .expect("non-empty nodes");
1471 self.state
.uncompiled
[last
].set_last_transition(next
);
1476 fn set_last_transition(&mut self, next
: StateID
) {
1477 if let Some(last
) = self.last
.take() {
1478 self.trans
.push(Transition
{
1489 use alloc
::vec
::Vec
;
1492 Builder
, Config
, PatternID
, SparseTransitions
, State
, StateID
,
1496 fn build(pattern
: &str) -> NFA
{
1498 .configure(Config
::new().captures(false).unanchored_prefix(false))
1503 fn pid(id
: usize) -> PatternID
{
1504 PatternID
::new(id
).unwrap()
1507 fn sid(id
: usize) -> StateID
{
1508 StateID
::new(id
).unwrap()
1511 fn s_byte(byte
: u8, next
: usize) -> State
{
1512 let next
= sid(next
);
1513 let trans
= Transition { start: byte, end: byte, next }
;
1514 State
::Range { range: trans }
1517 fn s_range(start
: u8, end
: u8, next
: usize) -> State
{
1518 let next
= sid(next
);
1519 let trans
= Transition { start, end, next }
;
1520 State
::Range { range: trans }
1523 fn s_sparse(ranges
: &[(u8, u8, usize)]) -> State
{
1526 .map(|&(start
, end
, next
)| Transition
{
1532 State
::Sparse(SparseTransitions { ranges }
)
1535 fn s_union(alts
: &[usize]) -> State
{
1540 .collect
::<Vec
<StateID
>>()
1541 .into_boxed_slice(),
1545 fn s_match(id
: usize) -> State
{
1546 State
::Match { id: pid(id) }
1549 // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
1552 fn compile_unanchored_prefix() {
1553 // When the machine can only match valid UTF-8.
1554 let nfa
= Builder
::new()
1555 .configure(Config
::new().captures(false))
1558 // There should be many states since the `.` in `(?s:.)*?` matches any
1559 // Unicode scalar value.
1560 assert_eq
!(11, nfa
.len());
1561 assert_eq
!(nfa
.states
[10], s_match(0));
1562 assert_eq
!(nfa
.states
[9], s_byte(b'a'
, 10));
1564 // When the machine can match through invalid UTF-8.
1565 let nfa
= Builder
::new()
1566 .configure(Config
::new().captures(false).utf8(false))
1581 fn compile_empty() {
1582 assert_eq
!(build("").states
, &[s_match(0),]);
1586 fn compile_literal() {
1587 assert_eq
!(build("a").states
, &[s_byte(b'a'
, 1), s_match(0),]);
1590 &[s_byte(b'a'
, 1), s_byte(b'b'
, 2), s_match(0),]
1593 build("ā").states
,
1594 &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
1597 // Check that non-UTF-8 literals work.
1598 let nfa
= Builder
::new()
1603 .unanchored_prefix(false),
1605 .syntax(crate::SyntaxConfig
::new().utf8(false))
1606 .build(r
"(?-u)\xFF")
1608 assert_eq
!(nfa
.states
, &[s_byte(b'
\xFF'
, 1), s_match(0),]);
1612 fn compile_class() {
1614 build(r
"[a-z]").states
,
1615 &[s_range(b'a'
, b'z'
, 1), s_match(0),]
1618 build(r
"[x-za-c]").states
,
1619 &[s_sparse(&[(b'a'
, b'c'
, 1), (b'x'
, b'z'
, 1)]), s_match(0)]
1622 build(r
"[\u03B1-\u03B4]").states
,
1623 &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
1626 build(r
"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states
,
1628 s_range(0xB1, 0xB4, 5),
1629 s_range(0x99, 0x9E, 5),
1632 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1637 build(r
"[a-zā]").states
,
1641 s_sparse(&[(b'a'
, b'z'
, 3), (0xE2, 0xE2, 1)]),
1648 fn compile_repetition() {
1650 build(r
"a?").states
,
1651 &[s_union(&[1, 2]), s_byte(b'a'
, 2), s_match(0),]
1654 build(r
"a??").states
,
1655 &[s_union(&[2, 1]), s_byte(b'a'
, 2), s_match(0),]
1660 fn compile_group() {
1662 build(r
"ab+").states
,
1663 &[s_byte(b'a'
, 1), s_byte(b'b'
, 2), s_union(&[1, 3]), s_match(0)]
1666 build(r
"(ab)").states
,
1667 &[s_byte(b'a'
, 1), s_byte(b'b'
, 2), s_match(0)]
1670 build(r
"(ab)+").states
,
1671 &[s_byte(b'a'
, 1), s_byte(b'b'
, 2), s_union(&[0, 3]), s_match(0)]
1676 fn compile_alternation() {
1678 build(r
"a|b").states
,
1679 &[s_byte(b'a'
, 3), s_byte(b'b'
, 3), s_union(&[0, 1]), s_match(0)]
1682 build(r
"|b").states
,
1683 &[s_byte(b'b'
, 2), s_union(&[2, 0]), s_match(0)]
1686 build(r
"a|").states
,
1687 &[s_byte(b'a'
, 2), s_union(&[0, 2]), s_match(0)]
1692 fn many_start_pattern() {
1693 let nfa
= Builder
::new()
1694 .configure(Config
::new().captures(false).unanchored_prefix(false))
1695 .build_many(&["a", "b"])
1707 assert_eq
!(nfa
.start_anchored().as_usize(), 4);
1708 assert_eq
!(nfa
.start_unanchored().as_usize(), 4);
1709 // Test that the start states for each individual pattern are correct.
1710 assert_eq
!(nfa
.start_pattern(pid(0)), sid(0));
1711 assert_eq
!(nfa
.start_pattern(pid(1)), sid(2));