1 /*
2 This module provides an NFA compiler using Thompson's construction
3 algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
4 graph as output. The NFA graph is structured in a way that permits it to be
5 executed by a virtual machine and also used to efficiently build a DFA.
6
7 The compiler deals with a slightly expanded set of NFA states that notably
8 includes an empty node that has exactly one epsilon transition to the next
9 state. In other words, it's a "goto" instruction if one views Thompson's NFA
10 as a set of bytecode instructions. These goto instructions are removed in
11 a subsequent phase before returning the NFA to the caller. The purpose of
12 these empty nodes is that they make the construction algorithm substantially
13 simpler to implement. We remove them before returning to the caller because
14 they can represent substantial overhead when traversing the NFA graph
15 (either while searching using the NFA directly or while building a DFA).
16
17 In the future, it would be nice to provide a Glushkov compiler as well,
18 as it would work well as a bit-parallel NFA for smaller regexes. But
19 the Thompson construction is one I'm more familiar with and seems more
20 straight-forward to deal with when it comes to large Unicode character
21 classes.
22
23 Internally, the compiler uses interior mutability to improve composition
24 in the face of the borrow checker. In particular, we'd really like to be
25 able to write things like this:
26
27 self.c_concat(exprs.iter().map(|e| self.c(e)))
28
29 Which elegantly uses iterators to build up a sequence of compiled regex
30 sub-expressions and then hands it off to the concatenating compiler
31 routine. Without interior mutability, the borrow checker won't let us
32 borrow `self` mutably both inside and outside the closure at the same
33 time.
34 */
35
36 use core::{
37 borrow::Borrow,
38 cell::{Cell, RefCell},
39 mem,
40 };
41
42 use alloc::{sync::Arc, vec, vec::Vec};
43
44 use regex_syntax::{
45 hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
46 utf8::{Utf8Range, Utf8Sequences},
47 ParserBuilder,
48 };
49
50 use crate::{
51 nfa::thompson::{
52 error::Error,
53 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
54 range_trie::RangeTrie,
55 Look, SparseTransitions, State, Transition, NFA,
56 },
57 util::{
58 alphabet::ByteClassSet,
59 id::{IteratorIDExt, PatternID, StateID},
60 },
61 };
62
63 /// The configuration used for compiling a Thompson NFA from a regex pattern.
64 #[derive(Clone, Copy, Debug, Default)]
65 pub struct Config {
66 reverse: Option<bool>,
67 utf8: Option<bool>,
68 nfa_size_limit: Option<Option<usize>>,
69 shrink: Option<bool>,
70 captures: Option<bool>,
71 #[cfg(test)]
72 unanchored_prefix: Option<bool>,
73 }
74
75 impl Config {
76 /// Return a new default Thompson NFA compiler configuration.
77 pub fn new() -> Config {
78 Config::default()
79 }
80
81 /// Reverse the NFA.
82 ///
83 /// An NFA reversal is performed by reversing all of the concatenated
84 /// sub-expressions in the original pattern, recursively. The resulting
85 /// NFA can be used to match the pattern starting from the end of a string
86 /// instead of the beginning of a string.
87 ///
88 /// Reversing the NFA is useful for building a reverse DFA, which is most
89 /// useful for finding the start of a match after its ending position has
90 /// been found.
91 ///
92 /// This is disabled by default.
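///
/// # Example
///
/// A minimal sketch of building a reverse NFA. Since the reversed NFA is
/// typically only consumed when building a reverse DFA, this example only
/// checks that compilation succeeds:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().reverse(true))
///     .build(r"foo[0-9]+")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```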
93 pub fn reverse(mut self, yes: bool) -> Config {
94 self.reverse = Some(yes);
95 self
96 }
97
98 /// Whether to enable UTF-8 mode or not.
99 ///
100 /// When UTF-8 mode is enabled (which is the default), unanchored searches
101 /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
102 /// an unanchored search will stop at that point. This is equivalent to
103 /// putting a `(?s:.)*?` at the start of the regex.
104 ///
105 /// When UTF-8 mode is disabled, then unanchored searches will match
106 /// through any arbitrary byte. This is equivalent to putting a
107 /// `(?s-u:.)*?` at the start of the regex.
108 ///
109 /// Generally speaking, UTF-8 mode should only be used when you know you
110 /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
111 /// on input that is not valid UTF-8, then the regex is not likely to work
112 /// as expected.
113 ///
114 /// This is enabled by default.
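///
/// # Example
///
/// A minimal sketch of disabling UTF-8 mode, so that the implicit
/// `(?s-u:.)*?` prefix is used for unanchored searches:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().utf8(false))
///     .build(r"foo")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```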
115 pub fn utf8(mut self, yes: bool) -> Config {
116 self.utf8 = Some(yes);
117 self
118 }
119
120 /// Sets an approximate size limit on the total heap used by the NFA being
121 /// compiled.
122 ///
123 /// This permits imposing constraints on the size of a compiled NFA. This
124 /// may be useful in contexts where the regex pattern is untrusted and one
125 /// wants to avoid using too much memory.
126 ///
127 /// This size limit does not apply to auxiliary heap used during
128 /// compilation that is not part of the built NFA.
129 ///
130 /// Note that this size limit is applied during compilation in order for
131 /// the limit to prevent too much heap from being used. However, the
132 /// implementation may use an intermediate NFA representation that is
133 /// otherwise slightly bigger than the final public form. Since the size
134 /// limit may be applied to an intermediate representation, there is not
135 /// necessarily a precise correspondence between the configured size limit
136 /// and the heap usage of the final NFA.
137 ///
138 /// There is no size limit by default.
139 ///
140 /// # Example
141 ///
142 /// This example demonstrates how Unicode mode can greatly increase the
143 /// size of the NFA.
144 ///
145 /// ```
146 /// use regex_automata::nfa::thompson::NFA;
147 ///
148 /// // 300KB isn't enough!
149 /// NFA::builder()
150 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
151 /// .build(r"\w{20}")
152 /// .unwrap_err();
153 ///
154 /// // ... but 400KB probably is.
155 /// let nfa = NFA::builder()
156 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
157 /// .build(r"\w{20}")?;
158 ///
159 /// assert_eq!(nfa.pattern_len(), 1);
160 ///
161 /// # Ok::<(), Box<dyn std::error::Error>>(())
162 /// ```
163 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
164 self.nfa_size_limit = Some(bytes);
165 self
166 }
167
168 /// Apply best effort heuristics to shrink the NFA at the expense of more
169 /// time/memory.
170 ///
171 /// Generally speaking, if one is using an NFA to compile a DFA, then the
172 /// extra time used to shrink the NFA will be more than made up for during
173 /// DFA construction (potentially by a lot). In other words, enabling this
174 /// can substantially decrease the overall amount of time it takes to build
175 /// a DFA.
176 ///
177 /// The only reason to disable this is if you want to compile an NFA and
178 /// start using it as quickly as possible without needing to build a full
179 /// DFA, e.g., for an NFA simulation or for a lazy DFA.
180 ///
181 /// This is enabled by default.
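///
/// # Example
///
/// A minimal sketch of disabling shrinking, e.g., when the NFA will be
/// simulated directly rather than used to build a full DFA:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().shrink(false))
///     .build(r"\w")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```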
182 pub fn shrink(mut self, yes: bool) -> Config {
183 self.shrink = Some(yes);
184 self
185 }
186
187 /// Whether to include 'Capture' states in the NFA.
188 ///
189 /// This can only be enabled when compiling a forward NFA. This is
190 /// always disabled---with no way to override it---when the `reverse`
191 /// configuration is enabled.
192 ///
193 /// This is enabled by default.
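///
/// # Example
///
/// A minimal sketch of disabling capture states, which is useful when only
/// overall match spans are needed:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder()
///     .configure(NFA::config().captures(false))
///     .build(r"(a)(b)")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```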
194 pub fn captures(mut self, yes: bool) -> Config {
195 self.captures = Some(yes);
196 self
197 }
198
199 /// Whether to compile an unanchored prefix into this NFA.
200 ///
201 /// This is enabled by default. It is made available for tests only to make
202 /// it easier to unit test the output of the compiler.
203 #[cfg(test)]
204 fn unanchored_prefix(mut self, yes: bool) -> Config {
205 self.unanchored_prefix = Some(yes);
206 self
207 }
208
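/// Return whether reverse mode is enabled for this configuration.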
209 pub fn get_reverse(&self) -> bool {
210 self.reverse.unwrap_or(false)
211 }
212
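/// Return whether UTF-8 mode is enabled for this configuration.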
213 pub fn get_utf8(&self) -> bool {
214 self.utf8.unwrap_or(true)
215 }
216
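/// Return the NFA size limit, if one was set.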
217 pub fn get_nfa_size_limit(&self) -> Option<usize> {
218 self.nfa_size_limit.unwrap_or(None)
219 }
220
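/// Return whether NFA shrinking is enabled for this configuration.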
221 pub fn get_shrink(&self) -> bool {
222 self.shrink.unwrap_or(true)
223 }
224
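/// Return whether 'Capture' states will be compiled. This always returns
/// false when reverse mode is enabled, regardless of the `captures` setting.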
225 pub fn get_captures(&self) -> bool {
226 !self.get_reverse() && self.captures.unwrap_or(true)
227 }
228
229 fn get_unanchored_prefix(&self) -> bool {
230 #[cfg(test)]
231 {
232 self.unanchored_prefix.unwrap_or(true)
233 }
234 #[cfg(not(test))]
235 {
236 true
237 }
238 }
239
240 pub(crate) fn overwrite(self, o: Config) -> Config {
241 Config {
242 reverse: o.reverse.or(self.reverse),
243 utf8: o.utf8.or(self.utf8),
244 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
245 shrink: o.shrink.or(self.shrink),
246 captures: o.captures.or(self.captures),
247 #[cfg(test)]
248 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
249 }
250 }
251 }
252
253 /// A builder for compiling an NFA.
254 #[derive(Clone, Debug)]
255 pub struct Builder {
256 config: Config,
257 parser: ParserBuilder,
258 }
259
260 impl Builder {
261 /// Create a new NFA builder with its default configuration.
262 pub fn new() -> Builder {
263 Builder { config: Config::default(), parser: ParserBuilder::new() }
264 }
265
266 /// Compile the given regular expression into an NFA.
267 ///
268 /// If there was a problem parsing the regex, then that error is returned.
269 ///
270 /// Otherwise, if there was a problem building the NFA, then an error is
271 /// returned. The only error that can occur is if the compiled regex would
272 /// exceed the size limits configured on this builder.
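///
/// # Example
///
/// A minimal sketch of compiling a single pattern with the default
/// configuration:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder().build(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```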
273 pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
274 self.build_many(&[pattern])
275 }
276
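/// Compile the given regular expressions into a single NFA that matches any
/// of the patterns.
///
/// Pattern IDs are assigned in the order in which the patterns are given.
///
/// # Example
///
/// A minimal sketch of compiling two patterns into one NFA:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::builder().build_many(&["[a-z]+", "[0-9]+"])?;
/// assert_eq!(nfa.pattern_len(), 2);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```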
277 pub fn build_many<P: AsRef<str>>(
278 &self,
279 patterns: &[P],
280 ) -> Result<NFA, Error> {
281 let mut hirs = vec![];
282 for p in patterns {
283 hirs.push(
284 self.parser
285 .build()
286 .parse(p.as_ref())
287 .map_err(Error::syntax)?,
288 );
289 log!(log::trace!("parsed: {:?}", p.as_ref()));
290 }
291 self.build_many_from_hir(&hirs)
292 }
293
294 /// Compile the given high level intermediate representation of a regular
295 /// expression into an NFA.
296 ///
297 /// If there was a problem building the NFA, then an error is returned. The
298 /// only error that can occur is if the compiled regex would exceed the
299 /// size limits configured on this builder.
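///
/// # Example
///
/// A minimal sketch, assuming `regex-syntax` is used to produce the HIR
/// (the same crate this builder uses internally for parsing):
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
/// use regex_syntax::ParserBuilder;
///
/// let hir = ParserBuilder::new().build().parse(r"[a-z]+")?;
/// let nfa = NFA::builder().build_from_hir(&hir)?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```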
300 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
301 self.build_from_hir_with(&mut Compiler::new(), expr)
302 }
303
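/// Compile the given high level intermediate representations of regular
/// expressions into a single NFA that matches any of the corresponding
/// patterns.
///
/// Pattern IDs are assigned in the order in which the expressions are given.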
304 pub fn build_many_from_hir<H: Borrow<Hir>>(
305 &self,
306 exprs: &[H],
307 ) -> Result<NFA, Error> {
308 self.build_many_from_hir_with(&mut Compiler::new(), exprs)
309 }
310
311 /// Compile the given high level intermediate representation of a regular
312 /// expression into an NFA using the given compiler. Callers may prefer
313 /// this over `build_from_hir` if they would like to reuse the compiler's
314 /// allocations while compiling many regular expressions.
315 ///
316 /// On success, the compiled NFA is returned. The compiler itself is
317 /// reconfigured and reset before compilation begins.
318 ///
319 /// If there was a problem building the NFA, then an error is returned.
320 /// The only error that can occur is if the compiled regex would exceed
321 /// the size limits configured on this builder. When an error is returned,
322 /// the state of the given compiler is unspecified, but it can still be
323 /// reused in subsequent calls to this method.
324 fn build_from_hir_with(
325 &self,
326 compiler: &mut Compiler,
327 expr: &Hir,
328 ) -> Result<NFA, Error> {
329 self.build_many_from_hir_with(compiler, &[expr])
330 }
331
332 fn build_many_from_hir_with<H: Borrow<Hir>>(
333 &self,
334 compiler: &mut Compiler,
335 exprs: &[H],
336 ) -> Result<NFA, Error> {
337 compiler.configure(self.config);
338 compiler.compile(exprs)
339 }
340
341 /// Apply the given NFA configuration options to this builder.
342 pub fn configure(&mut self, config: Config) -> &mut Builder {
343 self.config = self.config.overwrite(config);
344 self
345 }
346
347 /// Set the syntax configuration for this builder using
348 /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
349 ///
350 /// This permits setting things like case insensitivity, Unicode and multi
351 /// line mode.
352 ///
353 /// This syntax configuration generally only applies when an NFA is built
354 /// directly from a pattern string. If an NFA is built from an HIR, then
355 /// all syntax settings are ignored.
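///
/// # Example
///
/// A minimal sketch of relaxing the syntax-level UTF-8 requirement so that
/// patterns may match arbitrary bytes (the NFA-level `utf8` option is
/// disabled as well, mirroring this crate's own tests):
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, SyntaxConfig};
///
/// let nfa = NFA::builder()
///     .syntax(SyntaxConfig::new().utf8(false))
///     .configure(NFA::config().utf8(false))
///     .build(r"(?-u)\xFF")?;
/// assert_eq!(nfa.pattern_len(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```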
356 pub fn syntax(
357 &mut self,
358 config: crate::util::syntax::SyntaxConfig,
359 ) -> &mut Builder {
360 config.apply(&mut self.parser);
361 self
362 }
363 }
364
365 /// A compiler that converts a regex abstract syntax to an NFA via Thompson's
366 /// construction. Namely, this compiler permits epsilon transitions between
367 /// states.
368 #[derive(Clone, Debug)]
369 pub struct Compiler {
370 /// The configuration from the builder.
371 config: Config,
372 /// The final NFA that is built.
373 ///
374 /// Parts of this NFA are constructed during compilation, but the actual
375 /// states aren't added until a final "finish" step. This is because the
376 /// states constructed during compilation have unconditional epsilon
377 /// transitions, which makes the logic of compilation much simpler. The
378 /// "finish" step removes these unconditional epsilon transitions and must
379 /// therefore remap all of the transition state IDs.
380 nfa: RefCell<NFA>,
381 /// The set of compiled NFA states. Once a state is compiled, it is
382 /// assigned a state ID equivalent to its index in this list. Subsequent
383 /// compilation can modify previous states by adding new transitions.
384 states: RefCell<Vec<CState>>,
385 /// State used for compiling character classes to UTF-8 byte automata.
386 /// State is not retained between character class compilations. This just
387 /// serves to amortize allocation to the extent possible.
388 utf8_state: RefCell<Utf8State>,
389 /// State used for arranging character classes in reverse into a trie.
390 trie_state: RefCell<RangeTrie>,
391 /// State used for caching common suffixes when compiling reverse UTF-8
392 /// automata (for Unicode character classes).
393 utf8_suffix: RefCell<Utf8SuffixMap>,
394 /// A map used to re-map state IDs when translating the compiler's internal
395 /// NFA state representation to the external NFA representation.
396 remap: RefCell<Vec<StateID>>,
397 /// A set of compiler internal state IDs that correspond to states that are
398 /// exclusively epsilon transitions, i.e., goto instructions, combined with
399 /// the state that they point to. This is used to record said states while
400 /// transforming the compiler's internal NFA representation to the external
401 /// form.
402 empties: RefCell<Vec<(StateID, StateID)>>,
403 /// The total memory used by each of the 'CState's in 'states'. This only
404 /// includes heap usage by each state, and not the size of the state
405 /// itself.
406 memory_cstates: Cell<usize>,
407 }
408
409 /// A compiler intermediate state representation for an NFA that is only used
410 /// during compilation. Once compilation is done, `CState`s are converted
411 /// to `State`s (defined in the parent module), which have a much simpler
412 /// representation.
413 #[derive(Clone, Debug, Eq, PartialEq)]
414 enum CState {
415 /// An empty state whose only purpose is to forward the automaton to
416 /// another state via an epsilon transition. These are useful during
417 /// compilation but are otherwise removed at the end.
418 Empty {
419 next: StateID,
420 },
421 /// An empty state that records a capture location.
422 ///
423 /// From the perspective of finite automata, this is precisely equivalent
424 /// to 'Empty', but serves the purpose of instructing NFA simulations to
425 /// record additional state when the finite state machine passes through
426 /// this epsilon transition.
427 ///
428 /// These transitions are treated as epsilon transitions with no additional
429 /// effects in DFAs.
430 ///
431 /// 'slot' in this context refers to the specific capture group offset that
432 /// is being recorded. Each capturing group has two slots corresponding to
433 /// the start and end of the matching portion of that group.
434 CaptureStart {
435 next: StateID,
436 capture_index: u32,
437 name: Option<Arc<str>>,
438 },
439 CaptureEnd {
440 next: StateID,
441 capture_index: u32,
442 },
443 /// A state that only transitions to `next` if the current input byte is
444 /// in the range `[start, end]` (inclusive on both ends).
445 Range {
446 range: Transition,
447 },
448 /// A state with possibly many transitions, represented in a sparse
449 /// fashion. Transitions are ordered lexicographically by input range.
450 /// As such, this may only be used when every transition has equal
451 /// priority. (In practice, this is only used for encoding large UTF-8
452 /// automata.) In contrast, a `Union` state has each alternate in order
453 /// of priority. Priority is used to implement greedy matching and also
454 /// alternations themselves, e.g., `abc|a` where `abc` has priority over
455 /// `a`.
456 ///
457 /// To clarify, it is possible to remove `Sparse` and represent all things
458 /// that `Sparse` is used for via `Union`. But this creates a more bloated
459 /// NFA with more epsilon transitions than is necessary in the special case
460 /// of character classes.
461 Sparse {
462 ranges: Vec<Transition>,
463 },
464 /// A conditional epsilon transition satisfied via some sort of
465 /// look-around.
466 Look {
467 look: Look,
468 next: StateID,
469 },
470 /// An alternation such that there exists an epsilon transition to all
471 /// states in `alternates`, where matches found via earlier transitions
472 /// are preferred over later transitions.
473 Union {
474 alternates: Vec<StateID>,
475 },
476 /// An alternation such that there exists an epsilon transition to all
477 /// states in `alternates`, where matches found via later transitions are
478 /// preferred over earlier transitions.
479 ///
480 /// This "reverse" state exists as a convenience during compilation that
481 /// permits easy construction of non-greedy combinations of NFA states. At
482 /// the end of compilation, Union and UnionReverse states are merged into
483 /// one Union type of state, where the latter has its epsilon transitions
484 /// reversed to reflect the priority inversion.
485 ///
486 /// The "convenience" here arises from the fact that as new states are
487 /// added to the list of `alternates`, we would like that add operation
488 /// to be amortized constant time. But if we used a `Union`, we'd need to
489 /// prepend the state, which takes O(n) time. There are other approaches we
490 /// could use to solve this, but this seems simple enough.
491 UnionReverse {
492 alternates: Vec<StateID>,
493 },
494 /// A match state. There is at most one such occurrence of this state in
495 /// an NFA for each pattern compiled into the NFA. At time of writing, a
496 /// match state is always produced for every pattern given, but in theory,
497 /// if a pattern can never lead to a match, then the match state could be
498 /// omitted.
499 ///
500 /// `pattern_id` refers to the ID of the pattern itself, which corresponds
501 /// to the pattern's index (starting at 0). `start_id` refers to the
502 /// anchored NFA starting state corresponding to this pattern.
503 Match {
504 pattern_id: PatternID,
505 start_id: StateID,
506 },
507 }
508
509 /// A value that represents the result of compiling a sub-expression of a
510 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
511 /// has an initial state at `start` and a final state at `end`.
512 #[derive(Clone, Copy, Debug)]
513 pub struct ThompsonRef {
514 start: StateID,
515 end: StateID,
516 }
517
518 impl Compiler {
519 /// Create a new compiler.
520 pub fn new() -> Compiler {
521 Compiler {
522 config: Config::default(),
523 nfa: RefCell::new(NFA::empty()),
524 states: RefCell::new(vec![]),
525 utf8_state: RefCell::new(Utf8State::new()),
526 trie_state: RefCell::new(RangeTrie::new()),
527 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
528 remap: RefCell::new(vec![]),
529 empties: RefCell::new(vec![]),
530 memory_cstates: Cell::new(0),
531 }
532 }
533
534 /// Configure and prepare this compiler from the builder's knobs.
535 ///
536 /// The compiler must always be reconfigured by the builder before using it
537 /// to build an NFA. Namely, this will also clear any latent state in the
538 /// compiler used during previous compilations.
539 fn configure(&mut self, config: Config) {
540 self.config = config;
541 self.nfa.borrow_mut().clear();
542 self.states.borrow_mut().clear();
543 self.memory_cstates.set(0);
544 // We don't need to clear anything else since they are cleared on
545 // their own and only when they are used.
546 }
547
548 /// Compile the given expressions and convert the resulting intermediate NFA to its final form.
549 fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
550 if exprs.is_empty() {
551 return Ok(NFA::never_match());
552 }
553 if exprs.len() > PatternID::LIMIT {
554 return Err(Error::too_many_patterns(exprs.len()));
555 }
556
557 // We always add an unanchored prefix unless we were specifically told
558 // not to (for tests only), or if we know that the regex is anchored
559 // for all matches. When an unanchored prefix is not added, then the
560 // NFA's anchored and unanchored start states are equivalent.
561 let all_anchored =
562 exprs.iter().all(|e| e.borrow().is_anchored_start());
563 let anchored = !self.config.get_unanchored_prefix() || all_anchored;
564 let unanchored_prefix = if anchored {
565 self.c_empty()?
566 } else {
567 if self.config.get_utf8() {
568 self.c_unanchored_prefix_valid_utf8()?
569 } else {
570 self.c_unanchored_prefix_invalid_utf8()?
571 }
572 };
573
574 let compiled = self.c_alternation(
575 exprs.iter().with_pattern_ids().map(|(pid, e)| {
576 let group_kind = hir::GroupKind::CaptureIndex(0);
577 let one = self.c_group(&group_kind, e.borrow())?;
578 let match_state_id = self.add_match(pid, one.start)?;
579 self.patch(one.end, match_state_id)?;
580 Ok(ThompsonRef { start: one.start, end: match_state_id })
581 }),
582 )?;
583 self.patch(unanchored_prefix.end, compiled.start)?;
584 self.finish(compiled.start, unanchored_prefix.start)?;
585 Ok(self.nfa.replace(NFA::empty()))
586 }
587
588 /// Finishes the compilation process and populates the NFA attached to this
589 /// compiler with the final graph.
590 fn finish(
591 &self,
592 start_anchored: StateID,
593 start_unanchored: StateID,
594 ) -> Result<(), Error> {
595 trace!(
596 "intermediate NFA compilation complete, \
597 intermediate NFA size: {} states, {} bytes on heap",
598 self.states.borrow().len(),
599 self.nfa_memory_usage(),
600 );
601 let mut nfa = self.nfa.borrow_mut();
602 let mut bstates = self.states.borrow_mut();
603 let mut remap = self.remap.borrow_mut();
604 let mut empties = self.empties.borrow_mut();
605 remap.resize(bstates.len(), StateID::ZERO);
606 empties.clear();
607
608 // The idea here is to convert our intermediate states to their final
609 // form. The only real complexity here is the process of converting
610 // transitions, which are expressed in terms of state IDs. The new
611 // set of states will be smaller because of partial epsilon removal,
612 // so the state IDs will not be the same.
613 for (sid, bstate) in bstates.iter_mut().with_state_ids() {
614 match *bstate {
615 CState::Empty { next } => {
616 // Since we're removing empty states, we need to handle
617 // them later since we don't yet know which new state this
618 // empty state will be mapped to.
619 empties.push((sid, next));
620 }
621 CState::CaptureStart { next, capture_index, ref name } => {
622 // We can't remove this empty state because of the side
623 // effect of capturing an offset for this capture slot.
624 remap[sid] = nfa.add_capture_start(
625 next,
626 capture_index,
627 name.clone(),
628 )?;
629 }
630 CState::CaptureEnd { next, capture_index } => {
631 // We can't remove this empty state because of the side
632 // effect of capturing an offset for this capture slot.
633 remap[sid] = nfa.add_capture_end(next, capture_index)?;
634 }
635 CState::Range { range } => {
636 remap[sid] = nfa.add_range(range)?;
637 }
638 CState::Sparse { ref mut ranges } => {
639 let ranges =
640 mem::replace(ranges, vec![]).into_boxed_slice();
641 remap[sid] =
642 nfa.add_sparse(SparseTransitions { ranges })?;
643 }
644 CState::Look { look, next } => {
645 remap[sid] = nfa.add_look(next, look)?;
646 }
647 CState::Union { ref mut alternates } => {
648 let alternates =
649 mem::replace(alternates, vec![]).into_boxed_slice();
650 remap[sid] = nfa.add_union(alternates)?;
651 }
652 CState::UnionReverse { ref mut alternates } => {
653 let mut alternates =
654 mem::replace(alternates, vec![]).into_boxed_slice();
655 alternates.reverse();
656 remap[sid] = nfa.add_union(alternates)?;
657 }
658 CState::Match { start_id, .. } => {
659 remap[sid] = nfa.add_match()?;
660 nfa.finish_pattern(start_id)?;
661 }
662 }
663 }
664 for &(empty_id, mut empty_next) in empties.iter() {
665 // empty states can point to other empty states, forming a chain.
666 // So we must follow the chain until the end, which must end at
667 // a non-empty state, and therefore, a state that is correctly
668 // remapped. We are guaranteed to terminate because our compiler
669 // never builds a loop among only empty states.
670 while let CState::Empty { next } = bstates[empty_next] {
671 empty_next = next;
672 }
673 remap[empty_id] = remap[empty_next];
674 }
675 nfa.set_start_anchored(start_anchored);
676 nfa.set_start_unanchored(start_unanchored);
677 nfa.remap(&remap);
678 trace!(
679 "final NFA (reverse? {:?}) compilation complete, \
680 final NFA size: {} states, {} bytes on heap",
681 self.config.get_reverse(),
682 nfa.states().len(),
683 nfa.memory_usage(),
684 );
685 Ok(())
686 }
687
688 fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
689 match *expr.kind() {
690 HirKind::Empty => self.c_empty(),
691 HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
692 HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
693 HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
694 HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
695 HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
696 HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
697 HirKind::Repetition(ref rep) => self.c_repetition(rep),
698 HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
699 HirKind::Concat(ref es) => {
700 self.c_concat(es.iter().map(|e| self.c(e)))
701 }
702 HirKind::Alternation(ref es) => {
703 self.c_alternation(es.iter().map(|e| self.c(e)))
704 }
705 }
706 }
707
708 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
709 where
710 I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
711 {
712 let first = if self.is_reverse() { it.next_back() } else { it.next() };
713 let ThompsonRef { start, mut end } = match first {
714 Some(result) => result?,
715 None => return self.c_empty(),
716 };
717 loop {
718 let next =
719 if self.is_reverse() { it.next_back() } else { it.next() };
720 let compiled = match next {
721 Some(result) => result?,
722 None => break,
723 };
724 self.patch(end, compiled.start)?;
725 end = compiled.end;
726 }
727 Ok(ThompsonRef { start, end })
728 }
729
730 fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
731 where
732 I: Iterator<Item = Result<ThompsonRef, Error>>,
733 {
734 let first = it.next().expect("alternations must be non-empty")?;
735 let second = match it.next() {
736 None => return Ok(first),
737 Some(result) => result?,
738 };
739
740 let union = self.add_union()?;
741 let end = self.add_empty()?;
742 self.patch(union, first.start)?;
743 self.patch(first.end, end)?;
744 self.patch(union, second.start)?;
745 self.patch(second.end, end)?;
746 for result in it {
747 let compiled = result?;
748 self.patch(union, compiled.start)?;
749 self.patch(compiled.end, end)?;
750 }
751 Ok(ThompsonRef { start: union, end })
752 }
753
754 fn c_group(
755 &self,
756 kind: &hir::GroupKind,
757 expr: &Hir,
758 ) -> Result<ThompsonRef, Error> {
759 if !self.config.get_captures() {
760 return self.c(expr);
761 }
762 let (capi, name) = match *kind {
763 hir::GroupKind::NonCapturing => return self.c(expr),
764 hir::GroupKind::CaptureIndex(index) => (index, None),
765 hir::GroupKind::CaptureName { ref name, index } => {
766 (index, Some(Arc::from(&**name)))
767 }
768 };
769
770 let start = self.add_capture_start(capi, name)?;
771 let inner = self.c(expr)?;
772 let end = self.add_capture_end(capi)?;
773
774 self.patch(start, inner.start)?;
775 self.patch(inner.end, end)?;
776 Ok(ThompsonRef { start, end })
777 }
778
779 fn c_repetition(
780 &self,
781 rep: &hir::Repetition,
782 ) -> Result<ThompsonRef, Error> {
783 match rep.kind {
784 hir::RepetitionKind::ZeroOrOne => {
785 self.c_zero_or_one(&rep.hir, rep.greedy)
786 }
787 hir::RepetitionKind::ZeroOrMore => {
788 self.c_at_least(&rep.hir, rep.greedy, 0)
789 }
790 hir::RepetitionKind::OneOrMore => {
791 self.c_at_least(&rep.hir, rep.greedy, 1)
792 }
793 hir::RepetitionKind::Range(ref rng) => match *rng {
794 hir::RepetitionRange::Exactly(count) => {
795 self.c_exactly(&rep.hir, count)
796 }
797 hir::RepetitionRange::AtLeast(m) => {
798 self.c_at_least(&rep.hir, rep.greedy, m)
799 }
800 hir::RepetitionRange::Bounded(min, max) => {
801 self.c_bounded(&rep.hir, rep.greedy, min, max)
802 }
803 },
804 }
805 }
806
807 fn c_bounded(
808 &self,
809 expr: &Hir,
810 greedy: bool,
811 min: u32,
812 max: u32,
813 ) -> Result<ThompsonRef, Error> {
814 let prefix = self.c_exactly(expr, min)?;
815 if min == max {
816 return Ok(prefix);
817 }
818
819 // It is tempting to compile the rest here as a concatenation
820 // of zero-or-one matches, i.e., for `a{2,5}`, compile it as if it
821 // were `aaa?a?a?`. The problem here is that it leads to this program:
822 //
823 // >000000: 61 => 01
824 // 000001: 61 => 02
825 // 000002: union(03, 04)
826 // 000003: 61 => 04
827 // 000004: union(05, 06)
828 // 000005: 61 => 06
829 // 000006: union(07, 08)
830 // 000007: 61 => 08
831 // 000008: MATCH
832 //
833 // And effectively, once you hit state 2, the epsilon closure will
834 // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
835 // to instead compile it like so:
836 //
837 // >000000: 61 => 01
838 // 000001: 61 => 02
839 // 000002: union(03, 08)
840 // 000003: 61 => 04
841 // 000004: union(05, 08)
842 // 000005: 61 => 06
843 // 000006: union(07, 08)
844 // 000007: 61 => 08
845 // 000008: MATCH
846 //
847 // So that the epsilon closure of state 2 is now just 3 and 8.
848 let empty = self.add_empty()?;
849 let mut prev_end = prefix.end;
850 for _ in min..max {
851 let union = if greedy {
852 self.add_union()
853 } else {
854 self.add_reverse_union()
855 }?;
856 let compiled = self.c(expr)?;
857 self.patch(prev_end, union)?;
858 self.patch(union, compiled.start)?;
859 self.patch(union, empty)?;
860 prev_end = compiled.end;
861 }
862 self.patch(prev_end, empty)?;
863 Ok(ThompsonRef { start: prefix.start, end: empty })
864 }
865
866 fn c_at_least(
867 &self,
868 expr: &Hir,
869 greedy: bool,
870 n: u32,
871 ) -> Result<ThompsonRef, Error> {
872 if n == 0 {
873 // When the expression cannot match the empty string, then we
874 // can get away with something much simpler: just one 'alt'
875 // instruction that optionally repeats itself. But if the expr
876 // can match the empty string... see below.
877 if !expr.is_match_empty() {
878 let union = if greedy {
879 self.add_union()
880 } else {
881 self.add_reverse_union()
882 }?;
883 let compiled = self.c(expr)?;
884 self.patch(union, compiled.start)?;
885 self.patch(compiled.end, union)?;
886 return Ok(ThompsonRef { start: union, end: union });
887 }
888
889 // What's going on here? Shouldn't x* be simpler than this? It
890 // turns out that when implementing leftmost-first (Perl-like)
891 // match semantics, x* results in an incorrect preference order
892 // when computing the transitive closure of states if and only if
893 // 'x' can match the empty string. So instead, we compile x* as
894 // (x+)?, which preserves the correct preference order.
895 //
896 // See: https://github.com/rust-lang/regex/issues/779
897 let compiled = self.c(expr)?;
898 let plus = if greedy {
899 self.add_union()
900 } else {
901 self.add_reverse_union()
902 }?;
903 self.patch(compiled.end, plus)?;
904 self.patch(plus, compiled.start)?;
905
906 let question = if greedy {
907 self.add_union()
908 } else {
909 self.add_reverse_union()
910 }?;
911 let empty = self.add_empty()?;
912 self.patch(question, compiled.start)?;
913 self.patch(question, empty)?;
914 self.patch(plus, empty)?;
915 Ok(ThompsonRef { start: question, end: empty })
916 } else if n == 1 {
917 let compiled = self.c(expr)?;
918 let union = if greedy {
919 self.add_union()
920 } else {
921 self.add_reverse_union()
922 }?;
923 self.patch(compiled.end, union)?;
924 self.patch(union, compiled.start)?;
925 Ok(ThompsonRef { start: compiled.start, end: union })
926 } else {
927 let prefix = self.c_exactly(expr, n - 1)?;
928 let last = self.c(expr)?;
929 let union = if greedy {
930 self.add_union()
931 } else {
932 self.add_reverse_union()
933 }?;
934 self.patch(prefix.end, last.start)?;
935 self.patch(last.end, union)?;
936 self.patch(union, last.start)?;
937 Ok(ThompsonRef { start: prefix.start, end: union })
938 }
939 }
940
941 fn c_zero_or_one(
942 &self,
943 expr: &Hir,
944 greedy: bool,
945 ) -> Result<ThompsonRef, Error> {
946 let union =
947 if greedy { self.add_union() } else { self.add_reverse_union() }?;
948 let compiled = self.c(expr)?;
949 let empty = self.add_empty()?;
950 self.patch(union, compiled.start)?;
951 self.patch(union, empty)?;
952 self.patch(compiled.end, empty)?;
953 Ok(ThompsonRef { start: union, end: empty })
954 }
955
956 fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
957 let it = (0..n).map(|_| self.c(expr));
958 self.c_concat(it)
959 }
960
961 fn c_byte_class(
962 &self,
963 cls: &hir::ClassBytes,
964 ) -> Result<ThompsonRef, Error> {
965 let end = self.add_empty()?;
966 let mut trans = Vec::with_capacity(cls.ranges().len());
967 for r in cls.iter() {
968 trans.push(Transition {
969 start: r.start(),
970 end: r.end(),
971 next: end,
972 });
973 }
974 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
975 }
976
977 fn c_unicode_class(
978 &self,
979 cls: &hir::ClassUnicode,
980 ) -> Result<ThompsonRef, Error> {
981 // If all we have are ASCII ranges wrapped in a Unicode package, then
982 // there is zero reason to bring out the big guns. We can fit all ASCII
983 // ranges within a single sparse state.
984 if cls.is_all_ascii() {
985 let end = self.add_empty()?;
986 let mut trans = Vec::with_capacity(cls.ranges().len());
987 for r in cls.iter() {
988 assert!(r.start() <= '\x7F');
989 assert!(r.end() <= '\x7F');
990 trans.push(Transition {
991 start: r.start() as u8,
992 end: r.end() as u8,
993 next: end,
994 });
995 }
996 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
997 } else if self.is_reverse() {
998 if !self.config.get_shrink() {
999 // When we don't want to spend the extra time shrinking, we
1000 // compile the UTF-8 automaton in reverse using something like
1001 // the "naive" approach, but will attempt to re-use common
1002 // suffixes.
1003 self.c_unicode_class_reverse_with_suffix(cls)
1004 } else {
1005 // When we want to shrink our NFA for reverse UTF-8 automata,
1006 // we cannot feed UTF-8 sequences directly to the UTF-8
1007 // compiler, since the UTF-8 compiler requires all sequences
1008 // to be lexicographically sorted. Instead, we organize our
1009 // sequences into a range trie, which can then output our
1010 // sequences in the correct order. Unfortunately, building the
1011 // range trie is fairly expensive (but not nearly as expensive
1012 // as building a DFA). Hence the reason why the 'shrink' option
1013 // exists, so that this path can be toggled off. For example,
1014 // we might want to turn this off if we know we won't be
1015 // compiling a DFA.
1016 let mut trie = self.trie_state.borrow_mut();
1017 trie.clear();
1018
1019 for rng in cls.iter() {
1020 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1021 seq.reverse();
1022 trie.insert(seq.as_slice());
1023 }
1024 }
1025 let mut utf8_state = self.utf8_state.borrow_mut();
1026 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
1027 trie.iter(|seq| {
1028 utf8c.add(&seq)?;
1029 Ok(())
1030 })?;
1031 utf8c.finish()
1032 }
1033 } else {
1034 // In the forward direction, we always shrink our UTF-8 automata
1035 // because we can stream it right into the UTF-8 compiler. There
1036 // is almost no downside (in either memory or time) to using this
1037 // approach.
1038 let mut utf8_state = self.utf8_state.borrow_mut();
1039 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
1040 for rng in cls.iter() {
1041 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1042 utf8c.add(seq.as_slice())?;
1043 }
1044 }
1045 utf8c.finish()
1046 }
1047
1048 // For reference, the code below is the "naive" version of compiling a
1049 // UTF-8 automaton. It is deliciously simple (and works for both the
1050 // forward and reverse cases), but will unfortunately produce very
1051 // large NFAs. When compiling a forward automaton, the size difference
1052 // can sometimes be an order of magnitude. For example, the '\w' regex
1053 // will generate about ~3000 NFA states using the naive approach below,
1054 // but only 283 states when using the approach above. This is because
1055 // the approach above actually compiles a *minimal* (or near minimal,
1056 // because of the bounded hashmap for reusing equivalent states) UTF-8
1057 // automaton.
1058 //
1059 // The code below is kept as a reference point in order to make it
1060 // easier to understand the higher level goal here. Although, it will
1061 // almost certainly bit-rot, so keep that in mind.
1062 /*
1063 let it = cls
1064 .iter()
1065 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1066 .map(|seq| {
1067 let it = seq
1068 .as_slice()
1069 .iter()
1070 .map(|rng| self.c_range(rng.start, rng.end));
1071 self.c_concat(it)
1072 });
1073 self.c_alternation(it)
1074 */
1075 }
1076
1077 fn c_unicode_class_reverse_with_suffix(
1078 &self,
1079 cls: &hir::ClassUnicode,
1080 ) -> Result<ThompsonRef, Error> {
1081 // N.B. It would likely be better to cache common *prefixes* in the
1082 // reverse direction, but it's not quite clear how to do that. The
1083 // advantage of caching suffixes is that it does give us a win, and
1084 // has a very small additional overhead.
1085 let mut cache = self.utf8_suffix.borrow_mut();
1086 cache.clear();
1087
1088 let union = self.add_union()?;
1089 let alt_end = self.add_empty()?;
1090 for urng in cls.iter() {
1091 for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1092 let mut end = alt_end;
1093 for brng in seq.as_slice() {
1094 let key = Utf8SuffixKey {
1095 from: end,
1096 start: brng.start,
1097 end: brng.end,
1098 };
1099 let hash = cache.hash(&key);
1100 if let Some(id) = cache.get(&key, hash) {
1101 end = id;
1102 continue;
1103 }
1104
1105 let compiled = self.c_range(brng.start, brng.end)?;
1106 self.patch(compiled.end, end)?;
1107 end = compiled.start;
1108 cache.set(key, hash, end);
1109 }
1110 self.patch(union, end)?;
1111 }
1112 }
1113 Ok(ThompsonRef { start: union, end: alt_end })
1114 }
1115
1116 fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
1117 let look = match *anchor {
1118 Anchor::StartLine => Look::StartLine,
1119 Anchor::EndLine => Look::EndLine,
1120 Anchor::StartText => Look::StartText,
1121 Anchor::EndText => Look::EndText,
1122 };
1123 let id = self.add_look(look)?;
1124 Ok(ThompsonRef { start: id, end: id })
1125 }
1126
1127 fn c_word_boundary(
1128 &self,
1129 wb: &WordBoundary,
1130 ) -> Result<ThompsonRef, Error> {
1131 let look = match *wb {
1132 WordBoundary::Unicode => Look::WordBoundaryUnicode,
1133 WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
1134 WordBoundary::Ascii => Look::WordBoundaryAscii,
1135 WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
1136 };
1137 let id = self.add_look(look)?;
1138 Ok(ThompsonRef { start: id, end: id })
1139 }
1140
1141 fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
1142 let mut buf = [0; 4];
1143 let it = ch
1144 .encode_utf8(&mut buf)
1145 .as_bytes()
1146 .iter()
1147 .map(|&b| self.c_range(b, b));
1148 self.c_concat(it)
1149 }
1150
1151 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
1152 let id = self.add_range(start, end)?;
1153 Ok(ThompsonRef { start: id, end: id })
1154 }
1155
1156 fn c_empty(&self) -> Result<ThompsonRef, Error> {
1157 let id = self.add_empty()?;
1158 Ok(ThompsonRef { start: id, end: id })
1159 }
1160
1161 fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
1162 self.c_at_least(&Hir::any(false), false, 0)
1163 }
1164
1165 fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
1166 self.c_at_least(&Hir::any(true), false, 0)
1167 }
1168
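/// Add a transition from the state `from` to the state `to`.
///
/// For `Union` and `UnionReverse` states, this pushes `to` as a new
/// alternate (and accounts for the additional heap usage). For `Match`
/// states this is a no-op, and for `Sparse` states it panics, since sparse
/// states cannot be patched. For every other state, the `next` transition
/// is overwritten with `to`.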
1169 fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
1170 let old_memory_cstates = self.memory_cstates.get();
1171 match self.states.borrow_mut()[from] {
1172 CState::Empty { ref mut next } => {
1173 *next = to;
1174 }
1175 CState::Range { ref mut range } => {
1176 range.next = to;
1177 }
1178 CState::Sparse { .. } => {
1179 panic!("cannot patch from a sparse NFA state")
1180 }
1181 CState::Look { ref mut next, .. } => {
1182 *next = to;
1183 }
1184 CState::Union { ref mut alternates } => {
1185 alternates.push(to);
1186 self.memory_cstates
1187 .set(old_memory_cstates + mem::size_of::<StateID>());
1188 }
1189 CState::UnionReverse { ref mut alternates } => {
1190 alternates.push(to);
1191 self.memory_cstates
1192 .set(old_memory_cstates + mem::size_of::<StateID>());
1193 }
1194 CState::CaptureStart { ref mut next, .. } => {
1195 *next = to;
1196 }
1197 CState::CaptureEnd { ref mut next, .. } => {
1198 *next = to;
1199 }
1200 CState::Match { .. } => {}
1201 }
1202 if old_memory_cstates != self.memory_cstates.get() {
1203 self.check_nfa_size_limit()?;
1204 }
1205 Ok(())
1206 }
1207
1208 fn add_empty(&self) -> Result<StateID, Error> {
1209 self.add_state(CState::Empty { next: StateID::ZERO })
1210 }
1211
1212 fn add_capture_start(
1213 &self,
1214 capture_index: u32,
1215 name: Option<Arc<str>>,
1216 ) -> Result<StateID, Error> {
1217 self.add_state(CState::CaptureStart {
1218 next: StateID::ZERO,
1219 capture_index,
1220 name,
1221 })
1222 }
1223
1224 fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
1225 self.add_state(CState::CaptureEnd {
1226 next: StateID::ZERO,
1227 capture_index,
1228 })
1229 }
1230
1231 fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
1232 let trans = Transition { start, end, next: StateID::ZERO };
1233 self.add_state(CState::Range { range: trans })
1234 }
1235
1236 fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
1237 if ranges.len() == 1 {
1238 self.add_state(CState::Range { range: ranges[0] })
1239 } else {
1240 self.add_state(CState::Sparse { ranges })
1241 }
1242 }
1243
1244 fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
1245 if self.is_reverse() {
1246 look = look.reversed();
1247 }
1248 self.add_state(CState::Look { look, next: StateID::ZERO })
1249 }
1250
1251 fn add_union(&self) -> Result<StateID, Error> {
1252 self.add_state(CState::Union { alternates: vec![] })
1253 }
1254
1255 fn add_reverse_union(&self) -> Result<StateID, Error> {
1256 self.add_state(CState::UnionReverse { alternates: vec![] })
1257 }
1258
1259 fn add_match(
1260 &self,
1261 pattern_id: PatternID,
1262 start_id: StateID,
1263 ) -> Result<StateID, Error> {
1264 self.add_state(CState::Match { pattern_id, start_id })
1265 }
1266
1267 fn add_state(&self, state: CState) -> Result<StateID, Error> {
1268 let mut states = self.states.borrow_mut();
1269 let id = StateID::new(states.len())
1270 .map_err(|_| Error::too_many_states(states.len()))?;
1271 self.memory_cstates
1272 .set(self.memory_cstates.get() + state.memory_usage());
1273 states.push(state);
1274 // If we don't explicitly drop this, then 'nfa_memory_usage' will also
1275 // try to borrow it when we check the size limit and hit an error.
1276 drop(states);
1277 self.check_nfa_size_limit()?;
1278 Ok(id)
1279 }
1280
1281 fn is_reverse(&self) -> bool {
1282 self.config.get_reverse()
1283 }
1284
1285 /// If an NFA size limit was set, this checks that the NFA compiled so far
1286 /// fits within that limit. If so, then nothing is returned. Otherwise, an
1287 /// error is returned.
1288 ///
1289 /// This should be called after increasing the heap usage of the
1290 /// intermediate NFA.
1291 ///
1292 /// Note that this borrows 'self.states', so callers should ensure there is
1293 /// no mutable borrow of it outstanding.
1294 fn check_nfa_size_limit(&self) -> Result<(), Error> {
1295 if let Some(limit) = self.config.get_nfa_size_limit() {
1296 if self.nfa_memory_usage() > limit {
1297 return Err(Error::exceeded_size_limit(limit));
1298 }
1299 }
1300 Ok(())
1301 }
1302
1303 /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
1304 ///
1305 /// Note that this is an approximation of how big the final NFA will be.
1306 /// In practice, the final NFA will likely be a bit smaller since it uses
1307 /// things like `Box<[T]>` instead of `Vec<T>`.
1308 fn nfa_memory_usage(&self) -> usize {
1309 self.states.borrow().len() * mem::size_of::<CState>()
1310 + self.memory_cstates.get()
1311 }
1312 }
1313
1314 impl CState {
1315 fn memory_usage(&self) -> usize {
1316 match *self {
1317 CState::Empty { .. }
1318 | CState::Range { .. }
1319 | CState::Look { .. }
1320 | CState::CaptureStart { .. }
1321 | CState::CaptureEnd { .. }
1322 | CState::Match { .. } => 0,
1323 CState::Sparse { ref ranges } => {
1324 ranges.len() * mem::size_of::<Transition>()
1325 }
1326 CState::Union { ref alternates } => {
1327 alternates.len() * mem::size_of::<StateID>()
1328 }
1329 CState::UnionReverse { ref alternates } => {
1330 alternates.len() * mem::size_of::<StateID>()
1331 }
1332 }
1333 }
1334 }
1335
1336 #[derive(Debug)]
1337 struct Utf8Compiler<'a> {
1338 nfac: &'a Compiler,
1339 state: &'a mut Utf8State,
1340 target: StateID,
1341 }
1342
1343 #[derive(Clone, Debug)]
1344 struct Utf8State {
1345 compiled: Utf8BoundedMap,
1346 uncompiled: Vec<Utf8Node>,
1347 }
1348
1349 #[derive(Clone, Debug)]
1350 struct Utf8Node {
1351 trans: Vec<Transition>,
1352 last: Option<Utf8LastTransition>,
1353 }
1354
1355 #[derive(Clone, Debug)]
1356 struct Utf8LastTransition {
1357 start: u8,
1358 end: u8,
1359 }
1360
1361 impl Utf8State {
1362 fn new() -> Utf8State {
1363 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1364 }
1365
1366 fn clear(&mut self) {
1367 self.compiled.clear();
1368 self.uncompiled.clear();
1369 }
1370 }
1371
1372 impl<'a> Utf8Compiler<'a> {
1373 fn new(
1374 nfac: &'a Compiler,
1375 state: &'a mut Utf8State,
1376 ) -> Result<Utf8Compiler<'a>, Error> {
1377 let target = nfac.add_empty()?;
1378 state.clear();
1379 let mut utf8c = Utf8Compiler { nfac, state, target };
1380 utf8c.add_empty();
1381 Ok(utf8c)
1382 }
1383
1384 fn finish(&mut self) -> Result<ThompsonRef, Error> {
1385 self.compile_from(0)?;
1386 let node = self.pop_root();
1387 let start = self.compile(node)?;
1388 Ok(ThompsonRef { start, end: self.target })
1389 }
1390
1391 fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
1392 let prefix_len = ranges
1393 .iter()
1394 .zip(&self.state.uncompiled)
1395 .take_while(|&(range, node)| {
1396 node.last.as_ref().map_or(false, |t| {
1397 (t.start, t.end) == (range.start, range.end)
1398 })
1399 })
1400 .count();
1401 assert!(prefix_len < ranges.len());
1402 self.compile_from(prefix_len)?;
1403 self.add_suffix(&ranges[prefix_len..]);
1404 Ok(())
1405 }
1406
1407 fn compile_from(&mut self, from: usize) -> Result<(), Error> {
1408 let mut next = self.target;
1409 while from + 1 < self.state.uncompiled.len() {
1410 let node = self.pop_freeze(next);
1411 next = self.compile(node)?;
1412 }
1413 self.top_last_freeze(next);
1414 Ok(())
1415 }
1416
1417 fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
1418 let hash = self.state.compiled.hash(&node);
1419 if let Some(id) = self.state.compiled.get(&node, hash) {
1420 return Ok(id);
1421 }
1422 let id = self.nfac.add_sparse(node.clone())?;
1423 self.state.compiled.set(node, hash, id);
1424 Ok(id)
1425 }
1426
1427 fn add_suffix(&mut self, ranges: &[Utf8Range]) {
1428 assert!(!ranges.is_empty());
1429 let last = self
1430 .state
1431 .uncompiled
1432 .len()
1433 .checked_sub(1)
1434 .expect("non-empty nodes");
1435 assert!(self.state.uncompiled[last].last.is_none());
1436 self.state.uncompiled[last].last = Some(Utf8LastTransition {
1437 start: ranges[0].start,
1438 end: ranges[0].end,
1439 });
1440 for r in &ranges[1..] {
1441 self.state.uncompiled.push(Utf8Node {
1442 trans: vec![],
1443 last: Some(Utf8LastTransition { start: r.start, end: r.end }),
1444 });
1445 }
1446 }
1447
1448 fn add_empty(&mut self) {
1449 self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
1450 }
1451
1452 fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
1453 let mut uncompiled = self.state.uncompiled.pop().unwrap();
1454 uncompiled.set_last_transition(next);
1455 uncompiled.trans
1456 }
1457
1458 fn pop_root(&mut self) -> Vec<Transition> {
1459 assert_eq!(self.state.uncompiled.len(), 1);
1460 assert!(self.state.uncompiled[0].last.is_none());
1461 self.state.uncompiled.pop().expect("non-empty nodes").trans
1462 }
1463
1464 fn top_last_freeze(&mut self, next: StateID) {
1465 let last = self
1466 .state
1467 .uncompiled
1468 .len()
1469 .checked_sub(1)
1470 .expect("non-empty nodes");
1471 self.state.uncompiled[last].set_last_transition(next);
1472 }
1473 }
1474
1475 impl Utf8Node {
1476 fn set_last_transition(&mut self, next: StateID) {
1477 if let Some(last) = self.last.take() {
1478 self.trans.push(Transition {
1479 start: last.start,
1480 end: last.end,
1481 next,
1482 });
1483 }
1484 }
1485 }
1486
1487 #[cfg(test)]
1488 mod tests {
1489 use alloc::vec::Vec;
1490
1491 use super::{
1492 Builder, Config, PatternID, SparseTransitions, State, StateID,
1493 Transition, NFA,
1494 };
1495
1496 fn build(pattern: &str) -> NFA {
1497 Builder::new()
1498 .configure(Config::new().captures(false).unanchored_prefix(false))
1499 .build(pattern)
1500 .unwrap()
1501 }
1502
1503 fn pid(id: usize) -> PatternID {
1504 PatternID::new(id).unwrap()
1505 }
1506
1507 fn sid(id: usize) -> StateID {
1508 StateID::new(id).unwrap()
1509 }
1510
1511 fn s_byte(byte: u8, next: usize) -> State {
1512 let next = sid(next);
1513 let trans = Transition { start: byte, end: byte, next };
1514 State::Range { range: trans }
1515 }
1516
1517 fn s_range(start: u8, end: u8, next: usize) -> State {
1518 let next = sid(next);
1519 let trans = Transition { start, end, next };
1520 State::Range { range: trans }
1521 }
1522
1523 fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
1524 let ranges = ranges
1525 .iter()
1526 .map(|&(start, end, next)| Transition {
1527 start,
1528 end,
1529 next: sid(next),
1530 })
1531 .collect();
1532 State::Sparse(SparseTransitions { ranges })
1533 }
1534
1535 fn s_union(alts: &[usize]) -> State {
1536 State::Union {
1537 alternates: alts
1538 .iter()
1539 .map(|&id| sid(id))
1540 .collect::<Vec<StateID>>()
1541 .into_boxed_slice(),
1542 }
1543 }
1544
1545 fn s_match(id: usize) -> State {
1546 State::Match { id: pid(id) }
1547 }
1548
1549 // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
1550 // prefix.
1551 #[test]
1552 fn compile_unanchored_prefix() {
1553 // When the machine can only match valid UTF-8.
1554 let nfa = Builder::new()
1555 .configure(Config::new().captures(false))
1556 .build(r"a")
1557 .unwrap();
1558 // There should be many states since the `.` in `(?s:.)*?` matches any
1559 // Unicode scalar value.
1560 assert_eq!(11, nfa.len());
1561 assert_eq!(nfa.states[10], s_match(0));
1562 assert_eq!(nfa.states[9], s_byte(b'a', 10));
1563
1564 // When the machine can match through invalid UTF-8.
1565 let nfa = Builder::new()
1566 .configure(Config::new().captures(false).utf8(false))
1567 .build(r"a")
1568 .unwrap();
1569 assert_eq!(
1570 nfa.states,
1571 &[
1572 s_union(&[2, 1]),
1573 s_range(0, 255, 0),
1574 s_byte(b'a', 3),
1575 s_match(0),
1576 ]
1577 );
1578 }
1579
1580 #[test]
1581 fn compile_empty() {
1582 assert_eq!(build("").states, &[s_match(0),]);
1583 }
1584
1585 #[test]
1586 fn compile_literal() {
1587 assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
1588 assert_eq!(
1589 build("ab").states,
1590 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
1591 );
1592 assert_eq!(
1593 build("ā˜ƒ").states,
1594 &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
1595 );
1596
1597 // Check that non-UTF-8 literals work.
1598 let nfa = Builder::new()
1599 .configure(
1600 Config::new()
1601 .captures(false)
1602 .utf8(false)
1603 .unanchored_prefix(false),
1604 )
1605 .syntax(crate::SyntaxConfig::new().utf8(false))
1606 .build(r"(?-u)\xFF")
1607 .unwrap();
1608 assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
1609 }
1610
1611 #[test]
1612 fn compile_class() {
1613 assert_eq!(
1614 build(r"[a-z]").states,
1615 &[s_range(b'a', b'z', 1), s_match(0),]
1616 );
1617 assert_eq!(
1618 build(r"[x-za-c]").states,
1619 &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
1620 );
1621 assert_eq!(
1622 build(r"[\u03B1-\u03B4]").states,
1623 &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
1624 );
1625 assert_eq!(
1626 build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1627 &[
1628 s_range(0xB1, 0xB4, 5),
1629 s_range(0x99, 0x9E, 5),
1630 s_byte(0xA4, 1),
1631 s_byte(0x9F, 2),
1632 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1633 s_match(0),
1634 ]
1635 );
1636 assert_eq!(
1637 build(r"[a-zā˜ƒ]").states,
1638 &[
1639 s_byte(0x83, 3),
1640 s_byte(0x98, 0),
1641 s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1642 s_match(0),
1643 ]
1644 );
1645 }
1646
1647 #[test]
1648 fn compile_repetition() {
1649 assert_eq!(
1650 build(r"a?").states,
1651 &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
1652 );
1653 assert_eq!(
1654 build(r"a??").states,
1655 &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
1656 );
1657 }
1658
1659 #[test]
1660 fn compile_group() {
1661 assert_eq!(
1662 build(r"ab+").states,
1663 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
1664 );
1665 assert_eq!(
1666 build(r"(ab)").states,
1667 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
1668 );
1669 assert_eq!(
1670 build(r"(ab)+").states,
1671 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
1672 );
1673 }
1674
1675 #[test]
1676 fn compile_alternation() {
1677 assert_eq!(
1678 build(r"a|b").states,
1679 &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
1680 );
1681 assert_eq!(
1682 build(r"|b").states,
1683 &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
1684 );
1685 assert_eq!(
1686 build(r"a|").states,
1687 &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
1688 );
1689 }
1690
1691 #[test]
1692 fn many_start_pattern() {
1693 let nfa = Builder::new()
1694 .configure(Config::new().captures(false).unanchored_prefix(false))
1695 .build_many(&["a", "b"])
1696 .unwrap();
1697 assert_eq!(
1698 nfa.states,
1699 &[
1700 s_byte(b'a', 1),
1701 s_match(0),
1702 s_byte(b'b', 3),
1703 s_match(1),
1704 s_union(&[0, 2]),
1705 ]
1706 );
1707 assert_eq!(nfa.start_anchored().as_usize(), 4);
1708 assert_eq!(nfa.start_unanchored().as_usize(), 4);
1709 // Test that the start states for each individual pattern are correct.
1710 assert_eq!(nfa.start_pattern(pid(0)), sid(0));
1711 assert_eq!(nfa.start_pattern(pid(1)), sid(2));
1712 }
1713 }