1 use std
::collections
::HashMap
;
7 use regex_syntax
::hir
::{self, Hir}
;
8 use regex_syntax
::is_word_byte
;
9 use regex_syntax
::utf8
::{Utf8Range, Utf8Sequence, Utf8Sequences}
;
12 EmptyLook
, Inst
, InstBytes
, InstChar
, InstEmptyLook
, InstPtr
, InstRanges
,
13 InstSave
, InstSplit
, Program
,
18 type Result
= result
::Result
<Patch
, Error
>;
19 type ResultOrEmpty
= result
::Result
<Option
<Patch
>, Error
>;
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
31 #[allow(missing_debug_implementations)]
33 insts
: Vec
<MaybeInst
>,
35 capture_name_idx
: HashMap
<String
, usize>,
38 suffix_cache
: SuffixCache
,
39 utf8_seqs
: Option
<Utf8Sequences
>,
40 byte_classes
: ByteClassSet
,
41 // This keeps track of extra bytes allocated while compiling the regex
42 // program. Currently, this corresponds to two things. First is the heap
43 // memory allocated by Unicode character classes ('InstRanges'). Second is
44 // a "fake" amount of memory used by empty sub-expressions, so that enough
45 // empty sub-expressions will ultimately trigger the compiler to bail
46 // because of a size limit restriction. (That empty sub-expressions don't
47 // add to heap memory usage is more-or-less an implementation detail.) In
48 // the second case, if we don't bail, then an excessively large repetition
49 // on an empty sub-expression can result in the compiler using a very large
50 // amount of CPU time.
51 extra_inst_bytes
: usize,
55 /// Create a new regular expression compiler.
57 /// Various options can be set before calling `compile` on an expression.
58 pub fn new() -> Self {
61 compiled
: Program
::new(),
62 capture_name_idx
: HashMap
::new(),
64 size_limit
: 10 * (1 << 20),
65 suffix_cache
: SuffixCache
::new(1000),
66 utf8_seqs
: Some(Utf8Sequences
::new('
\x00'
, '
\x00'
)),
67 byte_classes
: ByteClassSet
::new(),
72 /// The size of the resulting program is limited by size_limit. If
73 /// the program approximately exceeds the given size (in bytes), then
74 /// compilation will stop and return an error.
75 pub fn size_limit(mut self, size_limit
: usize) -> Self {
76 self.size_limit
= size_limit
;
80 /// If bytes is true, then the program is compiled as a byte based
81 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82 /// false, then the automaton is Unicode scalar value based, e.g., an
83 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
85 /// The specific invariant is that when returning a byte based machine,
86 /// the neither the `Char` nor `Ranges` instructions are produced.
87 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88 /// instruction is never produced.
90 /// Note that `dfa(true)` implies `bytes(true)`.
91 pub fn bytes(mut self, yes
: bool
) -> Self {
92 self.compiled
.is_bytes
= yes
;
96 /// When disabled, the program compiled may match arbitrary bytes.
98 /// When enabled (the default), all compiled programs exclusively match
99 /// valid UTF-8 bytes.
100 pub fn only_utf8(mut self, yes
: bool
) -> Self {
101 self.compiled
.only_utf8
= yes
;
105 /// When set, the machine returned is suitable for use in the DFA matching
108 /// In particular, this ensures that if the regex is not anchored in the
109 /// beginning, then a preceding `.*?` is included in the program. (The NFA
110 /// based engines handle the preceding `.*?` explicitly, which is difficult
111 /// or impossible in the DFA engine.)
112 pub fn dfa(mut self, yes
: bool
) -> Self {
113 self.compiled
.is_dfa
= yes
;
117 /// When set, the machine returned is suitable for matching text in
118 /// reverse. In particular, all concatenations are flipped.
119 pub fn reverse(mut self, yes
: bool
) -> Self {
120 self.compiled
.is_reverse
= yes
;
124 /// Compile a regular expression given its AST.
126 /// The compiler is guaranteed to succeed unless the program exceeds the
127 /// specified size limit. If the size limit is exceeded, then compilation
128 /// stops and returns an error.
129 pub fn compile(mut self, exprs
: &[Hir
]) -> result
::Result
<Program
, Error
> {
130 debug_assert
!(!exprs
.is_empty());
131 self.num_exprs
= exprs
.len();
132 if exprs
.len() == 1 {
133 self.compile_one(&exprs
[0])
135 self.compile_many(exprs
)
139 fn compile_one(mut self, expr
: &Hir
) -> result
::Result
<Program
, Error
> {
140 // If we're compiling a forward DFA and we aren't anchored, then
141 // add a `.*?` before the first capture group.
142 // Other matching engines handle this by baking the logic into the
143 // matching engine itself.
144 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
145 self.compiled
.is_anchored_start
= expr
.is_anchored_start();
146 self.compiled
.is_anchored_end
= expr
.is_anchored_end();
147 if self.compiled
.needs_dotstar() {
148 dotstar_patch
= self.c_dotstar()?
;
149 self.compiled
.start
= dotstar_patch
.entry
;
151 self.compiled
.captures
= vec
![None
];
152 let patch
= self.c_capture(0, expr
)?
.unwrap_or(self.next_inst());
153 if self.compiled
.needs_dotstar() {
154 self.fill(dotstar_patch
.hole
, patch
.entry
);
156 self.compiled
.start
= patch
.entry
;
158 self.fill_to_next(patch
.hole
);
159 self.compiled
.matches
= vec
![self.insts
.len()];
160 self.push_compiled(Inst
::Match(0));
161 self.compile_finish()
167 ) -> result
::Result
<Program
, Error
> {
168 debug_assert
!(exprs
.len() > 1);
170 self.compiled
.is_anchored_start
=
171 exprs
.iter().all(|e
| e
.is_anchored_start());
172 self.compiled
.is_anchored_end
=
173 exprs
.iter().all(|e
| e
.is_anchored_end());
174 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
175 if self.compiled
.needs_dotstar() {
176 dotstar_patch
= self.c_dotstar()?
;
177 self.compiled
.start
= dotstar_patch
.entry
;
179 self.compiled
.start
= 0; // first instruction is always split
181 self.fill_to_next(dotstar_patch
.hole
);
183 let mut prev_hole
= Hole
::None
;
184 for (i
, expr
) in exprs
[0..exprs
.len() - 1].iter().enumerate() {
185 self.fill_to_next(prev_hole
);
186 let split
= self.push_split_hole();
187 let Patch { hole, entry }
=
188 self.c_capture(0, expr
)?
.unwrap_or(self.next_inst());
189 self.fill_to_next(hole
);
190 self.compiled
.matches
.push(self.insts
.len());
191 self.push_compiled(Inst
::Match(i
));
192 prev_hole
= self.fill_split(split
, Some(entry
), None
);
194 let i
= exprs
.len() - 1;
195 let Patch { hole, entry }
=
196 self.c_capture(0, &exprs
[i
])?
.unwrap_or(self.next_inst());
197 self.fill(prev_hole
, entry
);
198 self.fill_to_next(hole
);
199 self.compiled
.matches
.push(self.insts
.len());
200 self.push_compiled(Inst
::Match(i
));
201 self.compile_finish()
204 fn compile_finish(mut self) -> result
::Result
<Program
, Error
> {
205 self.compiled
.insts
=
206 self.insts
.into_iter().map(|inst
| inst
.unwrap()).collect();
207 self.compiled
.byte_classes
= self.byte_classes
.byte_classes();
208 self.compiled
.capture_name_idx
= Arc
::new(self.capture_name_idx
);
212 /// Compile expr into self.insts, returning a patch on success,
213 /// or an error if we run out of memory.
215 /// All of the c_* methods of the compiler share the contract outlined
218 /// The main thing that a c_* method does is mutate `self.insts`
219 /// to add a list of mostly compiled instructions required to execute
220 /// the given expression. `self.insts` contains MaybeInsts rather than
221 /// Insts because there is some backpatching required.
223 /// The `Patch` value returned by each c_* method provides metadata
224 /// about the compiled instructions emitted to `self.insts`. The
225 /// `entry` member of the patch refers to the first instruction
226 /// (the entry point), while the `hole` member contains zero or
227 /// more offsets to partial instructions that need to be backpatched.
228 /// The c_* routine can't know where its list of instructions are going to
229 /// jump to after execution, so it is up to the caller to patch
230 /// these jumps to point to the right place. So compiling some
231 /// expression, e, we would end up with a situation that looked like:
234 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
241 /// To compile two expressions, e1 and e2, concatenated together we
245 /// let patch1 = self.c(e1);
246 /// let patch2 = self.c(e2);
249 /// while leaves us with a situation that looks like
252 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
255 /// entry1 hole1 entry2 hole2
258 /// Then to merge the two patches together into one we would backpatch
259 /// hole1 with entry2 and return a new patch that enters at entry1
260 /// and has hole2 for a hole. In fact, if you look at the c_concat
261 /// method you will see that it does exactly this, though it handles
262 /// a list of expressions rather than just the two that we use for
265 /// Ok(None) is returned when an expression is compiled to no
266 /// instruction, and so no patch.entry value makes sense.
267 fn c(&mut self, expr
: &Hir
) -> ResultOrEmpty
{
269 use regex_syntax
::hir
::HirKind
::*;
273 Empty
=> self.c_empty(),
274 Literal(hir
::Literal
::Unicode(c
)) => self.c_char(c
),
275 Literal(hir
::Literal
::Byte(b
)) => {
276 assert
!(self.compiled
.uses_bytes());
279 Class(hir
::Class
::Unicode(ref cls
)) => self.c_class(cls
.ranges()),
280 Class(hir
::Class
::Bytes(ref cls
)) => {
281 if self.compiled
.uses_bytes() {
282 self.c_class_bytes(cls
.ranges())
284 assert
!(cls
.is_all_ascii());
285 let mut char_ranges
= vec
![];
286 for r
in cls
.iter() {
287 let (s
, e
) = (r
.start() as char, r
.end() as char);
288 char_ranges
.push(hir
::ClassUnicodeRange
::new(s
, e
));
290 self.c_class(&char_ranges
)
293 Anchor(hir
::Anchor
::StartLine
) if self.compiled
.is_reverse
=> {
294 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
295 self.c_empty_look(prog
::EmptyLook
::EndLine
)
297 Anchor(hir
::Anchor
::StartLine
) => {
298 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
299 self.c_empty_look(prog
::EmptyLook
::StartLine
)
301 Anchor(hir
::Anchor
::EndLine
) if self.compiled
.is_reverse
=> {
302 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
303 self.c_empty_look(prog
::EmptyLook
::StartLine
)
305 Anchor(hir
::Anchor
::EndLine
) => {
306 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
307 self.c_empty_look(prog
::EmptyLook
::EndLine
)
309 Anchor(hir
::Anchor
::StartText
) if self.compiled
.is_reverse
=> {
310 self.c_empty_look(prog
::EmptyLook
::EndText
)
312 Anchor(hir
::Anchor
::StartText
) => {
313 self.c_empty_look(prog
::EmptyLook
::StartText
)
315 Anchor(hir
::Anchor
::EndText
) if self.compiled
.is_reverse
=> {
316 self.c_empty_look(prog
::EmptyLook
::StartText
)
318 Anchor(hir
::Anchor
::EndText
) => {
319 self.c_empty_look(prog
::EmptyLook
::EndText
)
321 WordBoundary(hir
::WordBoundary
::Unicode
) => {
322 if !cfg
!(feature
= "unicode-perl") {
323 return Err(Error
::Syntax(
324 "Unicode word boundaries are unavailable when \
325 the unicode-perl feature is disabled"
329 self.compiled
.has_unicode_word_boundary
= true;
330 self.byte_classes
.set_word_boundary();
331 // We also make sure that all ASCII bytes are in a different
332 // class from non-ASCII bytes. Otherwise, it's possible for
333 // ASCII bytes to get lumped into the same class as non-ASCII
334 // bytes. This in turn may cause the lazy DFA to falsely start
335 // when it sees an ASCII byte that maps to a byte class with
336 // non-ASCII bytes. This ensures that never happens.
337 self.byte_classes
.set_range(0, 0x7F);
338 self.c_empty_look(prog
::EmptyLook
::WordBoundary
)
340 WordBoundary(hir
::WordBoundary
::UnicodeNegate
) => {
341 if !cfg
!(feature
= "unicode-perl") {
342 return Err(Error
::Syntax(
343 "Unicode word boundaries are unavailable when \
344 the unicode-perl feature is disabled"
348 self.compiled
.has_unicode_word_boundary
= true;
349 self.byte_classes
.set_word_boundary();
350 // See comments above for why we set the ASCII range here.
351 self.byte_classes
.set_range(0, 0x7F);
352 self.c_empty_look(prog
::EmptyLook
::NotWordBoundary
)
354 WordBoundary(hir
::WordBoundary
::Ascii
) => {
355 self.byte_classes
.set_word_boundary();
356 self.c_empty_look(prog
::EmptyLook
::WordBoundaryAscii
)
358 WordBoundary(hir
::WordBoundary
::AsciiNegate
) => {
359 self.byte_classes
.set_word_boundary();
360 self.c_empty_look(prog
::EmptyLook
::NotWordBoundaryAscii
)
362 Group(ref g
) => match g
.kind
{
363 hir
::GroupKind
::NonCapturing
=> self.c(&g
.hir
),
364 hir
::GroupKind
::CaptureIndex(index
) => {
365 if index
as usize >= self.compiled
.captures
.len() {
366 self.compiled
.captures
.push(None
);
368 self.c_capture(2 * index
as usize, &g
.hir
)
370 hir
::GroupKind
::CaptureName { index, ref name }
=> {
371 if index
as usize >= self.compiled
.captures
.len() {
372 let n
= name
.to_string();
373 self.compiled
.captures
.push(Some(n
.clone()));
374 self.capture_name_idx
.insert(n
, index
as usize);
376 self.c_capture(2 * index
as usize, &g
.hir
)
380 if self.compiled
.is_reverse
{
381 self.c_concat(es
.iter().rev())
386 Alternation(ref es
) => self.c_alternate(&**es
),
387 Repetition(ref rep
) => self.c_repeat(rep
),
391 fn c_empty(&mut self) -> ResultOrEmpty
{
392 // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
393 // See: CVE-2022-24713
395 // Since 'empty' sub-expressions don't increase the size of
396 // the actual compiled object, we "fake" an increase in its
397 // size so that our 'check_size_limit' routine will eventually
398 // stop compilation if there are too many empty sub-expressions
399 // (e.g., via a large repetition).
400 self.extra_inst_bytes
+= std
::mem
::size_of
::<Inst
>();
404 fn c_capture(&mut self, first_slot
: usize, expr
: &Hir
) -> ResultOrEmpty
{
405 if self.num_exprs
> 1 || self.compiled
.is_dfa
{
406 // Don't ever compile Save instructions for regex sets because
407 // they are never used. They are also never used in DFA programs
408 // because DFAs can't handle captures.
411 let entry
= self.insts
.len();
412 let hole
= self.push_hole(InstHole
::Save { slot: first_slot }
);
413 let patch
= self.c(expr
)?
.unwrap_or(self.next_inst());
414 self.fill(hole
, patch
.entry
);
415 self.fill_to_next(patch
.hole
);
416 let hole
= self.push_hole(InstHole
::Save { slot: first_slot + 1 }
);
417 Ok(Some(Patch { hole: hole, entry: entry }
))
421 fn c_dotstar(&mut self) -> Result
{
422 Ok(if !self.compiled
.only_utf8() {
423 self.c(&Hir
::repetition(hir
::Repetition
{
424 kind
: hir
::RepetitionKind
::ZeroOrMore
,
426 hir
: Box
::new(Hir
::any(true)),
430 self.c(&Hir
::repetition(hir
::Repetition
{
431 kind
: hir
::RepetitionKind
::ZeroOrMore
,
433 hir
: Box
::new(Hir
::any(false)),
439 fn c_char(&mut self, c
: char) -> ResultOrEmpty
{
440 if self.compiled
.uses_bytes() {
444 self.push_hole(InstHole
::Bytes { start: b, end: b }
);
445 self.byte_classes
.set_range(b
, b
);
446 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }
))
448 self.c_class(&[hir
::ClassUnicodeRange
::new(c
, c
)])
451 let hole
= self.push_hole(InstHole
::Char { c: c }
);
452 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }
))
456 fn c_class(&mut self, ranges
: &[hir
::ClassUnicodeRange
]) -> ResultOrEmpty
{
457 use std
::mem
::size_of
;
459 assert
!(!ranges
.is_empty());
460 if self.compiled
.uses_bytes() {
461 Ok(Some(CompileClass { c: self, ranges: ranges }
.compile()?
))
463 let ranges
: Vec
<(char, char)> =
464 ranges
.iter().map(|r
| (r
.start(), r
.end())).collect();
465 let hole
= if ranges
.len() == 1 && ranges
[0].0 == ranges
[0].1 {
466 self.push_hole(InstHole
::Char { c: ranges[0].0 }
)
468 self.extra_inst_bytes
+=
469 ranges
.len() * (size_of
::<char>() * 2);
470 self.push_hole(InstHole
::Ranges { ranges: ranges }
)
472 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }
))
476 fn c_byte(&mut self, b
: u8) -> ResultOrEmpty
{
477 self.c_class_bytes(&[hir
::ClassBytesRange
::new(b
, b
)])
482 ranges
: &[hir
::ClassBytesRange
],
484 debug_assert
!(!ranges
.is_empty());
486 let first_split_entry
= self.insts
.len();
487 let mut holes
= vec
![];
488 let mut prev_hole
= Hole
::None
;
489 for r
in &ranges
[0..ranges
.len() - 1] {
490 self.fill_to_next(prev_hole
);
491 let split
= self.push_split_hole();
492 let next
= self.insts
.len();
493 self.byte_classes
.set_range(r
.start(), r
.end());
494 holes
.push(self.push_hole(InstHole
::Bytes
{
498 prev_hole
= self.fill_split(split
, Some(next
), None
);
500 let next
= self.insts
.len();
501 let r
= &ranges
[ranges
.len() - 1];
502 self.byte_classes
.set_range(r
.start(), r
.end());
504 self.push_hole(InstHole
::Bytes { start: r.start(), end: r.end() }
),
506 self.fill(prev_hole
, next
);
507 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }
))
510 fn c_empty_look(&mut self, look
: EmptyLook
) -> ResultOrEmpty
{
511 let hole
= self.push_hole(InstHole
::EmptyLook { look: look }
);
512 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }
))
515 fn c_concat
<'a
, I
>(&mut self, exprs
: I
) -> ResultOrEmpty
517 I
: IntoIterator
<Item
= &'a Hir
>,
519 let mut exprs
= exprs
.into_iter();
520 let Patch { mut hole, entry }
= loop {
522 None
=> return self.c_empty(),
524 if let Some(p
) = self.c(e
)?
{
531 if let Some(p
) = self.c(e
)?
{
532 self.fill(hole
, p
.entry
);
536 Ok(Some(Patch { hole: hole, entry: entry }
))
539 fn c_alternate(&mut self, exprs
: &[Hir
]) -> ResultOrEmpty
{
542 "alternates must have at least 2 exprs"
545 // Initial entry point is always the first split.
546 let first_split_entry
= self.insts
.len();
548 // Save up all of the holes from each alternate. They will all get
549 // patched to point to the same location.
550 let mut holes
= vec
![];
552 // true indicates that the hole is a split where we want to fill
553 // the second branch.
554 let mut prev_hole
= (Hole
::None
, false);
555 for e
in &exprs
[0..exprs
.len() - 1] {
557 let next
= self.insts
.len();
558 self.fill_split(prev_hole
.0, None
, Some(next
));
560 self.fill_to_next(prev_hole
.0);
562 let split
= self.push_split_hole();
563 if let Some(Patch { hole, entry }
) = self.c(e
)?
{
565 prev_hole
= (self.fill_split(split
, Some(entry
), None
), false);
567 let (split1
, split2
) = split
.dup_one();
569 prev_hole
= (split2
, true);
572 if let Some(Patch { hole, entry }
) = self.c(&exprs
[exprs
.len() - 1])?
{
575 self.fill_split(prev_hole
.0, None
, Some(entry
));
577 self.fill(prev_hole
.0, entry
);
580 // We ignore prev_hole.1. When it's true, it means we have two
581 // empty branches both pushing prev_hole.0 into holes, so both
582 // branches will go to the same place anyway.
583 holes
.push(prev_hole
.0);
585 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }
))
588 fn c_repeat(&mut self, rep
: &hir
::Repetition
) -> ResultOrEmpty
{
589 use regex_syntax
::hir
::RepetitionKind
::*;
591 ZeroOrOne
=> self.c_repeat_zero_or_one(&rep
.hir
, rep
.greedy
),
592 ZeroOrMore
=> self.c_repeat_zero_or_more(&rep
.hir
, rep
.greedy
),
593 OneOrMore
=> self.c_repeat_one_or_more(&rep
.hir
, rep
.greedy
),
594 Range(hir
::RepetitionRange
::Exactly(min_max
)) => {
595 self.c_repeat_range(&rep
.hir
, rep
.greedy
, min_max
, min_max
)
597 Range(hir
::RepetitionRange
::AtLeast(min
)) => {
598 self.c_repeat_range_min_or_more(&rep
.hir
, rep
.greedy
, min
)
600 Range(hir
::RepetitionRange
::Bounded(min
, max
)) => {
601 self.c_repeat_range(&rep
.hir
, rep
.greedy
, min
, max
)
606 fn c_repeat_zero_or_one(
611 let split_entry
= self.insts
.len();
612 let split
= self.push_split_hole();
613 let Patch { hole: hole_rep, entry: entry_rep }
= match self.c(expr
)?
{
615 None
=> return self.pop_split_hole(),
617 let split_hole
= if greedy
{
618 self.fill_split(split
, Some(entry_rep
), None
)
620 self.fill_split(split
, None
, Some(entry_rep
))
622 let holes
= vec
![hole_rep
, split_hole
];
623 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }
))
626 fn c_repeat_zero_or_more(
631 let split_entry
= self.insts
.len();
632 let split
= self.push_split_hole();
633 let Patch { hole: hole_rep, entry: entry_rep }
= match self.c(expr
)?
{
635 None
=> return self.pop_split_hole(),
638 self.fill(hole_rep
, split_entry
);
639 let split_hole
= if greedy
{
640 self.fill_split(split
, Some(entry_rep
), None
)
642 self.fill_split(split
, None
, Some(entry_rep
))
644 Ok(Some(Patch { hole: split_hole, entry: split_entry }
))
647 fn c_repeat_one_or_more(
652 let Patch { hole: hole_rep, entry: entry_rep }
= match self.c(expr
)?
{
654 None
=> return Ok(None
),
656 self.fill_to_next(hole_rep
);
657 let split
= self.push_split_hole();
659 let split_hole
= if greedy
{
660 self.fill_split(split
, Some(entry_rep
), None
)
662 self.fill_split(split
, None
, Some(entry_rep
))
664 Ok(Some(Patch { hole: split_hole, entry: entry_rep }
))
667 fn c_repeat_range_min_or_more(
673 let min
= u32_to_usize(min
);
674 // Using next_inst() is ok, because we can't return it (concat would
675 // have to return Some(_) while c_repeat_range_min_or_more returns
677 let patch_concat
= self
678 .c_concat(iter
::repeat(expr
).take(min
))?
679 .unwrap_or(self.next_inst());
680 if let Some(patch_rep
) = self.c_repeat_zero_or_more(expr
, greedy
)?
{
681 self.fill(patch_concat
.hole
, patch_rep
.entry
);
682 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }
))
695 let (min
, max
) = (u32_to_usize(min
), u32_to_usize(max
));
696 debug_assert
!(min
<= max
);
697 let patch_concat
= self.c_concat(iter
::repeat(expr
).take(min
))?
;
699 return Ok(patch_concat
);
701 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
702 // max at this point).
703 let patch_concat
= patch_concat
.unwrap_or(self.next_inst());
704 let initial_entry
= patch_concat
.entry
;
705 // It is much simpler to compile, e.g., `a{2,5}` as:
709 // But you end up with a sequence of instructions like this:
721 // This is *incredibly* inefficient because the splits end
722 // up forming a chain, which has to be resolved everything a
723 // transition is followed.
724 let mut holes
= vec
![];
725 let mut prev_hole
= patch_concat
.hole
;
727 self.fill_to_next(prev_hole
);
728 let split
= self.push_split_hole();
729 let Patch { hole, entry }
= match self.c(expr
)?
{
731 None
=> return self.pop_split_hole(),
735 holes
.push(self.fill_split(split
, Some(entry
), None
));
737 holes
.push(self.fill_split(split
, None
, Some(entry
)));
740 holes
.push(prev_hole
);
741 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }
))
744 /// Can be used as a default value for the c_* functions when the call to
745 /// c_function is followed by inserting at least one instruction that is
746 /// always executed after the ones written by the c* function.
747 fn next_inst(&self) -> Patch
{
748 Patch { hole: Hole::None, entry: self.insts.len() }
751 fn fill(&mut self, hole
: Hole
, goto
: InstPtr
) {
755 self.insts
[pc
].fill(goto
);
757 Hole
::Many(holes
) => {
759 self.fill(hole
, goto
);
765 fn fill_to_next(&mut self, hole
: Hole
) {
766 let next
= self.insts
.len();
767 self.fill(hole
, next
);
773 goto1
: Option
<InstPtr
>,
774 goto2
: Option
<InstPtr
>,
777 Hole
::None
=> Hole
::None
,
778 Hole
::One(pc
) => match (goto1
, goto2
) {
779 (Some(goto1
), Some(goto2
)) => {
780 self.insts
[pc
].fill_split(goto1
, goto2
);
783 (Some(goto1
), None
) => {
784 self.insts
[pc
].half_fill_split_goto1(goto1
);
787 (None
, Some(goto2
)) => {
788 self.insts
[pc
].half_fill_split_goto2(goto2
);
791 (None
, None
) => unreachable
!(
792 "at least one of the split \
793 holes must be filled"
796 Hole
::Many(holes
) => {
797 let mut new_holes
= vec
![];
799 new_holes
.push(self.fill_split(hole
, goto1
, goto2
));
801 if new_holes
.is_empty() {
803 } else if new_holes
.len() == 1 {
804 new_holes
.pop().unwrap()
806 Hole
::Many(new_holes
)
812 fn push_compiled(&mut self, inst
: Inst
) {
813 self.insts
.push(MaybeInst
::Compiled(inst
));
816 fn push_hole(&mut self, inst
: InstHole
) -> Hole
{
817 let hole
= self.insts
.len();
818 self.insts
.push(MaybeInst
::Uncompiled(inst
));
822 fn push_split_hole(&mut self) -> Hole
{
823 let hole
= self.insts
.len();
824 self.insts
.push(MaybeInst
::Split
);
828 fn pop_split_hole(&mut self) -> ResultOrEmpty
{
833 fn check_size(&self) -> result
::Result
<(), Error
> {
834 use std
::mem
::size_of
;
837 self.extra_inst_bytes
+ (self.insts
.len() * size_of
::<Inst
>());
838 if size
> self.size_limit
{
839 Err(Error
::CompiledTooBig(self.size_limit
))
854 fn dup_one(self) -> (Self, Self) {
856 Hole
::One(pc
) => (Hole
::One(pc
), Hole
::One(pc
)),
857 Hole
::None
| Hole
::Many(_
) => {
858 unreachable
!("must be called on single hole")
864 #[derive(Clone, Debug)]
867 Uncompiled(InstHole
),
874 fn fill(&mut self, goto
: InstPtr
) {
875 let maybeinst
= match *self {
876 MaybeInst
::Split
=> MaybeInst
::Split1(goto
),
877 MaybeInst
::Uncompiled(ref inst
) => {
878 MaybeInst
::Compiled(inst
.fill(goto
))
880 MaybeInst
::Split1(goto1
) => {
881 MaybeInst
::Compiled(Inst
::Split(InstSplit
{
886 MaybeInst
::Split2(goto2
) => {
887 MaybeInst
::Compiled(Inst
::Split(InstSplit
{
893 "not all instructions were compiled! \
894 found uncompiled instruction: {:?}",
901 fn fill_split(&mut self, goto1
: InstPtr
, goto2
: InstPtr
) {
902 let filled
= match *self {
903 MaybeInst
::Split
=> {
904 Inst
::Split(InstSplit { goto1: goto1, goto2: goto2 }
)
907 "must be called on Split instruction, \
908 instead it was called on: {:?}",
912 *self = MaybeInst
::Compiled(filled
);
915 fn half_fill_split_goto1(&mut self, goto1
: InstPtr
) {
916 let half_filled
= match *self {
917 MaybeInst
::Split
=> goto1
,
919 "must be called on Split instruction, \
920 instead it was called on: {:?}",
924 *self = MaybeInst
::Split1(half_filled
);
927 fn half_fill_split_goto2(&mut self, goto2
: InstPtr
) {
928 let half_filled
= match *self {
929 MaybeInst
::Split
=> goto2
,
931 "must be called on Split instruction, \
932 instead it was called on: {:?}",
936 *self = MaybeInst
::Split2(half_filled
);
939 fn unwrap(self) -> Inst
{
941 MaybeInst
::Compiled(inst
) => inst
,
943 "must be called on a compiled instruction, \
944 instead it was called on: {:?}",
951 #[derive(Clone, Debug)]
953 Save { slot: usize }
,
954 EmptyLook { look: EmptyLook }
,
956 Ranges { ranges: Vec<(char, char)> }
,
957 Bytes { start: u8, end: u8 }
,
961 fn fill(&self, goto
: InstPtr
) -> Inst
{
963 InstHole
::Save { slot }
=> {
964 Inst
::Save(InstSave { goto: goto, slot: slot }
)
966 InstHole
::EmptyLook { look }
=> {
967 Inst
::EmptyLook(InstEmptyLook { goto: goto, look: look }
)
969 InstHole
::Char { c }
=> Inst
::Char(InstChar { goto: goto, c: c }
),
970 InstHole
::Ranges { ref ranges }
=> Inst
::Ranges(InstRanges
{
972 ranges
: ranges
.clone().into_boxed_slice(),
974 InstHole
::Bytes { start, end }
=> {
975 Inst
::Bytes(InstBytes { goto: goto, start: start, end: end }
)
981 struct CompileClass
<'a
, 'b
> {
983 ranges
: &'b
[hir
::ClassUnicodeRange
],
986 impl<'a
, 'b
> CompileClass
<'a
, 'b
> {
987 fn compile(mut self) -> Result
{
988 let mut holes
= vec
![];
989 let mut initial_entry
= None
;
990 let mut last_split
= Hole
::None
;
991 let mut utf8_seqs
= self.c
.utf8_seqs
.take().unwrap();
992 self.c
.suffix_cache
.clear();
994 for (i
, range
) in self.ranges
.iter().enumerate() {
995 let is_last_range
= i
+ 1 == self.ranges
.len();
996 utf8_seqs
.reset(range
.start(), range
.end());
997 let mut it
= (&mut utf8_seqs
).peekable();
999 let utf8_seq
= match it
.next() {
1001 Some(utf8_seq
) => utf8_seq
,
1003 if is_last_range
&& it
.peek().is_none() {
1004 let Patch { hole, entry }
= self.c_utf8_seq(&utf8_seq
)?
;
1006 self.c
.fill(last_split
, entry
);
1007 last_split
= Hole
::None
;
1008 if initial_entry
.is_none() {
1009 initial_entry
= Some(entry
);
1012 if initial_entry
.is_none() {
1013 initial_entry
= Some(self.c
.insts
.len());
1015 self.c
.fill_to_next(last_split
);
1016 last_split
= self.c
.push_split_hole();
1017 let Patch { hole, entry }
= self.c_utf8_seq(&utf8_seq
)?
;
1020 self.c
.fill_split(last_split
, Some(entry
), None
);
1024 self.c
.utf8_seqs
= Some(utf8_seqs
);
1025 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }
)
1028 fn c_utf8_seq(&mut self, seq
: &Utf8Sequence
) -> Result
{
1029 if self.c
.compiled
.is_reverse
{
1030 self.c_utf8_seq_(seq
)
1032 self.c_utf8_seq_(seq
.into_iter().rev())
1036 fn c_utf8_seq_
<'r
, I
>(&mut self, seq
: I
) -> Result
1038 I
: IntoIterator
<Item
= &'r Utf8Range
>,
1040 // The initial instruction for each UTF-8 sequence should be the same.
1041 let mut from_inst
= ::std
::usize::MAX
;
1042 let mut last_hole
= Hole
::None
;
1043 for byte_range
in seq
{
1044 let key
= SuffixCacheKey
{
1045 from_inst
: from_inst
,
1046 start
: byte_range
.start
,
1047 end
: byte_range
.end
,
1050 let pc
= self.c
.insts
.len();
1051 if let Some(cached_pc
) = self.c
.suffix_cache
.get(key
, pc
) {
1052 from_inst
= cached_pc
;
1056 self.c
.byte_classes
.set_range(byte_range
.start
, byte_range
.end
);
1057 if from_inst
== ::std
::usize::MAX
{
1058 last_hole
= self.c
.push_hole(InstHole
::Bytes
{
1059 start
: byte_range
.start
,
1060 end
: byte_range
.end
,
1063 self.c
.push_compiled(Inst
::Bytes(InstBytes
{
1065 start
: byte_range
.start
,
1066 end
: byte_range
.end
,
1069 from_inst
= self.c
.insts
.len().checked_sub(1).unwrap();
1070 debug_assert
!(from_inst
< ::std
::usize::MAX
);
1072 debug_assert
!(from_inst
< ::std
::usize::MAX
);
1073 Ok(Patch { hole: last_hole, entry: from_inst }
)
1077 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1078 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1079 /// The set of byte ranges looks like this:
1083 /// [E0][A0-BF][80-BF]
1084 /// [E1-EC][80-BF][80-BF]
1085 /// [ED][80-9F][80-BF]
1086 /// [EE-EF][80-BF][80-BF]
1088 /// Each line above translates to one alternate in the compiled regex program.
1089 /// However, all but one of the alternates end in the same suffix, which is
1090 /// a waste of an instruction. The suffix cache facilitates reusing them across
1093 /// Note that a HashMap could be trivially used for this, but we don't need its
1094 /// overhead. Some small bounded space (LRU style) is more than enough.
1096 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1097 /// except it uses hashes as original indices and then compares full keys for
1098 /// validation against `dense` array.
1100 struct SuffixCache
{
1101 sparse
: Box
<[usize]>,
1102 dense
: Vec
<SuffixCacheEntry
>,
1105 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1106 struct SuffixCacheEntry
{
1107 key
: SuffixCacheKey
,
1111 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1112 struct SuffixCacheKey
{
1119 fn new(size
: usize) -> Self {
1121 sparse
: vec
![0usize
; size
].into(),
1122 dense
: Vec
::with_capacity(size
),
1126 fn get(&mut self, key
: SuffixCacheKey
, pc
: InstPtr
) -> Option
<InstPtr
> {
1127 let hash
= self.hash(&key
);
1128 let pos
= &mut self.sparse
[hash
];
1129 if let Some(entry
) = self.dense
.get(*pos
) {
1130 if entry
.key
== key
{
1131 return Some(entry
.pc
);
1134 *pos
= self.dense
.len();
1135 self.dense
.push(SuffixCacheEntry { key: key, pc: pc }
);
1139 fn clear(&mut self) {
1143 fn hash(&self, suffix
: &SuffixCacheKey
) -> usize {
1144 // Basic FNV-1a hash as described:
1145 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1146 const FNV_PRIME
: u64 = 1099511628211;
1147 let mut h
= 14695981039346656037;
1148 h
= (h ^
(suffix
.from_inst
as u64)).wrapping_mul(FNV_PRIME
);
1149 h
= (h ^
(suffix
.start
as u64)).wrapping_mul(FNV_PRIME
);
1150 h
= (h ^
(suffix
.end
as u64)).wrapping_mul(FNV_PRIME
);
1151 (h
as usize) % self.sparse
.len()
1155 struct ByteClassSet([bool
; 256]);
1159 ByteClassSet([false; 256])
1162 fn set_range(&mut self, start
: u8, end
: u8) {
1163 debug_assert
!(start
<= end
);
1165 self.0[start
as usize - 1] = true;
1167 self.0[end
as usize] = true;
1170 fn set_word_boundary(&mut self) {
1171 // We need to mark all ranges of bytes whose pairs result in
1172 // evaluating \b differently.
1173 let iswb
= is_word_byte
;
1174 let mut b1
: u16 = 0;
1178 while b2
<= 255 && iswb(b1
as u8) == iswb(b2
as u8) {
1181 self.set_range(b1
as u8, (b2
- 1) as u8);
1186 fn byte_classes(&self) -> Vec
<u8> {
1187 // N.B. If you're debugging the DFA, it's useful to simply return
1188 // `(0..256).collect()`, which effectively removes the byte classes
1189 // and makes the transitions easier to read.
1190 // (0usize..256).map(|x| x as u8).collect()
1191 let mut byte_classes
= vec
![0; 256];
1192 let mut class
= 0u8;
1195 byte_classes
[i
] = class
as u8;
1200 class
= class
.checked_add(1).unwrap();
1208 impl fmt
::Debug
for ByteClassSet
{
1209 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
1210 f
.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
/// Convert a `u32` to a `usize`, panicking if it does not fit (only
/// possible on targets where `usize` is narrower than 32 bits).
fn u32_to_usize(n: u32) -> usize {
    // Guard against overflow on exotic platforms where usize < 32 bits;
    // on common targets this branch compiles away entirely.
    // TODO Use `std::convert::TryFrom` once it's stable.
    if (n as u64) <= (::std::usize::MAX as u64) {
        n as usize
    } else {
        panic!("BUG: {} is too big to be pointer sized", n)
    }
}
1226 use super::ByteClassSet
;
1230 let mut set
= ByteClassSet
::new();
1231 set
.set_range(b'a'
, b'z'
);
1232 let classes
= set
.byte_classes();
1233 assert_eq
!(classes
[0], 0);
1234 assert_eq
!(classes
[1], 0);
1235 assert_eq
!(classes
[2], 0);
1236 assert_eq
!(classes
[b'a'
as usize - 1], 0);
1237 assert_eq
!(classes
[b'a'
as usize], 1);
1238 assert_eq
!(classes
[b'm'
as usize], 1);
1239 assert_eq
!(classes
[b'z'
as usize], 1);
1240 assert_eq
!(classes
[b'z'
as usize + 1], 2);
1241 assert_eq
!(classes
[254], 2);
1242 assert_eq
!(classes
[255], 2);
1244 let mut set
= ByteClassSet
::new();
1245 set
.set_range(0, 2);
1246 set
.set_range(4, 6);
1247 let classes
= set
.byte_classes();
1248 assert_eq
!(classes
[0], 0);
1249 assert_eq
!(classes
[1], 0);
1250 assert_eq
!(classes
[2], 0);
1251 assert_eq
!(classes
[3], 1);
1252 assert_eq
!(classes
[4], 2);
1253 assert_eq
!(classes
[5], 2);
1254 assert_eq
!(classes
[6], 2);
1255 assert_eq
!(classes
[7], 3);
1256 assert_eq
!(classes
[255], 3);
1260 fn full_byte_classes() {
1261 let mut set
= ByteClassSet
::new();
1262 for i
in 0..256u16 {
1263 set
.set_range(i
as u8, i
as u8);
1265 assert_eq
!(set
.byte_classes().len(), 256);