1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
use std::collections::HashMap;
use std::iter;
use std::result;
use std::sync::Arc;

use syntax::{
    Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange,
    is_word_byte,
};
use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};

use prog::{
    Program, Inst, InstPtr, EmptyLook,
    InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
};

use Error;
29 type Result
= result
::Result
<Patch
, Error
>;
37 /// A compiler translates a regular expression AST to a sequence of
38 /// instructions. The sequence of instructions represents an NFA.
40 insts
: Vec
<MaybeInst
>,
42 capture_name_idx
: HashMap
<String
, usize>,
45 suffix_cache
: SuffixCache
,
46 utf8_seqs
: Option
<Utf8Sequences
>,
47 byte_classes
: ByteClassSet
,
51 /// Create a new regular expression compiler.
53 /// Various options can be set before calling `compile` on an expression.
54 pub fn new() -> Self {
57 compiled
: Program
::new(),
58 capture_name_idx
: HashMap
::new(),
60 size_limit
: 10 * (1 << 20),
61 suffix_cache
: SuffixCache
::new(1000),
62 utf8_seqs
: Some(Utf8Sequences
::new('
\x00'
, '
\x00'
)),
63 byte_classes
: ByteClassSet
::new(),
67 /// The size of the resulting program is limited by size_limit. If
68 /// the program approximately exceeds the given size (in bytes), then
69 /// compilation will stop and return an error.
70 pub fn size_limit(mut self, size_limit
: usize) -> Self {
71 self.size_limit
= size_limit
;
75 /// If bytes is true, then the program is compiled as a byte based
76 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
77 /// false, then the automaton is Unicode scalar value based, e.g., an
78 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
80 /// The specific invariant is that when returning a byte based machine,
81 /// the neither the `Char` nor `Ranges` instructions are produced.
82 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
83 /// instruction is never produced.
85 /// Note that `dfa(true)` implies `bytes(true)`.
86 pub fn bytes(mut self, yes
: bool
) -> Self {
87 self.compiled
.is_bytes
= yes
;
91 /// When disabled, the program compiled may match arbitrary bytes.
93 /// When enabled (the default), all compiled programs exclusively match
94 /// valid UTF-8 bytes.
95 pub fn only_utf8(mut self, yes
: bool
) -> Self {
96 self.compiled
.only_utf8
= yes
;
100 /// When set, the machine returned is suitable for use in the DFA matching
103 /// In particular, this ensures that if the regex is not anchored in the
104 /// beginning, then a preceding `.*?` is included in the program. (The NFA
105 /// based engines handle the preceding `.*?` explicitly, which is difficult
106 /// or impossible in the DFA engine.)
107 pub fn dfa(mut self, yes
: bool
) -> Self {
108 self.compiled
.is_dfa
= yes
;
112 /// When set, the machine returned is suitable for matching text in
113 /// reverse. In particular, all concatenations are flipped.
114 pub fn reverse(mut self, yes
: bool
) -> Self {
115 self.compiled
.is_reverse
= yes
;
119 /// Compile a regular expression given its AST.
121 /// The compiler is guaranteed to succeed unless the program exceeds the
122 /// specified size limit. If the size limit is exceeded, then compilation
123 /// stops and returns an error.
127 ) -> result
::Result
<Program
, Error
> {
128 debug_assert
!(exprs
.len() >= 1);
129 self.num_exprs
= exprs
.len();
130 if exprs
.len() == 1 {
131 self.compile_one(&exprs
[0])
133 self.compile_many(exprs
)
137 fn compile_one(mut self, expr
: &Expr
) -> result
::Result
<Program
, Error
> {
138 // If we're compiling a forward DFA and we aren't anchored, then
139 // add a `.*?` before the first capture group.
140 // Other matching engines handle this by baking the logic into the
141 // matching engine itself.
142 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
143 self.compiled
.is_anchored_start
= expr
.is_anchored_start();
144 self.compiled
.is_anchored_end
= expr
.is_anchored_end();
145 if self.compiled
.needs_dotstar() {
146 dotstar_patch
= try
!(self.c_dotstar());
147 self.compiled
.start
= dotstar_patch
.entry
;
149 self.compiled
.captures
= vec
![None
];
150 let patch
= try
!(self.c_capture(0, expr
));
151 if self.compiled
.needs_dotstar() {
152 self.fill(dotstar_patch
.hole
, patch
.entry
);
154 self.compiled
.start
= patch
.entry
;
156 self.fill_to_next(patch
.hole
);
157 self.compiled
.matches
= vec
![self.insts
.len()];
158 self.push_compiled(Inst
::Match(0));
159 self.compile_finish()
165 ) -> result
::Result
<Program
, Error
> {
166 debug_assert
!(exprs
.len() > 1);
168 self.compiled
.is_anchored_start
=
169 exprs
.iter().all(|e
| e
.is_anchored_start());
170 self.compiled
.is_anchored_end
=
171 exprs
.iter().all(|e
| e
.is_anchored_end());
172 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
173 if self.compiled
.needs_dotstar() {
174 dotstar_patch
= try
!(self.c_dotstar());
175 self.compiled
.start
= dotstar_patch
.entry
;
177 self.compiled
.start
= 0; // first instruction is always split
179 self.fill_to_next(dotstar_patch
.hole
);
181 let mut prev_hole
= Hole
::None
;
182 for (i
, expr
) in exprs
[0..exprs
.len() - 1].iter().enumerate() {
183 self.fill_to_next(prev_hole
);
184 let split
= self.push_split_hole();
185 let Patch { hole, entry }
= try
!(self.c_capture(0, expr
));
186 self.fill_to_next(hole
);
187 self.compiled
.matches
.push(self.insts
.len());
188 self.push_compiled(Inst
::Match(i
));
189 prev_hole
= self.fill_split(split
, Some(entry
), None
);
191 let i
= exprs
.len() - 1;
192 let Patch { hole, entry }
= try
!(self.c_capture(0, &exprs
[i
]));
193 self.fill(prev_hole
, entry
);
194 self.fill_to_next(hole
);
195 self.compiled
.matches
.push(self.insts
.len());
196 self.push_compiled(Inst
::Match(i
));
197 self.compile_finish()
200 fn compile_finish(mut self) -> result
::Result
<Program
, Error
> {
201 self.compiled
.insts
=
202 self.insts
.into_iter().map(|inst
| inst
.unwrap()).collect();
203 self.compiled
.byte_classes
= self.byte_classes
.byte_classes();
204 self.compiled
.capture_name_idx
= Arc
::new(self.capture_name_idx
);
208 /// Compile expr into self.insts, returning a patch on success,
209 /// or an error if we run out of memory.
211 /// All of the c_* methods of the compiler share the contract outlined
214 /// The main thing that a c_* method does is mutate `self.insts`
215 /// to add a list of mostly compiled instructions required to execute
216 /// the given expression. `self.insts` contains MaybeInsts rather than
217 /// Insts because there is some backpatching required.
219 /// The `Patch` value returned by each c_* method provides metadata
220 /// about the compiled instructions emitted to `self.insts`. The
221 /// `entry` member of the patch refers to the first instruction
222 /// (the entry point), while the `hole` member contains zero or
223 /// more offsets to partial instructions that need to be backpatched.
224 /// The c_* routine can't know where its list of instructions are going to
225 /// jump to after execution, so it is up to the caller to patch
226 /// these jumps to point to the right place. So compiling some
227 /// expression, e, we would end up with a situation that looked like:
230 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
237 /// To compile two expressions, e1 and e2, concatinated together we
241 /// let patch1 = self.c(e1);
242 /// let patch2 = self.c(e2);
245 /// while leaves us with a situation that looks like
248 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
251 /// entry1 hole1 entry2 hole2
254 /// Then to merge the two patches together into one we would backpatch
255 /// hole1 with entry2 and return a new patch that enters at entry1
256 /// and has hole2 for a hole. In fact, if you look at the c_concat
257 /// method you will see that it does exactly this, though it handles
258 /// a list of expressions rather than just the two that we use for
260 fn c(&mut self, expr
: &Expr
) -> Result
{
264 try
!(self.check_size());
266 Empty
=> Ok(Patch { hole: Hole::None, entry: self.insts.len() }
),
267 Literal { ref chars, casei }
=> self.c_literal(chars
, casei
),
268 LiteralBytes { ref bytes, casei }
=> self.c_bytes(bytes
, casei
),
269 AnyChar
=> self.c_class(&[ClassRange
{
275 ClassRange { start: '\x00', end: '\x09' }
,
276 ClassRange { start: '\x0b', end: '\u{10ffff}'
},
280 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }
])
283 self.c_class_bytes(&[
284 ByteRange { start: 0, end: 0x9 }
,
285 ByteRange { start: 0xB, end: 0xFF }
,
291 ClassBytes(ref cls
) => {
292 self.c_class_bytes(cls
)
294 StartLine
if self.compiled
.is_reverse
=> {
295 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
296 self.c_empty_look(prog
::EmptyLook
::EndLine
)
299 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
300 self.c_empty_look(prog
::EmptyLook
::StartLine
)
302 EndLine
if self.compiled
.is_reverse
=> {
303 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
304 self.c_empty_look(prog
::EmptyLook
::StartLine
)
307 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
308 self.c_empty_look(prog
::EmptyLook
::EndLine
)
310 StartText
if self.compiled
.is_reverse
=> {
311 self.c_empty_look(prog
::EmptyLook
::EndText
)
314 self.c_empty_look(prog
::EmptyLook
::StartText
)
316 EndText
if self.compiled
.is_reverse
=> {
317 self.c_empty_look(prog
::EmptyLook
::StartText
)
320 self.c_empty_look(prog
::EmptyLook
::EndText
)
323 self.compiled
.has_unicode_word_boundary
= true;
324 self.byte_classes
.set_word_boundary();
325 self.c_empty_look(prog
::EmptyLook
::WordBoundary
)
328 self.compiled
.has_unicode_word_boundary
= true;
329 self.byte_classes
.set_word_boundary();
330 self.c_empty_look(prog
::EmptyLook
::NotWordBoundary
)
332 WordBoundaryAscii
=> {
333 self.byte_classes
.set_word_boundary();
334 self.c_empty_look(prog
::EmptyLook
::WordBoundaryAscii
)
336 NotWordBoundaryAscii
=> {
337 self.byte_classes
.set_word_boundary();
338 self.c_empty_look(prog
::EmptyLook
::NotWordBoundaryAscii
)
340 Group { ref e, i: None, name: None }
=> self.c(e
),
341 Group { ref e, i, ref name }
=> {
342 // it's impossible to have a named capture without an index
343 let i
= i
.expect("capture index");
344 if i
>= self.compiled
.captures
.len() {
345 self.compiled
.captures
.push(name
.clone());
346 if let Some(ref name
) = *name
{
347 self.capture_name_idx
.insert(name
.to_owned(), i
);
350 self.c_capture(2 * i
, e
)
353 if self.compiled
.is_reverse
{
354 self.c_concat(es
.iter().rev())
359 Alternate(ref es
) => self.c_alternate(&**es
),
360 Repeat { ref e, r, greedy }
=> self.c_repeat(e
, r
, greedy
),
364 fn c_capture(&mut self, first_slot
: usize, expr
: &Expr
) -> Result
{
365 if self.num_exprs
> 1 || self.compiled
.is_dfa
{
366 // Don't ever compile Save instructions for regex sets because
367 // they are never used. They are also never used in DFA programs
368 // because DFAs can't handle captures.
371 let entry
= self.insts
.len();
372 let hole
= self.push_hole(InstHole
::Save { slot: first_slot }
);
373 let patch
= try
!(self.c(expr
));
374 self.fill(hole
, patch
.entry
);
375 self.fill_to_next(patch
.hole
);
376 let hole
= self.push_hole(InstHole
::Save { slot: first_slot + 1 }
);
377 Ok(Patch { hole: hole, entry: entry }
)
381 fn c_dotstar(&mut self) -> Result
{
382 Ok(if !self.compiled
.only_utf8() {
383 try
!(self.c(&Expr
::Repeat
{
384 e
: Box
::new(Expr
::AnyByte
),
385 r
: Repeater
::ZeroOrMore
,
389 try
!(self.c(&Expr
::Repeat
{
390 e
: Box
::new(Expr
::AnyChar
),
391 r
: Repeater
::ZeroOrMore
,
397 fn c_literal(&mut self, chars
: &[char], casei
: bool
) -> Result
{
398 debug_assert
!(!chars
.is_empty());
399 let mut chars
: Box
<Iterator
<Item
=&char>> =
400 if self.compiled
.is_reverse
{
401 Box
::new(chars
.iter().rev())
403 Box
::new(chars
.iter())
405 let first
= *chars
.next().expect("non-empty literal");
406 let Patch { mut hole, entry }
= try
!(self.c_char(first
, casei
));
408 let p
= try
!(self.c_char(c
, casei
));
409 self.fill(hole
, p
.entry
);
412 Ok(Patch { hole: hole, entry: entry }
)
415 fn c_char(&mut self, c
: char, casei
: bool
) -> Result
{
417 self.c_class(&CharClass
::new(vec
![
418 ClassRange { start: c, end: c }
,
421 self.c_class(&[ClassRange { start: c, end: c }
])
425 fn c_class(&mut self, ranges
: &[ClassRange
]) -> Result
{
426 assert
!(!ranges
.is_empty());
427 if self.compiled
.uses_bytes() {
433 let ranges
: Vec
<(char, char)> =
434 ranges
.iter().map(|r
| (r
.start
, r
.end
)).collect();
435 let hole
= if ranges
.len() == 1 && ranges
[0].0 == ranges
[0].1 {
436 self.push_hole(InstHole
::Char { c: ranges[0].0 }
)
438 self.push_hole(InstHole
::Ranges { ranges: ranges }
)
440 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
444 fn c_bytes(&mut self, bytes
: &[u8], casei
: bool
) -> Result
{
445 debug_assert
!(!bytes
.is_empty());
446 let mut bytes
: Box
<Iterator
<Item
=&u8>> =
447 if self.compiled
.is_reverse
{
448 Box
::new(bytes
.iter().rev())
450 Box
::new(bytes
.iter())
452 let first
= *bytes
.next().expect("non-empty literal");
453 let Patch { mut hole, entry }
= try
!(self.c_byte(first
, casei
));
455 let p
= try
!(self.c_byte(b
, casei
));
456 self.fill(hole
, p
.entry
);
459 Ok(Patch { hole: hole, entry: entry }
)
462 fn c_byte(&mut self, b
: u8, casei
: bool
) -> Result
{
464 self.c_class_bytes(&ByteClass
::new(vec
![
465 ByteRange { start: b, end: b }
,
468 self.c_class_bytes(&[ByteRange { start: b, end: b }
])
472 fn c_class_bytes(&mut self, ranges
: &[ByteRange
]) -> Result
{
473 debug_assert
!(!ranges
.is_empty());
475 let first_split_entry
= self.insts
.len();
476 let mut holes
= vec
![];
477 let mut prev_hole
= Hole
::None
;
478 for r
in &ranges
[0..ranges
.len() - 1] {
479 self.fill_to_next(prev_hole
);
480 let split
= self.push_split_hole();
481 let next
= self.insts
.len();
482 self.byte_classes
.set_range(r
.start
, r
.end
);
483 holes
.push(self.push_hole(InstHole
::Bytes
{
484 start
: r
.start
, end
: r
.end
,
486 prev_hole
= self.fill_split(split
, Some(next
), None
);
488 let next
= self.insts
.len();
489 let r
= &ranges
[ranges
.len() - 1];
490 self.byte_classes
.set_range(r
.start
, r
.end
);
491 holes
.push(self.push_hole(InstHole
::Bytes
{
492 start
: r
.start
, end
: r
.end
,
494 self.fill(prev_hole
, next
);
495 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
498 fn c_empty_look(&mut self, look
: EmptyLook
) -> Result
{
499 let hole
= self.push_hole(InstHole
::EmptyLook { look: look }
);
500 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
503 fn c_concat
<'a
, I
>(&mut self, exprs
: I
) -> Result
504 where I
: IntoIterator
<Item
=&'a Expr
> {
505 let mut exprs
= exprs
.into_iter();
506 let first
= match exprs
.next() {
509 return Ok(Patch { hole: Hole::None, entry: self.insts.len() }
)
512 let Patch { mut hole, entry }
= try
!(self.c(first
));
514 let p
= try
!(self.c(e
));
515 self.fill(hole
, p
.entry
);
518 Ok(Patch { hole: hole, entry: entry }
)
521 fn c_alternate(&mut self, exprs
: &[Expr
]) -> Result
{
523 exprs
.len() >= 2, "alternates must have at least 2 exprs");
525 // Initial entry point is always the first split.
526 let first_split_entry
= self.insts
.len();
528 // Save up all of the holes from each alternate. They will all get
529 // patched to point to the same location.
530 let mut holes
= vec
![];
532 let mut prev_hole
= Hole
::None
;
533 for e
in &exprs
[0..exprs
.len() - 1] {
534 self.fill_to_next(prev_hole
);
535 let split
= self.push_split_hole();
536 let Patch { hole, entry }
= try
!(self.c(e
));
538 prev_hole
= self.fill_split(split
, Some(entry
), None
);
540 let Patch { hole, entry }
= try
!(self.c(&exprs
[exprs
.len() - 1]));
542 self.fill(prev_hole
, entry
);
543 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
553 Repeater
::ZeroOrOne
=> self.c_repeat_zero_or_one(expr
, greedy
),
554 Repeater
::ZeroOrMore
=> self.c_repeat_zero_or_more(expr
, greedy
),
555 Repeater
::OneOrMore
=> self.c_repeat_one_or_more(expr
, greedy
),
556 Repeater
::Range { min, max: None }
=> {
557 self.c_repeat_range_min_or_more(expr
, greedy
, min
)
559 Repeater
::Range { min, max: Some(max) }
=> {
560 self.c_repeat_range(expr
, greedy
, min
, max
)
565 fn c_repeat_zero_or_one(
570 let split_entry
= self.insts
.len();
571 let split
= self.push_split_hole();
572 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
574 let split_hole
= if greedy
{
575 self.fill_split(split
, Some(entry_rep
), None
)
577 self.fill_split(split
, None
, Some(entry_rep
))
579 let holes
= vec
![hole_rep
, split_hole
];
580 Ok(Patch { hole: Hole::Many(holes), entry: split_entry }
)
583 fn c_repeat_zero_or_more(
588 let split_entry
= self.insts
.len();
589 let split
= self.push_split_hole();
590 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
592 self.fill(hole_rep
, split_entry
);
593 let split_hole
= if greedy
{
594 self.fill_split(split
, Some(entry_rep
), None
)
596 self.fill_split(split
, None
, Some(entry_rep
))
598 Ok(Patch { hole: split_hole, entry: split_entry }
)
601 fn c_repeat_one_or_more(
606 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
607 self.fill_to_next(hole_rep
);
608 let split
= self.push_split_hole();
610 let split_hole
= if greedy
{
611 self.fill_split(split
, Some(entry_rep
), None
)
613 self.fill_split(split
, None
, Some(entry_rep
))
615 Ok(Patch { hole: split_hole, entry: entry_rep }
)
618 fn c_repeat_range_min_or_more(
624 let min
= u32_to_usize(min
);
625 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
626 let patch_rep
= try
!(self.c_repeat_zero_or_more(expr
, greedy
));
627 self.fill(patch_concat
.hole
, patch_rep
.entry
);
628 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry }
)
638 let (min
, max
) = (u32_to_usize(min
), u32_to_usize(max
));
639 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
640 let initial_entry
= patch_concat
.entry
;
642 return Ok(patch_concat
);
644 // It is much simpler to compile, e.g., `a{2,5}` as:
648 // But you end up with a sequence of instructions like this:
660 // This is *incredibly* inefficient because the splits end
661 // up forming a chain, which has to be resolved everything a
662 // transition is followed.
663 let mut holes
= vec
![];
664 let mut prev_hole
= patch_concat
.hole
;
666 self.fill_to_next(prev_hole
);
667 let split
= self.push_split_hole();
668 let Patch { hole, entry }
= try
!(self.c(expr
));
671 holes
.push(self.fill_split(split
, Some(entry
), None
));
673 holes
.push(self.fill_split(split
, None
, Some(entry
)));
676 holes
.push(prev_hole
);
677 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry }
)
680 fn fill(&mut self, hole
: Hole
, goto
: InstPtr
) {
684 self.insts
[pc
].fill(goto
);
686 Hole
::Many(holes
) => {
688 self.fill(hole
, goto
);
694 fn fill_to_next(&mut self, hole
: Hole
) {
695 let next
= self.insts
.len();
696 self.fill(hole
, next
);
702 goto1
: Option
<InstPtr
>,
703 goto2
: Option
<InstPtr
>,
706 Hole
::None
=> Hole
::None
,
708 match (goto1
, goto2
) {
709 (Some(goto1
), Some(goto2
)) => {
710 self.insts
[pc
].fill_split(goto1
, goto2
);
713 (Some(goto1
), None
) => {
714 self.insts
[pc
].half_fill_split_goto1(goto1
);
717 (None
, Some(goto2
)) => {
718 self.insts
[pc
].half_fill_split_goto2(goto2
);
721 (None
, None
) => unreachable
!("at least one of the split \
722 holes must be filled"),
725 Hole
::Many(holes
) => {
726 let mut new_holes
= vec
![];
728 new_holes
.push(self.fill_split(hole
, goto1
, goto2
));
730 if new_holes
.is_empty() {
732 } else if new_holes
.len() == 1 {
733 new_holes
.pop().unwrap()
735 Hole
::Many(new_holes
)
741 fn push_compiled(&mut self, inst
: Inst
) {
742 self.insts
.push(MaybeInst
::Compiled(inst
));
745 fn push_hole(&mut self, inst
: InstHole
) -> Hole
{
746 let hole
= self.insts
.len();
747 self.insts
.push(MaybeInst
::Uncompiled(inst
));
751 fn push_split_hole(&mut self) -> Hole
{
752 let hole
= self.insts
.len();
753 self.insts
.push(MaybeInst
::Split
);
757 fn check_size(&self) -> result
::Result
<(), Error
> {
758 use std
::mem
::size_of
;
760 if self.insts
.len() * size_of
::<Inst
>() > self.size_limit
{
761 Err(Error
::CompiledTooBig(self.size_limit
))
775 #[derive(Clone, Debug)]
778 Uncompiled(InstHole
),
785 fn fill(&mut self, goto
: InstPtr
) {
786 let filled
= match *self {
787 MaybeInst
::Uncompiled(ref inst
) => inst
.fill(goto
),
788 MaybeInst
::Split1(goto1
) => {
789 Inst
::Split(InstSplit { goto1: goto1, goto2: goto }
)
791 MaybeInst
::Split2(goto2
) => {
792 Inst
::Split(InstSplit { goto1: goto, goto2: goto2 }
)
794 _
=> unreachable
!("not all instructions were compiled! \
795 found uncompiled instruction: {:?}", self),
797 *self = MaybeInst
::Compiled(filled
);
800 fn fill_split(&mut self, goto1
: InstPtr
, goto2
: InstPtr
) {
801 let filled
= match *self {
802 MaybeInst
::Split
=> {
803 Inst
::Split(InstSplit { goto1: goto1, goto2: goto2 }
)
805 _
=> unreachable
!("must be called on Split instruction, \
806 instead it was called on: {:?}", self),
808 *self = MaybeInst
::Compiled(filled
);
811 fn half_fill_split_goto1(&mut self, goto1
: InstPtr
) {
812 let half_filled
= match *self {
813 MaybeInst
::Split
=> goto1
,
814 _
=> unreachable
!("must be called on Split instruction, \
815 instead it was called on: {:?}", self),
817 *self = MaybeInst
::Split1(half_filled
);
820 fn half_fill_split_goto2(&mut self, goto2
: InstPtr
) {
821 let half_filled
= match *self {
822 MaybeInst
::Split
=> goto2
,
823 _
=> unreachable
!("must be called on Split instruction, \
824 instead it was called on: {:?}", self),
826 *self = MaybeInst
::Split2(half_filled
);
829 fn unwrap(self) -> Inst
{
831 MaybeInst
::Compiled(inst
) => inst
,
832 _
=> unreachable
!("must be called on a compiled instruction, \
833 instead it was called on: {:?}", self),
838 #[derive(Clone, Debug)]
840 Save { slot: usize }
,
841 EmptyLook { look: EmptyLook }
,
843 Ranges { ranges: Vec<(char, char)> }
,
844 Bytes { start: u8, end: u8 }
,
848 fn fill(&self, goto
: InstPtr
) -> Inst
{
850 InstHole
::Save { slot }
=> Inst
::Save(InstSave
{
854 InstHole
::EmptyLook { look }
=> Inst
::EmptyLook(InstEmptyLook
{
858 InstHole
::Char { c }
=> Inst
::Char(InstChar
{
862 InstHole
::Ranges { ref ranges }
=> Inst
::Ranges(InstRanges
{
864 ranges
: ranges
.clone(),
866 InstHole
::Bytes { start, end }
=> Inst
::Bytes(InstBytes
{
875 struct CompileClass
<'a
, 'b
> {
877 ranges
: &'b
[ClassRange
],
880 impl<'a
, 'b
> CompileClass
<'a
, 'b
> {
881 fn compile(mut self) -> Result
{
882 let mut holes
= vec
![];
883 let mut initial_entry
= None
;
884 let mut last_split
= Hole
::None
;
885 let mut utf8_seqs
= self.c
.utf8_seqs
.take().unwrap();
886 self.c
.suffix_cache
.clear();
888 for (i
, range
) in self.ranges
.iter().enumerate() {
889 let is_last_range
= i
+ 1 == self.ranges
.len();
890 utf8_seqs
.reset(range
.start
, range
.end
);
891 let mut it
= (&mut utf8_seqs
).peekable();
893 let utf8_seq
= match it
.next() {
895 Some(utf8_seq
) => utf8_seq
,
897 if is_last_range
&& it
.peek().is_none() {
898 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
900 self.c
.fill(last_split
, entry
);
901 last_split
= Hole
::None
;
902 if initial_entry
.is_none() {
903 initial_entry
= Some(entry
);
906 if initial_entry
.is_none() {
907 initial_entry
= Some(self.c
.insts
.len());
909 self.c
.fill_to_next(last_split
);
910 last_split
= self.c
.push_split_hole();
911 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
913 last_split
= self.c
.fill_split(last_split
, Some(entry
), None
);
917 self.c
.utf8_seqs
= Some(utf8_seqs
);
919 hole
: Hole
::Many(holes
),
920 entry
: initial_entry
.unwrap(),
924 fn c_utf8_seq(&mut self, seq
: &Utf8Sequence
) -> Result
{
925 if self.c
.compiled
.is_reverse
{
926 self.c_utf8_seq_(seq
)
928 self.c_utf8_seq_(seq
.into_iter().rev())
932 fn c_utf8_seq_
<'r
, I
>(&mut self, seq
: I
) -> Result
933 where I
: IntoIterator
<Item
=&'r Utf8Range
> {
934 // The initial instruction for each UTF-8 sequence should be the same.
935 let mut from_inst
= ::std
::usize::MAX
;
936 let mut last_hole
= Hole
::None
;
937 for byte_range
in seq
{
938 let key
= SuffixCacheKey
{
939 from_inst
: from_inst
,
940 start
: byte_range
.start
,
944 let pc
= self.c
.insts
.len();
945 if let Some(cached_pc
) = self.c
.suffix_cache
.get(key
, pc
) {
946 from_inst
= cached_pc
;
950 self.c
.byte_classes
.set_range(byte_range
.start
, byte_range
.end
);
951 if from_inst
== ::std
::usize::MAX
{
952 last_hole
= self.c
.push_hole(InstHole
::Bytes
{
953 start
: byte_range
.start
,
957 self.c
.push_compiled(Inst
::Bytes(InstBytes
{
959 start
: byte_range
.start
,
963 from_inst
= self.c
.insts
.len().checked_sub(1).unwrap();
964 debug_assert
!(from_inst
< ::std
::usize::MAX
);
966 debug_assert
!(from_inst
< ::std
::usize::MAX
);
967 Ok(Patch { hole: last_hole, entry: from_inst }
)
971 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
972 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
973 /// The set of byte ranges looks like this:
977 /// [E0][A0-BF][80-BF]
978 /// [E1-EC][80-BF][80-BF]
979 /// [ED][80-9F][80-BF]
980 /// [EE-EF][80-BF][80-BF]
982 /// Each line above translates to one alternate in the compiled regex program.
983 /// However, all but one of the alternates end in the same suffix, which is
984 /// a waste of an instruction. The suffix cache facilitates reusing them across
987 /// Note that a HashMap could be trivially used for this, but we don't need its
988 /// overhead. Some small bounded space (LRU style) is more than enough.
990 table
: Vec
<SuffixCacheEntry
>,
991 // Every time the cache is cleared, we increment the version number instead
992 // of actually zeroing memory. Since we store a copy of the current version
993 // in every element, all we need to do is make sure to invalidate any stale
994 // entries upon access. This saves quite a bit of time!
998 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
999 struct SuffixCacheEntry
{
1000 key
: SuffixCacheKey
,
1005 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1006 struct SuffixCacheKey
{
1013 fn new(size
: usize) -> Self {
1015 table
: vec
![SuffixCacheEntry
::default(); size
],
1020 fn get(&mut self, key
: SuffixCacheKey
, pc
: InstPtr
) -> Option
<InstPtr
> {
1021 let h
= self.hash(&key
);
1022 let e
= self.table
[h
];
1023 if e
.key
== key
&& e
.version
== self.version
{
1026 self.table
[h
] = SuffixCacheEntry
{
1029 version
: self.version
,
1035 fn clear(&mut self) {
1039 fn hash(&self, suffix
: &SuffixCacheKey
) -> usize {
1040 // Basic FNV-1a hash as described:
1041 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1042 const FNV_PRIME
: u64 = 1099511628211;
1043 let mut h
= 14695981039346656037;
1044 h
= (h ^
(suffix
.from_inst
as u64)).wrapping_mul(FNV_PRIME
);
1045 h
= (h ^
(suffix
.start
as u64)).wrapping_mul(FNV_PRIME
);
1046 h
= (h ^
(suffix
.end
as u64)).wrapping_mul(FNV_PRIME
);
1047 (h
as usize) % self.table
.len()
/// Tracks, for every byte value, whether it ends an equivalence-class range.
/// `0[b] == true` means byte `b` and `b + 1` belong to different classes.
struct ByteClassSet([bool; 256]);
1055 ByteClassSet([false; 256])
1058 fn set_range(&mut self, start
: u8, end
: u8) {
1059 debug_assert
!(start
<= end
);
1061 self.0[start
as usize - 1] = true;
1063 self.0[end
as usize] = true;
1066 fn set_word_boundary(&mut self) {
1067 // We need to mark all ranges of bytes whose pairs result in
1068 // evaluating \b differently.
1069 let iswb
= is_word_byte
;
1070 let mut b1
: u16 = 0;
1074 while b2
<= 255 && iswb(b1
as u8) == iswb(b2
as u8) {
1077 self.set_range(b1
as u8, (b2
- 1) as u8);
1082 fn byte_classes(&self) -> Vec
<u8> {
1083 // N.B. If you're debugging the DFA, it's useful to simply return
1084 // `(0..256).collect()`, which effectively removes the byte classes
1085 // and makes the transitions easier to read.
1086 // (0usize..256).map(|x| x as u8).collect()
1087 let mut byte_classes
= vec
![0; 256];
1088 let mut class
= 0u8;
1091 byte_classes
[i
] = class
as u8;
1096 class
= class
.checked_add(1).unwrap();
/// Converts a `u32` to a `usize`, panicking if it does not fit.
fn u32_to_usize(n: u32) -> usize {
    // In case usize is less than 32 bits, we need to guard against overflow.
    // On most platforms this compiles to nothing.
    // TODO Use `std::convert::TryFrom` once it's stable.
    if (n as u64) > (::std::usize::MAX as u64) {
        panic!("BUG: {} is too big to be pointer sized", n)
    }
    n as usize
}
1116 use super::ByteClassSet
;
1120 let mut set
= ByteClassSet
::new();
1121 set
.set_range(b'a'
, b'z'
);
1122 let classes
= set
.byte_classes();
1123 assert_eq
!(classes
[0], 0);
1124 assert_eq
!(classes
[1], 0);
1125 assert_eq
!(classes
[2], 0);
1126 assert_eq
!(classes
[b'a'
as usize - 1], 0);
1127 assert_eq
!(classes
[b'a'
as usize], 1);
1128 assert_eq
!(classes
[b'm'
as usize], 1);
1129 assert_eq
!(classes
[b'z'
as usize], 1);
1130 assert_eq
!(classes
[b'z'
as usize + 1], 2);
1131 assert_eq
!(classes
[254], 2);
1132 assert_eq
!(classes
[255], 2);
1134 let mut set
= ByteClassSet
::new();
1135 set
.set_range(0, 2);
1136 set
.set_range(4, 6);
1137 let classes
= set
.byte_classes();
1138 assert_eq
!(classes
[0], 0);
1139 assert_eq
!(classes
[1], 0);
1140 assert_eq
!(classes
[2], 0);
1141 assert_eq
!(classes
[3], 1);
1142 assert_eq
!(classes
[4], 2);
1143 assert_eq
!(classes
[5], 2);
1144 assert_eq
!(classes
[6], 2);
1145 assert_eq
!(classes
[7], 3);
1146 assert_eq
!(classes
[255], 3);
1150 fn full_byte_classes() {
1151 let mut set
= ByteClassSet
::new();
1152 for i
in 0..256u16 {
1153 set
.set_range(i
as u8, i
as u8);
1155 assert_eq
!(set
.byte_classes().len(), 256);