1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 use std
::collections
::HashMap
;
17 Expr
, Repeater
, CharClass
, ClassRange
, ByteClass
, ByteRange
,
20 use utf8_ranges
::{Utf8Range, Utf8Sequence, Utf8Sequences}
;
23 Program
, Inst
, InstPtr
, EmptyLook
,
24 InstSave
, InstSplit
, InstEmptyLook
, InstChar
, InstRanges
, InstBytes
,
29 type Result
= result
::Result
<Patch
, Error
>;
37 /// A compiler translates a regular expression AST to a sequence of
38 /// instructions. The sequence of instructions represents an NFA.
40 insts
: Vec
<MaybeInst
>,
42 capture_name_idx
: HashMap
<String
, usize>,
45 suffix_cache
: SuffixCache
,
46 utf8_seqs
: Option
<Utf8Sequences
>,
47 byte_classes
: ByteClassSet
,
51 /// Create a new regular expression compiler.
53 /// Various options can be set before calling `compile` on an expression.
54 pub fn new() -> Self {
57 compiled
: Program
::new(),
58 capture_name_idx
: HashMap
::new(),
60 size_limit
: 10 * (1 << 20),
61 suffix_cache
: SuffixCache
::new(1000),
62 utf8_seqs
: Some(Utf8Sequences
::new('
\x00'
, '
\x00'
)),
63 byte_classes
: ByteClassSet
::new(),
67 /// The size of the resulting program is limited by size_limit. If
68 /// the program approximately exceeds the given size (in bytes), then
69 /// compilation will stop and return an error.
70 pub fn size_limit(mut self, size_limit
: usize) -> Self {
71 self.size_limit
= size_limit
;
75 /// If bytes is true, then the program is compiled as a byte based
76 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
77 /// false, then the automaton is Unicode scalar value based, e.g., an
78 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
80 /// The specific invariant is that when returning a byte based machine,
81 /// the neither the `Char` nor `Ranges` instructions are produced.
82 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
83 /// instruction is never produced.
85 /// Note that `dfa(true)` implies `bytes(true)`.
86 pub fn bytes(mut self, yes
: bool
) -> Self {
87 self.compiled
.is_bytes
= yes
;
91 /// When disabled, the program compiled may match arbitrary bytes.
93 /// When enabled (the default), all compiled programs exclusively match
94 /// valid UTF-8 bytes.
95 pub fn only_utf8(mut self, yes
: bool
) -> Self {
96 self.compiled
.only_utf8
= yes
;
100 /// When set, the machine returned is suitable for use in the DFA matching
103 /// In particular, this ensures that if the regex is not anchored in the
104 /// beginning, then a preceding `.*?` is included in the program. (The NFA
105 /// based engines handle the preceding `.*?` explicitly, which is difficult
106 /// or impossible in the DFA engine.)
107 pub fn dfa(mut self, yes
: bool
) -> Self {
108 self.compiled
.is_dfa
= yes
;
112 /// When set, the machine returned is suitable for matching text in
113 /// reverse. In particular, all concatenations are flipped.
114 pub fn reverse(mut self, yes
: bool
) -> Self {
115 self.compiled
.is_reverse
= yes
;
119 /// Compile a regular expression given its AST.
121 /// The compiler is guaranteed to succeed unless the program exceeds the
122 /// specified size limit. If the size limit is exceeded, then compilation
123 /// stops and returns an error.
127 ) -> result
::Result
<Program
, Error
> {
128 debug_assert
!(exprs
.len() >= 1);
129 self.num_exprs
= exprs
.len();
130 if exprs
.len() == 1 {
131 self.compile_one(&exprs
[0])
133 self.compile_many(exprs
)
137 fn compile_one(mut self, expr
: &Expr
) -> result
::Result
<Program
, Error
> {
138 // If we're compiling a forward DFA and we aren't anchored, then
139 // add a `.*?` before the first capture group.
140 // Other matching engines handle this by baking the logic into the
141 // matching engine itself.
142 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
143 self.compiled
.is_anchored_start
= expr
.is_anchored_start();
144 self.compiled
.is_anchored_end
= expr
.is_anchored_end();
145 if self.compiled
.needs_dotstar() {
146 dotstar_patch
= try
!(self.c_dotstar());
147 self.compiled
.start
= dotstar_patch
.entry
;
149 self.compiled
.captures
= vec
![None
];
150 let patch
= try
!(self.c_capture(0, expr
));
151 if self.compiled
.needs_dotstar() {
152 self.fill(dotstar_patch
.hole
, patch
.entry
);
154 self.compiled
.start
= patch
.entry
;
156 self.fill_to_next(patch
.hole
);
157 self.compiled
.matches
= vec
![self.insts
.len()];
158 self.push_compiled(Inst
::Match(0));
159 self.compile_finish()
165 ) -> result
::Result
<Program
, Error
> {
166 debug_assert
!(exprs
.len() > 1);
168 self.compiled
.is_anchored_start
=
169 exprs
.iter().all(|e
| e
.is_anchored_start());
170 self.compiled
.is_anchored_end
=
171 exprs
.iter().all(|e
| e
.is_anchored_end());
172 let mut dotstar_patch
= Patch { hole: Hole::None, entry: 0 }
;
173 if self.compiled
.needs_dotstar() {
174 dotstar_patch
= try
!(self.c_dotstar());
175 self.compiled
.start
= dotstar_patch
.entry
;
177 self.compiled
.start
= 0; // first instruction is always split
179 self.fill_to_next(dotstar_patch
.hole
);
181 for (i
, expr
) in exprs
[0..exprs
.len() - 1].iter().enumerate() {
182 let split
= self.push_split_hole();
183 let Patch { hole, entry }
= try
!(self.c_capture(0, expr
));
184 self.fill_to_next(hole
);
185 self.compiled
.matches
.push(self.insts
.len());
186 self.push_compiled(Inst
::Match(i
));
188 let next
= self.insts
.len();
189 self.fill_split(split
, Some(entry
), Some(next
));
191 let i
= exprs
.len() - 1;
192 let Patch { hole, .. }
= try
!(self.c_capture(0, &exprs
[i
]));
193 self.fill_to_next(hole
);
194 self.compiled
.matches
.push(self.insts
.len());
195 self.push_compiled(Inst
::Match(i
));
196 self.compile_finish()
199 fn compile_finish(mut self) -> result
::Result
<Program
, Error
> {
200 self.compiled
.insts
=
201 self.insts
.into_iter().map(|inst
| inst
.unwrap()).collect();
202 self.compiled
.byte_classes
= self.byte_classes
.byte_classes();
203 self.compiled
.capture_name_idx
= Arc
::new(self.capture_name_idx
);
207 fn c(&mut self, expr
: &Expr
) -> Result
{
211 try
!(self.check_size());
213 Empty
=> Ok(Patch { hole: Hole::None, entry: self.insts.len() }
),
214 Literal { ref chars, casei }
=> self.c_literal(chars
, casei
),
215 LiteralBytes { ref bytes, casei }
=> self.c_bytes(bytes
, casei
),
216 AnyChar
=> self.c_class(&[ClassRange
{
222 ClassRange { start: '\x00', end: '\x09' }
,
223 ClassRange { start: '\x0b', end: '\u{10ffff}'
},
227 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }
])
230 self.c_class_bytes(&[
231 ByteRange { start: 0, end: 0x9 }
,
232 ByteRange { start: 0xB, end: 0xFF }
,
238 ClassBytes(ref cls
) => {
239 self.c_class_bytes(cls
)
241 StartLine
if self.compiled
.is_reverse
=> {
242 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
243 self.c_empty_look(prog
::EmptyLook
::EndLine
)
246 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
247 self.c_empty_look(prog
::EmptyLook
::StartLine
)
249 EndLine
if self.compiled
.is_reverse
=> {
250 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
251 self.c_empty_look(prog
::EmptyLook
::StartLine
)
254 self.byte_classes
.set_range(b'
\n'
, b'
\n'
);
255 self.c_empty_look(prog
::EmptyLook
::EndLine
)
257 StartText
if self.compiled
.is_reverse
=> {
258 self.c_empty_look(prog
::EmptyLook
::EndText
)
261 self.c_empty_look(prog
::EmptyLook
::StartText
)
263 EndText
if self.compiled
.is_reverse
=> {
264 self.c_empty_look(prog
::EmptyLook
::StartText
)
267 self.c_empty_look(prog
::EmptyLook
::EndText
)
270 self.compiled
.has_unicode_word_boundary
= true;
271 self.byte_classes
.set_word_boundary();
272 self.c_empty_look(prog
::EmptyLook
::WordBoundary
)
275 self.compiled
.has_unicode_word_boundary
= true;
276 self.byte_classes
.set_word_boundary();
277 self.c_empty_look(prog
::EmptyLook
::NotWordBoundary
)
279 WordBoundaryAscii
=> {
280 self.byte_classes
.set_word_boundary();
281 self.c_empty_look(prog
::EmptyLook
::WordBoundaryAscii
)
283 NotWordBoundaryAscii
=> {
284 self.byte_classes
.set_word_boundary();
285 self.c_empty_look(prog
::EmptyLook
::NotWordBoundaryAscii
)
287 Group { ref e, i: None, name: None }
=> self.c(e
),
288 Group { ref e, i, ref name }
=> {
289 // it's impossible to have a named capture without an index
290 let i
= i
.expect("capture index");
291 if i
>= self.compiled
.captures
.len() {
292 self.compiled
.captures
.push(name
.clone());
293 if let Some(ref name
) = *name
{
294 self.capture_name_idx
.insert(name
.to_owned(), i
);
297 self.c_capture(2 * i
, e
)
300 if self.compiled
.is_reverse
{
301 self.c_concat(es
.iter().rev())
306 Alternate(ref es
) => self.c_alternate(&**es
),
307 Repeat { ref e, r, greedy }
=> self.c_repeat(e
, r
, greedy
),
311 fn c_capture(&mut self, first_slot
: usize, expr
: &Expr
) -> Result
{
312 if self.num_exprs
> 1 || self.compiled
.is_dfa
{
313 // Don't ever compile Save instructions for regex sets because
314 // they are never used. They are also never used in DFA programs
315 // because DFAs can't handle captures.
318 let entry
= self.insts
.len();
319 let hole
= self.push_hole(InstHole
::Save { slot: first_slot }
);
320 let patch
= try
!(self.c(expr
));
321 self.fill(hole
, patch
.entry
);
322 self.fill_to_next(patch
.hole
);
323 let hole
= self.push_hole(InstHole
::Save { slot: first_slot + 1 }
);
324 Ok(Patch { hole: hole, entry: entry }
)
328 fn c_dotstar(&mut self) -> Result
{
329 Ok(if !self.compiled
.only_utf8() {
330 try
!(self.c(&Expr
::Repeat
{
331 e
: Box
::new(Expr
::AnyByte
),
332 r
: Repeater
::ZeroOrMore
,
336 try
!(self.c(&Expr
::Repeat
{
337 e
: Box
::new(Expr
::AnyChar
),
338 r
: Repeater
::ZeroOrMore
,
344 fn c_literal(&mut self, chars
: &[char], casei
: bool
) -> Result
{
345 debug_assert
!(!chars
.is_empty());
346 let mut chars
: Box
<Iterator
<Item
=&char>> =
347 if self.compiled
.is_reverse
{
348 Box
::new(chars
.iter().rev())
350 Box
::new(chars
.iter())
352 let first
= *chars
.next().expect("non-empty literal");
353 let Patch { mut hole, entry }
= try
!(self.c_char(first
, casei
));
355 let p
= try
!(self.c_char(c
, casei
));
356 self.fill(hole
, p
.entry
);
359 Ok(Patch { hole: hole, entry: entry }
)
362 fn c_char(&mut self, c
: char, casei
: bool
) -> Result
{
364 self.c_class(&CharClass
::new(vec
![
365 ClassRange { start: c, end: c }
,
368 self.c_class(&[ClassRange { start: c, end: c }
])
372 fn c_class(&mut self, ranges
: &[ClassRange
]) -> Result
{
373 assert
!(!ranges
.is_empty());
374 if self.compiled
.uses_bytes() {
380 let ranges
: Vec
<(char, char)> =
381 ranges
.iter().map(|r
| (r
.start
, r
.end
)).collect();
382 let hole
= if ranges
.len() == 1 && ranges
[0].0 == ranges
[0].1 {
383 self.push_hole(InstHole
::Char { c: ranges[0].0 }
)
385 self.push_hole(InstHole
::Ranges { ranges: ranges }
)
387 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
391 fn c_bytes(&mut self, bytes
: &[u8], casei
: bool
) -> Result
{
392 debug_assert
!(!bytes
.is_empty());
393 let mut bytes
: Box
<Iterator
<Item
=&u8>> =
394 if self.compiled
.is_reverse
{
395 Box
::new(bytes
.iter().rev())
397 Box
::new(bytes
.iter())
399 let first
= *bytes
.next().expect("non-empty literal");
400 let Patch { mut hole, entry }
= try
!(self.c_byte(first
, casei
));
402 let p
= try
!(self.c_byte(b
, casei
));
403 self.fill(hole
, p
.entry
);
406 Ok(Patch { hole: hole, entry: entry }
)
409 fn c_byte(&mut self, b
: u8, casei
: bool
) -> Result
{
411 self.c_class_bytes(&ByteClass
::new(vec
![
412 ByteRange { start: b, end: b }
,
415 self.c_class_bytes(&[ByteRange { start: b, end: b }
])
419 fn c_class_bytes(&mut self, ranges
: &[ByteRange
]) -> Result
{
420 debug_assert
!(!ranges
.is_empty());
422 let first_split_entry
= self.insts
.len();
423 let mut holes
= vec
![];
424 let mut prev_hole
= Hole
::None
;
425 for r
in &ranges
[0..ranges
.len() - 1] {
426 self.fill_to_next(prev_hole
);
427 let split
= self.push_split_hole();
428 let next
= self.insts
.len();
429 self.byte_classes
.set_range(r
.start
, r
.end
);
430 holes
.push(self.push_hole(InstHole
::Bytes
{
431 start
: r
.start
, end
: r
.end
,
433 prev_hole
= self.fill_split(split
, Some(next
), None
);
435 let next
= self.insts
.len();
436 let r
= &ranges
[ranges
.len() - 1];
437 self.byte_classes
.set_range(r
.start
, r
.end
);
438 holes
.push(self.push_hole(InstHole
::Bytes
{
439 start
: r
.start
, end
: r
.end
,
441 self.fill(prev_hole
, next
);
442 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
445 fn c_empty_look(&mut self, look
: EmptyLook
) -> Result
{
446 let hole
= self.push_hole(InstHole
::EmptyLook { look: look }
);
447 Ok(Patch { hole: hole, entry: self.insts.len() - 1 }
)
450 fn c_concat
<'a
, I
>(&mut self, exprs
: I
) -> Result
451 where I
: IntoIterator
<Item
=&'a Expr
> {
452 let mut exprs
= exprs
.into_iter();
453 let first
= match exprs
.next() {
456 return Ok(Patch { hole: Hole::None, entry: self.insts.len() }
)
459 let Patch { mut hole, entry }
= try
!(self.c(first
));
461 let p
= try
!(self.c(e
));
462 self.fill(hole
, p
.entry
);
465 Ok(Patch { hole: hole, entry: entry }
)
468 fn c_alternate(&mut self, exprs
: &[Expr
]) -> Result
{
470 exprs
.len() >= 2, "alternates must have at least 2 exprs");
472 // Initial entry point is always the first split.
473 let first_split_entry
= self.insts
.len();
475 // Save up all of the holes from each alternate. They will all get
476 // patched to point to the same location.
477 let mut holes
= vec
![];
479 let mut prev_hole
= Hole
::None
;
480 for e
in &exprs
[0..exprs
.len() - 1] {
481 self.fill_to_next(prev_hole
);
482 let split
= self.push_split_hole();
483 let Patch { hole, entry }
= try
!(self.c(e
));
485 prev_hole
= self.fill_split(split
, Some(entry
), None
);
487 let Patch { hole, entry }
= try
!(self.c(&exprs
[exprs
.len() - 1]));
489 self.fill(prev_hole
, entry
);
490 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }
)
500 Repeater
::ZeroOrOne
=> self.c_repeat_zero_or_one(expr
, greedy
),
501 Repeater
::ZeroOrMore
=> self.c_repeat_zero_or_more(expr
, greedy
),
502 Repeater
::OneOrMore
=> self.c_repeat_one_or_more(expr
, greedy
),
503 Repeater
::Range { min, max: None }
=> {
504 self.c_repeat_range_min_or_more(expr
, greedy
, min
)
506 Repeater
::Range { min, max: Some(max) }
=> {
507 self.c_repeat_range(expr
, greedy
, min
, max
)
512 fn c_repeat_zero_or_one(
517 let split_entry
= self.insts
.len();
518 let split
= self.push_split_hole();
519 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
521 let split_hole
= if greedy
{
522 self.fill_split(split
, Some(entry_rep
), None
)
524 self.fill_split(split
, None
, Some(entry_rep
))
526 let holes
= vec
![hole_rep
, split_hole
];
527 Ok(Patch { hole: Hole::Many(holes), entry: split_entry }
)
530 fn c_repeat_zero_or_more(
535 let split_entry
= self.insts
.len();
536 let split
= self.push_split_hole();
537 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
539 self.fill(hole_rep
, split_entry
);
540 let split_hole
= if greedy
{
541 self.fill_split(split
, Some(entry_rep
), None
)
543 self.fill_split(split
, None
, Some(entry_rep
))
545 Ok(Patch { hole: split_hole, entry: split_entry }
)
548 fn c_repeat_one_or_more(
553 let Patch { hole: hole_rep, entry: entry_rep }
= try
!(self.c(expr
));
554 self.fill_to_next(hole_rep
);
555 let split
= self.push_split_hole();
557 let split_hole
= if greedy
{
558 self.fill_split(split
, Some(entry_rep
), None
)
560 self.fill_split(split
, None
, Some(entry_rep
))
562 Ok(Patch { hole: split_hole, entry: entry_rep }
)
565 fn c_repeat_range_min_or_more(
571 let min
= u32_to_usize(min
);
572 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
573 let patch_rep
= try
!(self.c_repeat_zero_or_more(expr
, greedy
));
574 self.fill(patch_concat
.hole
, patch_rep
.entry
);
575 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry }
)
585 let (min
, max
) = (u32_to_usize(min
), u32_to_usize(max
));
586 let patch_concat
= try
!(self.c_concat(iter
::repeat(expr
).take(min
)));
587 let initial_entry
= patch_concat
.entry
;
589 return Ok(patch_concat
);
591 // It is much simpler to compile, e.g., `a{2,5}` as:
595 // But you end up with a sequence of instructions like this:
607 // This is *incredibly* inefficient because the splits end
608 // up forming a chain, which has to be resolved everything a
609 // transition is followed.
610 let mut holes
= vec
![];
611 let mut prev_hole
= patch_concat
.hole
;
613 self.fill_to_next(prev_hole
);
614 let split
= self.push_split_hole();
615 let Patch { hole, entry }
= try
!(self.c(expr
));
618 holes
.push(self.fill_split(split
, Some(entry
), None
));
620 holes
.push(self.fill_split(split
, None
, Some(entry
)));
623 holes
.push(prev_hole
);
624 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry }
)
627 fn fill(&mut self, hole
: Hole
, goto
: InstPtr
) {
631 self.insts
[pc
].fill(goto
);
633 Hole
::Many(holes
) => {
635 self.fill(hole
, goto
);
641 fn fill_to_next(&mut self, hole
: Hole
) {
642 let next
= self.insts
.len();
643 self.fill(hole
, next
);
649 goto1
: Option
<InstPtr
>,
650 goto2
: Option
<InstPtr
>,
653 Hole
::None
=> Hole
::None
,
655 match (goto1
, goto2
) {
656 (Some(goto1
), Some(goto2
)) => {
657 self.insts
[pc
].fill_split(goto1
, goto2
);
660 (Some(goto1
), None
) => {
661 self.insts
[pc
].half_fill_split_goto1(goto1
);
664 (None
, Some(goto2
)) => {
665 self.insts
[pc
].half_fill_split_goto2(goto2
);
668 (None
, None
) => unreachable
!("at least one of the split \
669 holes must be filled"),
672 Hole
::Many(holes
) => {
673 let mut new_holes
= vec
![];
675 new_holes
.push(self.fill_split(hole
, goto1
, goto2
));
677 if new_holes
.is_empty() {
679 } else if new_holes
.len() == 1 {
680 new_holes
.pop().unwrap()
682 Hole
::Many(new_holes
)
688 fn push_compiled(&mut self, inst
: Inst
) {
689 self.insts
.push(MaybeInst
::Compiled(inst
));
692 fn push_hole(&mut self, inst
: InstHole
) -> Hole
{
693 let hole
= self.insts
.len();
694 self.insts
.push(MaybeInst
::Uncompiled(inst
));
698 fn push_split_hole(&mut self) -> Hole
{
699 let hole
= self.insts
.len();
700 self.insts
.push(MaybeInst
::Split
);
704 fn check_size(&self) -> result
::Result
<(), Error
> {
705 use std
::mem
::size_of
;
707 if self.insts
.len() * size_of
::<Inst
>() > self.size_limit
{
708 Err(Error
::CompiledTooBig(self.size_limit
))
722 #[derive(Clone, Debug)]
725 Uncompiled(InstHole
),
732 fn fill(&mut self, goto
: InstPtr
) {
733 let filled
= match *self {
734 MaybeInst
::Uncompiled(ref inst
) => inst
.fill(goto
),
735 MaybeInst
::Split1(goto1
) => {
736 Inst
::Split(InstSplit { goto1: goto1, goto2: goto }
)
738 MaybeInst
::Split2(goto2
) => {
739 Inst
::Split(InstSplit { goto1: goto, goto2: goto2 }
)
741 _
=> unreachable
!("not all instructions were compiled! \
742 found uncompiled instruction: {:?}", self),
744 *self = MaybeInst
::Compiled(filled
);
747 fn fill_split(&mut self, goto1
: InstPtr
, goto2
: InstPtr
) {
748 let filled
= match *self {
749 MaybeInst
::Split
=> {
750 Inst
::Split(InstSplit { goto1: goto1, goto2: goto2 }
)
752 _
=> unreachable
!("must be called on Split instruction, \
753 instead it was called on: {:?}", self),
755 *self = MaybeInst
::Compiled(filled
);
758 fn half_fill_split_goto1(&mut self, goto1
: InstPtr
) {
759 let half_filled
= match *self {
760 MaybeInst
::Split
=> goto1
,
761 _
=> unreachable
!("must be called on Split instruction, \
762 instead it was called on: {:?}", self),
764 *self = MaybeInst
::Split1(half_filled
);
767 fn half_fill_split_goto2(&mut self, goto2
: InstPtr
) {
768 let half_filled
= match *self {
769 MaybeInst
::Split
=> goto2
,
770 _
=> unreachable
!("must be called on Split instruction, \
771 instead it was called on: {:?}", self),
773 *self = MaybeInst
::Split2(half_filled
);
776 fn unwrap(self) -> Inst
{
778 MaybeInst
::Compiled(inst
) => inst
,
779 _
=> unreachable
!("must be called on a compiled instruction, \
780 instead it was called on: {:?}", self),
785 #[derive(Clone, Debug)]
787 Save { slot: usize }
,
788 EmptyLook { look: EmptyLook }
,
790 Ranges { ranges: Vec<(char, char)> }
,
791 Bytes { start: u8, end: u8 }
,
795 fn fill(&self, goto
: InstPtr
) -> Inst
{
797 InstHole
::Save { slot }
=> Inst
::Save(InstSave
{
801 InstHole
::EmptyLook { look }
=> Inst
::EmptyLook(InstEmptyLook
{
805 InstHole
::Char { c }
=> Inst
::Char(InstChar
{
809 InstHole
::Ranges { ref ranges }
=> Inst
::Ranges(InstRanges
{
811 ranges
: ranges
.clone(),
813 InstHole
::Bytes { start, end }
=> Inst
::Bytes(InstBytes
{
822 struct CompileClass
<'a
, 'b
> {
824 ranges
: &'b
[ClassRange
],
827 impl<'a
, 'b
> CompileClass
<'a
, 'b
> {
828 fn compile(mut self) -> Result
{
829 let mut holes
= vec
![];
830 let mut initial_entry
= None
;
831 let mut last_split
= Hole
::None
;
832 let mut utf8_seqs
= self.c
.utf8_seqs
.take().unwrap();
833 self.c
.suffix_cache
.clear();
835 for (i
, ref range
) in self.ranges
.iter().enumerate() {
836 let is_last_range
= i
+ 1 == self.ranges
.len();
837 utf8_seqs
.reset(range
.start
, range
.end
);
838 let mut it
= (&mut utf8_seqs
).peekable();
840 let utf8_seq
= match it
.next() {
842 Some(utf8_seq
) => utf8_seq
,
844 if is_last_range
&& it
.peek().is_none() {
845 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
847 self.c
.fill(last_split
, entry
);
848 last_split
= Hole
::None
;
849 if initial_entry
.is_none() {
850 initial_entry
= Some(entry
);
853 if initial_entry
.is_none() {
854 initial_entry
= Some(self.c
.insts
.len());
856 self.c
.fill_to_next(last_split
);
857 last_split
= self.c
.push_split_hole();
858 let Patch { hole, entry }
= try
!(self.c_utf8_seq(&utf8_seq
));
860 last_split
= self.c
.fill_split(last_split
, Some(entry
), None
);
864 self.c
.utf8_seqs
= Some(utf8_seqs
);
866 hole
: Hole
::Many(holes
),
867 entry
: initial_entry
.unwrap(),
871 fn c_utf8_seq(&mut self, seq
: &Utf8Sequence
) -> Result
{
872 if self.c
.compiled
.is_reverse
{
873 self.c_utf8_seq_(seq
)
875 self.c_utf8_seq_(seq
.into_iter().rev())
879 fn c_utf8_seq_
<'r
, I
>(&mut self, seq
: I
) -> Result
880 where I
: IntoIterator
<Item
=&'r Utf8Range
> {
881 // The initial instruction for each UTF-8 sequence should be the same.
882 let mut from_inst
= ::std
::usize::MAX
;
883 let mut last_hole
= Hole
::None
;
884 for byte_range
in seq
{
885 let key
= SuffixCacheKey
{
886 from_inst
: from_inst
,
887 start
: byte_range
.start
,
891 let pc
= self.c
.insts
.len();
892 if let Some(cached_pc
) = self.c
.suffix_cache
.get(key
, pc
) {
893 from_inst
= cached_pc
;
897 self.c
.byte_classes
.set_range(byte_range
.start
, byte_range
.end
);
898 if from_inst
== ::std
::usize::MAX
{
899 last_hole
= self.c
.push_hole(InstHole
::Bytes
{
900 start
: byte_range
.start
,
904 self.c
.push_compiled(Inst
::Bytes(InstBytes
{
906 start
: byte_range
.start
,
910 from_inst
= self.c
.insts
.len().checked_sub(1).unwrap();
911 debug_assert
!(from_inst
< ::std
::usize::MAX
);
913 debug_assert
!(from_inst
< ::std
::usize::MAX
);
914 Ok(Patch { hole: last_hole, entry: from_inst }
)
918 /// SuffixCache is a simple bounded hash map for caching suffix entries in
919 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
920 /// The set of byte ranges looks like this:
924 /// [E0][A0-BF][80-BF]
925 /// [E1-EC][80-BF][80-BF]
926 /// [ED][80-9F][80-BF]
927 /// [EE-EF][80-BF][80-BF]
929 /// Each line above translates to one alternate in the compiled regex program.
930 /// However, all but one of the alternates end in the same suffix, which is
931 /// a waste of an instruction. The suffix cache facilitates reusing them across
934 /// Note that a HashMap could be trivially used for this, but we don't need its
935 /// overhead. Some small bounded space (LRU style) is more than enough.
937 table
: Vec
<SuffixCacheEntry
>,
938 // Every time the cache is cleared, we increment the version number instead
939 // of actually zeroing memory. Since we store a copy of the current version
940 // in every element, all we need to do is make sure to invalidate any stale
941 // entries upon access. This saves quite a bit of time!
945 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
946 struct SuffixCacheEntry
{
952 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
953 struct SuffixCacheKey
{
960 fn new(size
: usize) -> Self {
962 table
: vec
![SuffixCacheEntry
::default(); size
],
967 fn get(&mut self, key
: SuffixCacheKey
, pc
: InstPtr
) -> Option
<InstPtr
> {
968 let h
= self.hash(&key
);
969 let e
= self.table
[h
];
970 if e
.key
== key
&& e
.version
== self.version
{
973 self.table
[h
] = SuffixCacheEntry
{
976 version
: self.version
,
982 fn clear(&mut self) {
986 fn hash(&self, suffix
: &SuffixCacheKey
) -> usize {
987 // Basic FNV-1a hash as described:
988 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
989 const FNV_PRIME
: u64 = 1099511628211;
990 let mut h
= 14695981039346656037;
991 h
= (h ^
(suffix
.from_inst
as u64)).wrapping_mul(FNV_PRIME
);
992 h
= (h ^
(suffix
.start
as u64)).wrapping_mul(FNV_PRIME
);
993 h
= (h ^
(suffix
.end
as u64)).wrapping_mul(FNV_PRIME
);
994 (h
as usize) % self.table
.len()
998 struct ByteClassSet([bool
; 256]);
1002 ByteClassSet([false; 256])
1005 fn set_range(&mut self, start
: u8, end
: u8) {
1006 debug_assert
!(start
<= end
);
1008 self.0[start
as usize - 1] = true;
1010 self.0[end
as usize] = true;
1013 fn set_word_boundary(&mut self) {
1014 // We need to mark all ranges of bytes whose pairs result in
1015 // evaluating \b differently.
1016 let iswb
= is_word_byte
;
1017 let mut b1
: u16 = 0;
1021 while b2
<= 255 && iswb(b1
as u8) == iswb(b2
as u8) {
1024 self.set_range(b1
as u8, (b2
- 1) as u8);
1029 fn byte_classes(&self) -> Vec
<u8> {
1030 // N.B. If you're debugging the DFA, it's useful to simply return
1031 // `(0..256).collect()`, which effectively removes the byte classes
1032 // and makes the transitions easier to read.
1033 // (0usize..256).map(|x| x as u8).collect()
1034 let mut byte_classes
= vec
![0; 256];
1035 let mut class
= 0u8;
1037 byte_classes
[i
] = class
;
1039 class
= class
.checked_add(1).unwrap();
1046 fn u32_to_usize(n
: u32) -> usize {
1047 if (n
as u64) > (::std
::usize::MAX
as u64) {
1048 panic
!("BUG: {} is too big to be pointer sized", n
)
1055 use super::ByteClassSet
;
1059 let mut set
= ByteClassSet
::new();
1060 set
.set_range(b'a'
, b'z'
);
1061 let classes
= set
.byte_classes();
1062 assert_eq
!(classes
[0], 0);
1063 assert_eq
!(classes
[1], 0);
1064 assert_eq
!(classes
[2], 0);
1065 assert_eq
!(classes
[b'a'
as usize - 1], 0);
1066 assert_eq
!(classes
[b'a'
as usize], 1);
1067 assert_eq
!(classes
[b'm'
as usize], 1);
1068 assert_eq
!(classes
[b'z'
as usize], 1);
1069 assert_eq
!(classes
[b'z'
as usize + 1], 2);
1070 assert_eq
!(classes
[254], 2);
1071 assert_eq
!(classes
[255], 2);
1073 let mut set
= ByteClassSet
::new();
1074 set
.set_range(0, 2);
1075 set
.set_range(4, 6);
1076 let classes
= set
.byte_classes();
1077 assert_eq
!(classes
[0], 0);
1078 assert_eq
!(classes
[1], 0);
1079 assert_eq
!(classes
[2], 0);
1080 assert_eq
!(classes
[3], 1);
1081 assert_eq
!(classes
[4], 2);
1082 assert_eq
!(classes
[5], 2);
1083 assert_eq
!(classes
[6], 2);
1084 assert_eq
!(classes
[7], 3);
1085 assert_eq
!(classes
[255], 3);