]> git.proxmox.com Git - cargo.git/blob - vendor/regex-0.2.5/src/compile.rs
New upstream version 0.24.0
[cargo.git] / vendor / regex-0.2.5 / src / compile.rs
1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::collections::HashMap;
12 use std::iter;
13 use std::result;
14 use std::sync::Arc;
15
16 use syntax::{
17 Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange,
18 is_word_byte,
19 };
20 use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
21
22 use prog::{
23 Program, Inst, InstPtr, EmptyLook,
24 InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
25 };
26
27 use Error;
28
29 type Result = result::Result<Patch, Error>;
30
31 #[derive(Debug)]
32 struct Patch {
33 hole: Hole,
34 entry: InstPtr,
35 }
36
37 /// A compiler translates a regular expression AST to a sequence of
38 /// instructions. The sequence of instructions represents an NFA.
39 pub struct Compiler {
40 insts: Vec<MaybeInst>,
41 compiled: Program,
42 capture_name_idx: HashMap<String, usize>,
43 num_exprs: usize,
44 size_limit: usize,
45 suffix_cache: SuffixCache,
46 utf8_seqs: Option<Utf8Sequences>,
47 byte_classes: ByteClassSet,
48 }
49
50 impl Compiler {
51 /// Create a new regular expression compiler.
52 ///
53 /// Various options can be set before calling `compile` on an expression.
54 pub fn new() -> Self {
55 Compiler {
56 insts: vec![],
57 compiled: Program::new(),
58 capture_name_idx: HashMap::new(),
59 num_exprs: 0,
60 size_limit: 10 * (1 << 20),
61 suffix_cache: SuffixCache::new(1000),
62 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
63 byte_classes: ByteClassSet::new(),
64 }
65 }
66
67 /// The size of the resulting program is limited by size_limit. If
68 /// the program approximately exceeds the given size (in bytes), then
69 /// compilation will stop and return an error.
70 pub fn size_limit(mut self, size_limit: usize) -> Self {
71 self.size_limit = size_limit;
72 self
73 }
74
75 /// If bytes is true, then the program is compiled as a byte based
76 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
77 /// false, then the automaton is Unicode scalar value based, e.g., an
78 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
79 ///
80 /// The specific invariant is that when returning a byte based machine,
81 /// the neither the `Char` nor `Ranges` instructions are produced.
82 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
83 /// instruction is never produced.
84 ///
85 /// Note that `dfa(true)` implies `bytes(true)`.
86 pub fn bytes(mut self, yes: bool) -> Self {
87 self.compiled.is_bytes = yes;
88 self
89 }
90
91 /// When disabled, the program compiled may match arbitrary bytes.
92 ///
93 /// When enabled (the default), all compiled programs exclusively match
94 /// valid UTF-8 bytes.
95 pub fn only_utf8(mut self, yes: bool) -> Self {
96 self.compiled.only_utf8 = yes;
97 self
98 }
99
100 /// When set, the machine returned is suitable for use in the DFA matching
101 /// engine.
102 ///
103 /// In particular, this ensures that if the regex is not anchored in the
104 /// beginning, then a preceding `.*?` is included in the program. (The NFA
105 /// based engines handle the preceding `.*?` explicitly, which is difficult
106 /// or impossible in the DFA engine.)
107 pub fn dfa(mut self, yes: bool) -> Self {
108 self.compiled.is_dfa = yes;
109 self
110 }
111
112 /// When set, the machine returned is suitable for matching text in
113 /// reverse. In particular, all concatenations are flipped.
114 pub fn reverse(mut self, yes: bool) -> Self {
115 self.compiled.is_reverse = yes;
116 self
117 }
118
119 /// Compile a regular expression given its AST.
120 ///
121 /// The compiler is guaranteed to succeed unless the program exceeds the
122 /// specified size limit. If the size limit is exceeded, then compilation
123 /// stops and returns an error.
124 pub fn compile(
125 mut self,
126 exprs: &[Expr],
127 ) -> result::Result<Program, Error> {
128 debug_assert!(exprs.len() >= 1);
129 self.num_exprs = exprs.len();
130 if exprs.len() == 1 {
131 self.compile_one(&exprs[0])
132 } else {
133 self.compile_many(exprs)
134 }
135 }
136
137 fn compile_one(mut self, expr: &Expr) -> result::Result<Program, Error> {
138 // If we're compiling a forward DFA and we aren't anchored, then
139 // add a `.*?` before the first capture group.
140 // Other matching engines handle this by baking the logic into the
141 // matching engine itself.
142 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
143 self.compiled.is_anchored_start = expr.is_anchored_start();
144 self.compiled.is_anchored_end = expr.is_anchored_end();
145 if self.compiled.needs_dotstar() {
146 dotstar_patch = try!(self.c_dotstar());
147 self.compiled.start = dotstar_patch.entry;
148 }
149 self.compiled.captures = vec![None];
150 let patch = try!(self.c_capture(0, expr));
151 if self.compiled.needs_dotstar() {
152 self.fill(dotstar_patch.hole, patch.entry);
153 } else {
154 self.compiled.start = patch.entry;
155 }
156 self.fill_to_next(patch.hole);
157 self.compiled.matches = vec![self.insts.len()];
158 self.push_compiled(Inst::Match(0));
159 self.compile_finish()
160 }
161
162 fn compile_many(
163 mut self,
164 exprs: &[Expr],
165 ) -> result::Result<Program, Error> {
166 debug_assert!(exprs.len() > 1);
167
168 self.compiled.is_anchored_start =
169 exprs.iter().all(|e| e.is_anchored_start());
170 self.compiled.is_anchored_end =
171 exprs.iter().all(|e| e.is_anchored_end());
172 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
173 if self.compiled.needs_dotstar() {
174 dotstar_patch = try!(self.c_dotstar());
175 self.compiled.start = dotstar_patch.entry;
176 } else {
177 self.compiled.start = 0; // first instruction is always split
178 }
179 self.fill_to_next(dotstar_patch.hole);
180
181 let mut prev_hole = Hole::None;
182 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
183 self.fill_to_next(prev_hole);
184 let split = self.push_split_hole();
185 let Patch { hole, entry } = try!(self.c_capture(0, expr));
186 self.fill_to_next(hole);
187 self.compiled.matches.push(self.insts.len());
188 self.push_compiled(Inst::Match(i));
189 prev_hole = self.fill_split(split, Some(entry), None);
190 }
191 let i = exprs.len() - 1;
192 let Patch { hole, entry } = try!(self.c_capture(0, &exprs[i]));
193 self.fill(prev_hole, entry);
194 self.fill_to_next(hole);
195 self.compiled.matches.push(self.insts.len());
196 self.push_compiled(Inst::Match(i));
197 self.compile_finish()
198 }
199
200 fn compile_finish(mut self) -> result::Result<Program, Error> {
201 self.compiled.insts =
202 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
203 self.compiled.byte_classes = self.byte_classes.byte_classes();
204 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
205 Ok(self.compiled)
206 }
207
208 /// Compile expr into self.insts, returning a patch on success,
209 /// or an error if we run out of memory.
210 ///
211 /// All of the c_* methods of the compiler share the contract outlined
212 /// here.
213 ///
214 /// The main thing that a c_* method does is mutate `self.insts`
215 /// to add a list of mostly compiled instructions required to execute
216 /// the given expression. `self.insts` contains MaybeInsts rather than
217 /// Insts because there is some backpatching required.
218 ///
219 /// The `Patch` value returned by each c_* method provides metadata
220 /// about the compiled instructions emitted to `self.insts`. The
221 /// `entry` member of the patch refers to the first instruction
222 /// (the entry point), while the `hole` member contains zero or
223 /// more offsets to partial instructions that need to be backpatched.
224 /// The c_* routine can't know where its list of instructions are going to
225 /// jump to after execution, so it is up to the caller to patch
226 /// these jumps to point to the right place. So compiling some
227 /// expression, e, we would end up with a situation that looked like:
228 ///
229 /// ```text
230 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
231 /// ^ ^ ^
232 /// | \ /
233 /// entry \ /
234 /// hole
235 /// ```
236 ///
237 /// To compile two expressions, e1 and e2, concatinated together we
238 /// would do:
239 ///
240 /// ```ignore
241 /// let patch1 = self.c(e1);
242 /// let patch2 = self.c(e2);
243 /// ```
244 ///
245 /// while leaves us with a situation that looks like
246 ///
247 /// ```text
248 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
249 /// ^ ^ ^ ^
250 /// | | | |
251 /// entry1 hole1 entry2 hole2
252 /// ```
253 ///
254 /// Then to merge the two patches together into one we would backpatch
255 /// hole1 with entry2 and return a new patch that enters at entry1
256 /// and has hole2 for a hole. In fact, if you look at the c_concat
257 /// method you will see that it does exactly this, though it handles
258 /// a list of expressions rather than just the two that we use for
259 /// an example.
260 fn c(&mut self, expr: &Expr) -> Result {
261 use prog;
262 use syntax::Expr::*;
263
264 try!(self.check_size());
265 match *expr {
266 Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
267 Literal { ref chars, casei } => self.c_literal(chars, casei),
268 LiteralBytes { ref bytes, casei } => self.c_bytes(bytes, casei),
269 AnyChar => self.c_class(&[ClassRange {
270 start: '\x00',
271 end: '\u{10ffff}',
272 }]),
273 AnyCharNoNL => {
274 self.c_class(&[
275 ClassRange { start: '\x00', end: '\x09' },
276 ClassRange { start: '\x0b', end: '\u{10ffff}' },
277 ])
278 }
279 AnyByte => {
280 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }])
281 }
282 AnyByteNoNL => {
283 self.c_class_bytes(&[
284 ByteRange { start: 0, end: 0x9 },
285 ByteRange { start: 0xB, end: 0xFF },
286 ])
287 }
288 Class(ref cls) => {
289 self.c_class(cls)
290 }
291 ClassBytes(ref cls) => {
292 self.c_class_bytes(cls)
293 }
294 StartLine if self.compiled.is_reverse => {
295 self.byte_classes.set_range(b'\n', b'\n');
296 self.c_empty_look(prog::EmptyLook::EndLine)
297 }
298 StartLine => {
299 self.byte_classes.set_range(b'\n', b'\n');
300 self.c_empty_look(prog::EmptyLook::StartLine)
301 }
302 EndLine if self.compiled.is_reverse => {
303 self.byte_classes.set_range(b'\n', b'\n');
304 self.c_empty_look(prog::EmptyLook::StartLine)
305 }
306 EndLine => {
307 self.byte_classes.set_range(b'\n', b'\n');
308 self.c_empty_look(prog::EmptyLook::EndLine)
309 }
310 StartText if self.compiled.is_reverse => {
311 self.c_empty_look(prog::EmptyLook::EndText)
312 }
313 StartText => {
314 self.c_empty_look(prog::EmptyLook::StartText)
315 }
316 EndText if self.compiled.is_reverse => {
317 self.c_empty_look(prog::EmptyLook::StartText)
318 }
319 EndText => {
320 self.c_empty_look(prog::EmptyLook::EndText)
321 }
322 WordBoundary => {
323 self.compiled.has_unicode_word_boundary = true;
324 self.byte_classes.set_word_boundary();
325 self.c_empty_look(prog::EmptyLook::WordBoundary)
326 }
327 NotWordBoundary => {
328 self.compiled.has_unicode_word_boundary = true;
329 self.byte_classes.set_word_boundary();
330 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
331 }
332 WordBoundaryAscii => {
333 self.byte_classes.set_word_boundary();
334 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
335 }
336 NotWordBoundaryAscii => {
337 self.byte_classes.set_word_boundary();
338 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
339 }
340 Group { ref e, i: None, name: None } => self.c(e),
341 Group { ref e, i, ref name } => {
342 // it's impossible to have a named capture without an index
343 let i = i.expect("capture index");
344 if i >= self.compiled.captures.len() {
345 self.compiled.captures.push(name.clone());
346 if let Some(ref name) = *name {
347 self.capture_name_idx.insert(name.to_owned(), i);
348 }
349 }
350 self.c_capture(2 * i, e)
351 }
352 Concat(ref es) => {
353 if self.compiled.is_reverse {
354 self.c_concat(es.iter().rev())
355 } else {
356 self.c_concat(es)
357 }
358 }
359 Alternate(ref es) => self.c_alternate(&**es),
360 Repeat { ref e, r, greedy } => self.c_repeat(e, r, greedy),
361 }
362 }
363
364 fn c_capture(&mut self, first_slot: usize, expr: &Expr) -> Result {
365 if self.num_exprs > 1 || self.compiled.is_dfa {
366 // Don't ever compile Save instructions for regex sets because
367 // they are never used. They are also never used in DFA programs
368 // because DFAs can't handle captures.
369 self.c(expr)
370 } else {
371 let entry = self.insts.len();
372 let hole = self.push_hole(InstHole::Save { slot: first_slot });
373 let patch = try!(self.c(expr));
374 self.fill(hole, patch.entry);
375 self.fill_to_next(patch.hole);
376 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
377 Ok(Patch { hole: hole, entry: entry })
378 }
379 }
380
381 fn c_dotstar(&mut self) -> Result {
382 Ok(if !self.compiled.only_utf8() {
383 try!(self.c(&Expr::Repeat {
384 e: Box::new(Expr::AnyByte),
385 r: Repeater::ZeroOrMore,
386 greedy: false,
387 }))
388 } else {
389 try!(self.c(&Expr::Repeat {
390 e: Box::new(Expr::AnyChar),
391 r: Repeater::ZeroOrMore,
392 greedy: false,
393 }))
394 })
395 }
396
397 fn c_literal(&mut self, chars: &[char], casei: bool) -> Result {
398 debug_assert!(!chars.is_empty());
399 let mut chars: Box<Iterator<Item=&char>> =
400 if self.compiled.is_reverse {
401 Box::new(chars.iter().rev())
402 } else {
403 Box::new(chars.iter())
404 };
405 let first = *chars.next().expect("non-empty literal");
406 let Patch { mut hole, entry } = try!(self.c_char(first, casei));
407 for &c in chars {
408 let p = try!(self.c_char(c, casei));
409 self.fill(hole, p.entry);
410 hole = p.hole;
411 }
412 Ok(Patch { hole: hole, entry: entry })
413 }
414
415 fn c_char(&mut self, c: char, casei: bool) -> Result {
416 if casei {
417 self.c_class(&CharClass::new(vec![
418 ClassRange { start: c, end: c },
419 ]).case_fold())
420 } else {
421 self.c_class(&[ClassRange { start: c, end: c }])
422 }
423 }
424
425 fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
426 assert!(!ranges.is_empty());
427 if self.compiled.uses_bytes() {
428 CompileClass {
429 c: self,
430 ranges: ranges,
431 }.compile()
432 } else {
433 let ranges: Vec<(char, char)> =
434 ranges.iter().map(|r| (r.start, r.end)).collect();
435 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
436 self.push_hole(InstHole::Char { c: ranges[0].0 })
437 } else {
438 self.push_hole(InstHole::Ranges { ranges: ranges })
439 };
440 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
441 }
442 }
443
444 fn c_bytes(&mut self, bytes: &[u8], casei: bool) -> Result {
445 debug_assert!(!bytes.is_empty());
446 let mut bytes: Box<Iterator<Item=&u8>> =
447 if self.compiled.is_reverse {
448 Box::new(bytes.iter().rev())
449 } else {
450 Box::new(bytes.iter())
451 };
452 let first = *bytes.next().expect("non-empty literal");
453 let Patch { mut hole, entry } = try!(self.c_byte(first, casei));
454 for &b in bytes {
455 let p = try!(self.c_byte(b, casei));
456 self.fill(hole, p.entry);
457 hole = p.hole;
458 }
459 Ok(Patch { hole: hole, entry: entry })
460 }
461
462 fn c_byte(&mut self, b: u8, casei: bool) -> Result {
463 if casei {
464 self.c_class_bytes(&ByteClass::new(vec![
465 ByteRange { start: b, end: b },
466 ]).case_fold())
467 } else {
468 self.c_class_bytes(&[ByteRange { start: b, end: b }])
469 }
470 }
471
472 fn c_class_bytes(&mut self, ranges: &[ByteRange]) -> Result {
473 debug_assert!(!ranges.is_empty());
474
475 let first_split_entry = self.insts.len();
476 let mut holes = vec![];
477 let mut prev_hole = Hole::None;
478 for r in &ranges[0..ranges.len() - 1] {
479 self.fill_to_next(prev_hole);
480 let split = self.push_split_hole();
481 let next = self.insts.len();
482 self.byte_classes.set_range(r.start, r.end);
483 holes.push(self.push_hole(InstHole::Bytes {
484 start: r.start, end: r.end,
485 }));
486 prev_hole = self.fill_split(split, Some(next), None);
487 }
488 let next = self.insts.len();
489 let r = &ranges[ranges.len() - 1];
490 self.byte_classes.set_range(r.start, r.end);
491 holes.push(self.push_hole(InstHole::Bytes {
492 start: r.start, end: r.end,
493 }));
494 self.fill(prev_hole, next);
495 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
496 }
497
498 fn c_empty_look(&mut self, look: EmptyLook) -> Result {
499 let hole = self.push_hole(InstHole::EmptyLook { look: look });
500 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
501 }
502
503 fn c_concat<'a, I>(&mut self, exprs: I) -> Result
504 where I: IntoIterator<Item=&'a Expr> {
505 let mut exprs = exprs.into_iter();
506 let first = match exprs.next() {
507 Some(expr) => expr,
508 None => {
509 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
510 }
511 };
512 let Patch { mut hole, entry } = try!(self.c(first));
513 for e in exprs {
514 let p = try!(self.c(e));
515 self.fill(hole, p.entry);
516 hole = p.hole;
517 }
518 Ok(Patch { hole: hole, entry: entry })
519 }
520
521 fn c_alternate(&mut self, exprs: &[Expr]) -> Result {
522 debug_assert!(
523 exprs.len() >= 2, "alternates must have at least 2 exprs");
524
525 // Initial entry point is always the first split.
526 let first_split_entry = self.insts.len();
527
528 // Save up all of the holes from each alternate. They will all get
529 // patched to point to the same location.
530 let mut holes = vec![];
531
532 let mut prev_hole = Hole::None;
533 for e in &exprs[0..exprs.len() - 1] {
534 self.fill_to_next(prev_hole);
535 let split = self.push_split_hole();
536 let Patch { hole, entry } = try!(self.c(e));
537 holes.push(hole);
538 prev_hole = self.fill_split(split, Some(entry), None);
539 }
540 let Patch { hole, entry } = try!(self.c(&exprs[exprs.len() - 1]));
541 holes.push(hole);
542 self.fill(prev_hole, entry);
543 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
544 }
545
546 fn c_repeat(
547 &mut self,
548 expr: &Expr,
549 kind: Repeater,
550 greedy: bool,
551 ) -> Result {
552 match kind {
553 Repeater::ZeroOrOne => self.c_repeat_zero_or_one(expr, greedy),
554 Repeater::ZeroOrMore => self.c_repeat_zero_or_more(expr, greedy),
555 Repeater::OneOrMore => self.c_repeat_one_or_more(expr, greedy),
556 Repeater::Range { min, max: None } => {
557 self.c_repeat_range_min_or_more(expr, greedy, min)
558 }
559 Repeater::Range { min, max: Some(max) } => {
560 self.c_repeat_range(expr, greedy, min, max)
561 }
562 }
563 }
564
565 fn c_repeat_zero_or_one(
566 &mut self,
567 expr: &Expr,
568 greedy: bool,
569 ) -> Result {
570 let split_entry = self.insts.len();
571 let split = self.push_split_hole();
572 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
573
574 let split_hole = if greedy {
575 self.fill_split(split, Some(entry_rep), None)
576 } else {
577 self.fill_split(split, None, Some(entry_rep))
578 };
579 let holes = vec![hole_rep, split_hole];
580 Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
581 }
582
583 fn c_repeat_zero_or_more(
584 &mut self,
585 expr: &Expr,
586 greedy: bool,
587 ) -> Result {
588 let split_entry = self.insts.len();
589 let split = self.push_split_hole();
590 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
591
592 self.fill(hole_rep, split_entry);
593 let split_hole = if greedy {
594 self.fill_split(split, Some(entry_rep), None)
595 } else {
596 self.fill_split(split, None, Some(entry_rep))
597 };
598 Ok(Patch { hole: split_hole, entry: split_entry })
599 }
600
601 fn c_repeat_one_or_more(
602 &mut self,
603 expr: &Expr,
604 greedy: bool,
605 ) -> Result {
606 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
607 self.fill_to_next(hole_rep);
608 let split = self.push_split_hole();
609
610 let split_hole = if greedy {
611 self.fill_split(split, Some(entry_rep), None)
612 } else {
613 self.fill_split(split, None, Some(entry_rep))
614 };
615 Ok(Patch { hole: split_hole, entry: entry_rep })
616 }
617
618 fn c_repeat_range_min_or_more(
619 &mut self,
620 expr: &Expr,
621 greedy: bool,
622 min: u32,
623 ) -> Result {
624 let min = u32_to_usize(min);
625 let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
626 let patch_rep = try!(self.c_repeat_zero_or_more(expr, greedy));
627 self.fill(patch_concat.hole, patch_rep.entry);
628 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
629 }
630
631 fn c_repeat_range(
632 &mut self,
633 expr: &Expr,
634 greedy: bool,
635 min: u32,
636 max: u32,
637 ) -> Result {
638 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
639 let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
640 let initial_entry = patch_concat.entry;
641 if min == max {
642 return Ok(patch_concat);
643 }
644 // It is much simpler to compile, e.g., `a{2,5}` as:
645 //
646 // aaa?a?a?
647 //
648 // But you end up with a sequence of instructions like this:
649 //
650 // 0: 'a'
651 // 1: 'a',
652 // 2: split(3, 4)
653 // 3: 'a'
654 // 4: split(5, 6)
655 // 5: 'a'
656 // 6: split(7, 8)
657 // 7: 'a'
658 // 8: MATCH
659 //
660 // This is *incredibly* inefficient because the splits end
661 // up forming a chain, which has to be resolved everything a
662 // transition is followed.
663 let mut holes = vec![];
664 let mut prev_hole = patch_concat.hole;
665 for _ in min..max {
666 self.fill_to_next(prev_hole);
667 let split = self.push_split_hole();
668 let Patch { hole, entry } = try!(self.c(expr));
669 prev_hole = hole;
670 if greedy {
671 holes.push(self.fill_split(split, Some(entry), None));
672 } else {
673 holes.push(self.fill_split(split, None, Some(entry)));
674 }
675 }
676 holes.push(prev_hole);
677 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
678 }
679
680 fn fill(&mut self, hole: Hole, goto: InstPtr) {
681 match hole {
682 Hole::None => {}
683 Hole::One(pc) => {
684 self.insts[pc].fill(goto);
685 }
686 Hole::Many(holes) => {
687 for hole in holes {
688 self.fill(hole, goto);
689 }
690 }
691 }
692 }
693
694 fn fill_to_next(&mut self, hole: Hole) {
695 let next = self.insts.len();
696 self.fill(hole, next);
697 }
698
699 fn fill_split(
700 &mut self,
701 hole: Hole,
702 goto1: Option<InstPtr>,
703 goto2: Option<InstPtr>,
704 ) -> Hole {
705 match hole {
706 Hole::None => Hole::None,
707 Hole::One(pc) => {
708 match (goto1, goto2) {
709 (Some(goto1), Some(goto2)) => {
710 self.insts[pc].fill_split(goto1, goto2);
711 Hole::None
712 }
713 (Some(goto1), None) => {
714 self.insts[pc].half_fill_split_goto1(goto1);
715 Hole::One(pc)
716 }
717 (None, Some(goto2)) => {
718 self.insts[pc].half_fill_split_goto2(goto2);
719 Hole::One(pc)
720 }
721 (None, None) => unreachable!("at least one of the split \
722 holes must be filled"),
723 }
724 }
725 Hole::Many(holes) => {
726 let mut new_holes = vec![];
727 for hole in holes {
728 new_holes.push(self.fill_split(hole, goto1, goto2));
729 }
730 if new_holes.is_empty() {
731 Hole::None
732 } else if new_holes.len() == 1 {
733 new_holes.pop().unwrap()
734 } else {
735 Hole::Many(new_holes)
736 }
737 }
738 }
739 }
740
741 fn push_compiled(&mut self, inst: Inst) {
742 self.insts.push(MaybeInst::Compiled(inst));
743 }
744
745 fn push_hole(&mut self, inst: InstHole) -> Hole {
746 let hole = self.insts.len();
747 self.insts.push(MaybeInst::Uncompiled(inst));
748 Hole::One(hole)
749 }
750
751 fn push_split_hole(&mut self) -> Hole {
752 let hole = self.insts.len();
753 self.insts.push(MaybeInst::Split);
754 Hole::One(hole)
755 }
756
757 fn check_size(&self) -> result::Result<(), Error> {
758 use std::mem::size_of;
759
760 if self.insts.len() * size_of::<Inst>() > self.size_limit {
761 Err(Error::CompiledTooBig(self.size_limit))
762 } else {
763 Ok(())
764 }
765 }
766 }
767
768 #[derive(Debug)]
769 enum Hole {
770 None,
771 One(InstPtr),
772 Many(Vec<Hole>),
773 }
774
775 #[derive(Clone, Debug)]
776 enum MaybeInst {
777 Compiled(Inst),
778 Uncompiled(InstHole),
779 Split,
780 Split1(InstPtr),
781 Split2(InstPtr),
782 }
783
784 impl MaybeInst {
785 fn fill(&mut self, goto: InstPtr) {
786 let filled = match *self {
787 MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
788 MaybeInst::Split1(goto1) => {
789 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
790 }
791 MaybeInst::Split2(goto2) => {
792 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
793 }
794 _ => unreachable!("not all instructions were compiled! \
795 found uncompiled instruction: {:?}", self),
796 };
797 *self = MaybeInst::Compiled(filled);
798 }
799
800 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
801 let filled = match *self {
802 MaybeInst::Split => {
803 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
804 }
805 _ => unreachable!("must be called on Split instruction, \
806 instead it was called on: {:?}", self),
807 };
808 *self = MaybeInst::Compiled(filled);
809 }
810
811 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
812 let half_filled = match *self {
813 MaybeInst::Split => goto1,
814 _ => unreachable!("must be called on Split instruction, \
815 instead it was called on: {:?}", self),
816 };
817 *self = MaybeInst::Split1(half_filled);
818 }
819
820 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
821 let half_filled = match *self {
822 MaybeInst::Split => goto2,
823 _ => unreachable!("must be called on Split instruction, \
824 instead it was called on: {:?}", self),
825 };
826 *self = MaybeInst::Split2(half_filled);
827 }
828
829 fn unwrap(self) -> Inst {
830 match self {
831 MaybeInst::Compiled(inst) => inst,
832 _ => unreachable!("must be called on a compiled instruction, \
833 instead it was called on: {:?}", self),
834 }
835 }
836 }
837
838 #[derive(Clone, Debug)]
839 enum InstHole {
840 Save { slot: usize },
841 EmptyLook { look: EmptyLook },
842 Char { c: char },
843 Ranges { ranges: Vec<(char, char)> },
844 Bytes { start: u8, end: u8 },
845 }
846
847 impl InstHole {
848 fn fill(&self, goto: InstPtr) -> Inst {
849 match *self {
850 InstHole::Save { slot } => Inst::Save(InstSave {
851 goto: goto,
852 slot: slot,
853 }),
854 InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
855 goto: goto,
856 look: look,
857 }),
858 InstHole::Char { c } => Inst::Char(InstChar {
859 goto: goto,
860 c: c,
861 }),
862 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
863 goto: goto,
864 ranges: ranges.clone(),
865 }),
866 InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
867 goto: goto,
868 start: start,
869 end: end,
870 }),
871 }
872 }
873 }
874
875 struct CompileClass<'a, 'b> {
876 c: &'a mut Compiler,
877 ranges: &'b [ClassRange],
878 }
879
880 impl<'a, 'b> CompileClass<'a, 'b> {
881 fn compile(mut self) -> Result {
882 let mut holes = vec![];
883 let mut initial_entry = None;
884 let mut last_split = Hole::None;
885 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
886 self.c.suffix_cache.clear();
887
888 for (i, range) in self.ranges.iter().enumerate() {
889 let is_last_range = i + 1 == self.ranges.len();
890 utf8_seqs.reset(range.start, range.end);
891 let mut it = (&mut utf8_seqs).peekable();
892 loop {
893 let utf8_seq = match it.next() {
894 None => break,
895 Some(utf8_seq) => utf8_seq,
896 };
897 if is_last_range && it.peek().is_none() {
898 let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
899 holes.push(hole);
900 self.c.fill(last_split, entry);
901 last_split = Hole::None;
902 if initial_entry.is_none() {
903 initial_entry = Some(entry);
904 }
905 } else {
906 if initial_entry.is_none() {
907 initial_entry = Some(self.c.insts.len());
908 }
909 self.c.fill_to_next(last_split);
910 last_split = self.c.push_split_hole();
911 let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
912 holes.push(hole);
913 last_split = self.c.fill_split(last_split, Some(entry), None);
914 }
915 }
916 }
917 self.c.utf8_seqs = Some(utf8_seqs);
918 Ok(Patch {
919 hole: Hole::Many(holes),
920 entry: initial_entry.unwrap(),
921 })
922 }
923
924 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
925 if self.c.compiled.is_reverse {
926 self.c_utf8_seq_(seq)
927 } else {
928 self.c_utf8_seq_(seq.into_iter().rev())
929 }
930 }
931
932 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
933 where I: IntoIterator<Item=&'r Utf8Range> {
934 // The initial instruction for each UTF-8 sequence should be the same.
935 let mut from_inst = ::std::usize::MAX;
936 let mut last_hole = Hole::None;
937 for byte_range in seq {
938 let key = SuffixCacheKey {
939 from_inst: from_inst,
940 start: byte_range.start,
941 end: byte_range.end,
942 };
943 {
944 let pc = self.c.insts.len();
945 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
946 from_inst = cached_pc;
947 continue;
948 }
949 }
950 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
951 if from_inst == ::std::usize::MAX {
952 last_hole = self.c.push_hole(InstHole::Bytes {
953 start: byte_range.start,
954 end: byte_range.end,
955 });
956 } else {
957 self.c.push_compiled(Inst::Bytes(InstBytes {
958 goto: from_inst,
959 start: byte_range.start,
960 end: byte_range.end,
961 }));
962 }
963 from_inst = self.c.insts.len().checked_sub(1).unwrap();
964 debug_assert!(from_inst < ::std::usize::MAX);
965 }
966 debug_assert!(from_inst < ::std::usize::MAX);
967 Ok(Patch { hole: last_hole, entry: from_inst })
968 }
969 }
970
971 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
972 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
973 /// The set of byte ranges looks like this:
974 ///
975 /// [0-7F]
976 /// [C2-DF][80-BF]
977 /// [E0][A0-BF][80-BF]
978 /// [E1-EC][80-BF][80-BF]
979 /// [ED][80-9F][80-BF]
980 /// [EE-EF][80-BF][80-BF]
981 ///
982 /// Each line above translates to one alternate in the compiled regex program.
983 /// However, all but one of the alternates end in the same suffix, which is
984 /// a waste of an instruction. The suffix cache facilitates reusing them across
985 /// alternates.
986 ///
987 /// Note that a HashMap could be trivially used for this, but we don't need its
988 /// overhead. Some small bounded space (LRU style) is more than enough.
989 struct SuffixCache {
990 table: Vec<SuffixCacheEntry>,
991 // Every time the cache is cleared, we increment the version number instead
992 // of actually zeroing memory. Since we store a copy of the current version
993 // in every element, all we need to do is make sure to invalidate any stale
994 // entries upon access. This saves quite a bit of time!
995 version: usize,
996 }
997
998 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
999 struct SuffixCacheEntry {
1000 key: SuffixCacheKey,
1001 pc: InstPtr,
1002 version: usize,
1003 }
1004
1005 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1006 struct SuffixCacheKey {
1007 from_inst: InstPtr,
1008 start: u8,
1009 end: u8,
1010 }
1011
1012 impl SuffixCache {
1013 fn new(size: usize) -> Self {
1014 SuffixCache {
1015 table: vec![SuffixCacheEntry::default(); size],
1016 version: 0,
1017 }
1018 }
1019
1020 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1021 let h = self.hash(&key);
1022 let e = self.table[h];
1023 if e.key == key && e.version == self.version {
1024 Some(e.pc)
1025 } else {
1026 self.table[h] = SuffixCacheEntry {
1027 key: key,
1028 pc: pc,
1029 version: self.version,
1030 };
1031 None
1032 }
1033 }
1034
1035 fn clear(&mut self) {
1036 self.version += 1;
1037 }
1038
1039 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1040 // Basic FNV-1a hash as described:
1041 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1042 const FNV_PRIME: u64 = 1099511628211;
1043 let mut h = 14695981039346656037;
1044 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1045 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1046 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1047 (h as usize) % self.table.len()
1048 }
1049 }
1050
1051 struct ByteClassSet([bool; 256]);
1052
1053 impl ByteClassSet {
1054 fn new() -> Self {
1055 ByteClassSet([false; 256])
1056 }
1057
1058 fn set_range(&mut self, start: u8, end: u8) {
1059 debug_assert!(start <= end);
1060 if start > 0 {
1061 self.0[start as usize - 1] = true;
1062 }
1063 self.0[end as usize] = true;
1064 }
1065
1066 fn set_word_boundary(&mut self) {
1067 // We need to mark all ranges of bytes whose pairs result in
1068 // evaluating \b differently.
1069 let iswb = is_word_byte;
1070 let mut b1: u16 = 0;
1071 let mut b2: u16;
1072 while b1 <= 255 {
1073 b2 = b1 + 1;
1074 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1075 b2 += 1;
1076 }
1077 self.set_range(b1 as u8, (b2 - 1) as u8);
1078 b1 = b2;
1079 }
1080 }
1081
1082 fn byte_classes(&self) -> Vec<u8> {
1083 // N.B. If you're debugging the DFA, it's useful to simply return
1084 // `(0..256).collect()`, which effectively removes the byte classes
1085 // and makes the transitions easier to read.
1086 // (0usize..256).map(|x| x as u8).collect()
1087 let mut byte_classes = vec![0; 256];
1088 let mut class = 0u8;
1089 let mut i = 0;
1090 loop {
1091 byte_classes[i] = class as u8;
1092 if i >= 255 {
1093 break;
1094 }
1095 if self.0[i] {
1096 class = class.checked_add(1).unwrap();
1097 }
1098 i += 1;
1099 }
1100 byte_classes
1101 }
1102 }
1103
1104 fn u32_to_usize(n: u32) -> usize {
1105 // In case usize is less than 32 bits, we need to guard against overflow.
1106 // On most platforms this compiles to nothing.
1107 // TODO Use `std::convert::TryFrom` once it's stable.
1108 if (n as u64) > (::std::usize::MAX as u64) {
1109 panic!("BUG: {} is too big to be pointer sized", n)
1110 }
1111 n as usize
1112 }
1113
1114 #[cfg(test)]
1115 mod tests {
1116 use super::ByteClassSet;
1117
1118 #[test]
1119 fn byte_classes() {
1120 let mut set = ByteClassSet::new();
1121 set.set_range(b'a', b'z');
1122 let classes = set.byte_classes();
1123 assert_eq!(classes[0], 0);
1124 assert_eq!(classes[1], 0);
1125 assert_eq!(classes[2], 0);
1126 assert_eq!(classes[b'a' as usize - 1], 0);
1127 assert_eq!(classes[b'a' as usize], 1);
1128 assert_eq!(classes[b'm' as usize], 1);
1129 assert_eq!(classes[b'z' as usize], 1);
1130 assert_eq!(classes[b'z' as usize + 1], 2);
1131 assert_eq!(classes[254], 2);
1132 assert_eq!(classes[255], 2);
1133
1134 let mut set = ByteClassSet::new();
1135 set.set_range(0, 2);
1136 set.set_range(4, 6);
1137 let classes = set.byte_classes();
1138 assert_eq!(classes[0], 0);
1139 assert_eq!(classes[1], 0);
1140 assert_eq!(classes[2], 0);
1141 assert_eq!(classes[3], 1);
1142 assert_eq!(classes[4], 2);
1143 assert_eq!(classes[5], 2);
1144 assert_eq!(classes[6], 2);
1145 assert_eq!(classes[7], 3);
1146 assert_eq!(classes[255], 3);
1147 }
1148
1149 #[test]
1150 fn full_byte_classes() {
1151 let mut set = ByteClassSet::new();
1152 for i in 0..256u16 {
1153 set.set_range(i as u8, i as u8);
1154 }
1155 assert_eq!(set.byte_classes().len(), 256);
1156 }
1157 }