]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex/src/compile.rs
New upstream version 1.17.0+dfsg1
[rustc.git] / src / vendor / regex / src / compile.rs
1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::collections::HashMap;
12 use std::iter;
13 use std::result;
14 use std::sync::Arc;
15
16 use syntax::{
17 Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange,
18 is_word_byte,
19 };
20 use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
21
22 use prog::{
23 Program, Inst, InstPtr, EmptyLook,
24 InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
25 };
26
27 use Error;
28
29 type Result = result::Result<Patch, Error>;
30
31 #[derive(Debug)]
32 struct Patch {
33 hole: Hole,
34 entry: InstPtr,
35 }
36
37 /// A compiler translates a regular expression AST to a sequence of
38 /// instructions. The sequence of instructions represents an NFA.
39 pub struct Compiler {
40 insts: Vec<MaybeInst>,
41 compiled: Program,
42 capture_name_idx: HashMap<String, usize>,
43 num_exprs: usize,
44 size_limit: usize,
45 suffix_cache: SuffixCache,
46 utf8_seqs: Option<Utf8Sequences>,
47 byte_classes: ByteClassSet,
48 }
49
50 impl Compiler {
51 /// Create a new regular expression compiler.
52 ///
53 /// Various options can be set before calling `compile` on an expression.
54 pub fn new() -> Self {
55 Compiler {
56 insts: vec![],
57 compiled: Program::new(),
58 capture_name_idx: HashMap::new(),
59 num_exprs: 0,
60 size_limit: 10 * (1 << 20),
61 suffix_cache: SuffixCache::new(1000),
62 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
63 byte_classes: ByteClassSet::new(),
64 }
65 }
66
67 /// The size of the resulting program is limited by size_limit. If
68 /// the program approximately exceeds the given size (in bytes), then
69 /// compilation will stop and return an error.
70 pub fn size_limit(mut self, size_limit: usize) -> Self {
71 self.size_limit = size_limit;
72 self
73 }
74
75 /// If bytes is true, then the program is compiled as a byte based
76 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
77 /// false, then the automaton is Unicode scalar value based, e.g., an
78 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
79 ///
80 /// The specific invariant is that when returning a byte based machine,
81 /// the neither the `Char` nor `Ranges` instructions are produced.
82 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
83 /// instruction is never produced.
84 ///
85 /// Note that `dfa(true)` implies `bytes(true)`.
86 pub fn bytes(mut self, yes: bool) -> Self {
87 self.compiled.is_bytes = yes;
88 self
89 }
90
91 /// When disabled, the program compiled may match arbitrary bytes.
92 ///
93 /// When enabled (the default), all compiled programs exclusively match
94 /// valid UTF-8 bytes.
95 pub fn only_utf8(mut self, yes: bool) -> Self {
96 self.compiled.only_utf8 = yes;
97 self
98 }
99
100 /// When set, the machine returned is suitable for use in the DFA matching
101 /// engine.
102 ///
103 /// In particular, this ensures that if the regex is not anchored in the
104 /// beginning, then a preceding `.*?` is included in the program. (The NFA
105 /// based engines handle the preceding `.*?` explicitly, which is difficult
106 /// or impossible in the DFA engine.)
107 pub fn dfa(mut self, yes: bool) -> Self {
108 self.compiled.is_dfa = yes;
109 self
110 }
111
112 /// When set, the machine returned is suitable for matching text in
113 /// reverse. In particular, all concatenations are flipped.
114 pub fn reverse(mut self, yes: bool) -> Self {
115 self.compiled.is_reverse = yes;
116 self
117 }
118
119 /// Compile a regular expression given its AST.
120 ///
121 /// The compiler is guaranteed to succeed unless the program exceeds the
122 /// specified size limit. If the size limit is exceeded, then compilation
123 /// stops and returns an error.
124 pub fn compile(
125 mut self,
126 exprs: &[Expr],
127 ) -> result::Result<Program, Error> {
128 debug_assert!(exprs.len() >= 1);
129 self.num_exprs = exprs.len();
130 if exprs.len() == 1 {
131 self.compile_one(&exprs[0])
132 } else {
133 self.compile_many(exprs)
134 }
135 }
136
137 fn compile_one(mut self, expr: &Expr) -> result::Result<Program, Error> {
138 // If we're compiling a forward DFA and we aren't anchored, then
139 // add a `.*?` before the first capture group.
140 // Other matching engines handle this by baking the logic into the
141 // matching engine itself.
142 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
143 self.compiled.is_anchored_start = expr.is_anchored_start();
144 self.compiled.is_anchored_end = expr.is_anchored_end();
145 if self.compiled.needs_dotstar() {
146 dotstar_patch = try!(self.c_dotstar());
147 self.compiled.start = dotstar_patch.entry;
148 }
149 self.compiled.captures = vec![None];
150 let patch = try!(self.c_capture(0, expr));
151 if self.compiled.needs_dotstar() {
152 self.fill(dotstar_patch.hole, patch.entry);
153 } else {
154 self.compiled.start = patch.entry;
155 }
156 self.fill_to_next(patch.hole);
157 self.compiled.matches = vec![self.insts.len()];
158 self.push_compiled(Inst::Match(0));
159 self.compile_finish()
160 }
161
162 fn compile_many(
163 mut self,
164 exprs: &[Expr],
165 ) -> result::Result<Program, Error> {
166 debug_assert!(exprs.len() > 1);
167
168 self.compiled.is_anchored_start =
169 exprs.iter().all(|e| e.is_anchored_start());
170 self.compiled.is_anchored_end =
171 exprs.iter().all(|e| e.is_anchored_end());
172 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
173 if self.compiled.needs_dotstar() {
174 dotstar_patch = try!(self.c_dotstar());
175 self.compiled.start = dotstar_patch.entry;
176 } else {
177 self.compiled.start = 0; // first instruction is always split
178 }
179 self.fill_to_next(dotstar_patch.hole);
180
181 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
182 let split = self.push_split_hole();
183 let Patch { hole, entry } = try!(self.c_capture(0, expr));
184 self.fill_to_next(hole);
185 self.compiled.matches.push(self.insts.len());
186 self.push_compiled(Inst::Match(i));
187
188 let next = self.insts.len();
189 self.fill_split(split, Some(entry), Some(next));
190 }
191 let i = exprs.len() - 1;
192 let Patch { hole, .. } = try!(self.c_capture(0, &exprs[i]));
193 self.fill_to_next(hole);
194 self.compiled.matches.push(self.insts.len());
195 self.push_compiled(Inst::Match(i));
196 self.compile_finish()
197 }
198
199 fn compile_finish(mut self) -> result::Result<Program, Error> {
200 self.compiled.insts =
201 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
202 self.compiled.byte_classes = self.byte_classes.byte_classes();
203 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
204 Ok(self.compiled)
205 }
206
207 fn c(&mut self, expr: &Expr) -> Result {
208 use prog;
209 use syntax::Expr::*;
210
211 try!(self.check_size());
212 match *expr {
213 Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
214 Literal { ref chars, casei } => self.c_literal(chars, casei),
215 LiteralBytes { ref bytes, casei } => self.c_bytes(bytes, casei),
216 AnyChar => self.c_class(&[ClassRange {
217 start: '\x00',
218 end: '\u{10ffff}',
219 }]),
220 AnyCharNoNL => {
221 self.c_class(&[
222 ClassRange { start: '\x00', end: '\x09' },
223 ClassRange { start: '\x0b', end: '\u{10ffff}' },
224 ])
225 }
226 AnyByte => {
227 self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }])
228 }
229 AnyByteNoNL => {
230 self.c_class_bytes(&[
231 ByteRange { start: 0, end: 0x9 },
232 ByteRange { start: 0xB, end: 0xFF },
233 ])
234 }
235 Class(ref cls) => {
236 self.c_class(cls)
237 }
238 ClassBytes(ref cls) => {
239 self.c_class_bytes(cls)
240 }
241 StartLine if self.compiled.is_reverse => {
242 self.byte_classes.set_range(b'\n', b'\n');
243 self.c_empty_look(prog::EmptyLook::EndLine)
244 }
245 StartLine => {
246 self.byte_classes.set_range(b'\n', b'\n');
247 self.c_empty_look(prog::EmptyLook::StartLine)
248 }
249 EndLine if self.compiled.is_reverse => {
250 self.byte_classes.set_range(b'\n', b'\n');
251 self.c_empty_look(prog::EmptyLook::StartLine)
252 }
253 EndLine => {
254 self.byte_classes.set_range(b'\n', b'\n');
255 self.c_empty_look(prog::EmptyLook::EndLine)
256 }
257 StartText if self.compiled.is_reverse => {
258 self.c_empty_look(prog::EmptyLook::EndText)
259 }
260 StartText => {
261 self.c_empty_look(prog::EmptyLook::StartText)
262 }
263 EndText if self.compiled.is_reverse => {
264 self.c_empty_look(prog::EmptyLook::StartText)
265 }
266 EndText => {
267 self.c_empty_look(prog::EmptyLook::EndText)
268 }
269 WordBoundary => {
270 self.compiled.has_unicode_word_boundary = true;
271 self.byte_classes.set_word_boundary();
272 self.c_empty_look(prog::EmptyLook::WordBoundary)
273 }
274 NotWordBoundary => {
275 self.compiled.has_unicode_word_boundary = true;
276 self.byte_classes.set_word_boundary();
277 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
278 }
279 WordBoundaryAscii => {
280 self.byte_classes.set_word_boundary();
281 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
282 }
283 NotWordBoundaryAscii => {
284 self.byte_classes.set_word_boundary();
285 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
286 }
287 Group { ref e, i: None, name: None } => self.c(e),
288 Group { ref e, i, ref name } => {
289 // it's impossible to have a named capture without an index
290 let i = i.expect("capture index");
291 if i >= self.compiled.captures.len() {
292 self.compiled.captures.push(name.clone());
293 if let Some(ref name) = *name {
294 self.capture_name_idx.insert(name.to_owned(), i);
295 }
296 }
297 self.c_capture(2 * i, e)
298 }
299 Concat(ref es) => {
300 if self.compiled.is_reverse {
301 self.c_concat(es.iter().rev())
302 } else {
303 self.c_concat(es)
304 }
305 }
306 Alternate(ref es) => self.c_alternate(&**es),
307 Repeat { ref e, r, greedy } => self.c_repeat(e, r, greedy),
308 }
309 }
310
311 fn c_capture(&mut self, first_slot: usize, expr: &Expr) -> Result {
312 if self.num_exprs > 1 || self.compiled.is_dfa {
313 // Don't ever compile Save instructions for regex sets because
314 // they are never used. They are also never used in DFA programs
315 // because DFAs can't handle captures.
316 self.c(expr)
317 } else {
318 let entry = self.insts.len();
319 let hole = self.push_hole(InstHole::Save { slot: first_slot });
320 let patch = try!(self.c(expr));
321 self.fill(hole, patch.entry);
322 self.fill_to_next(patch.hole);
323 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
324 Ok(Patch { hole: hole, entry: entry })
325 }
326 }
327
328 fn c_dotstar(&mut self) -> Result {
329 Ok(if !self.compiled.only_utf8() {
330 try!(self.c(&Expr::Repeat {
331 e: Box::new(Expr::AnyByte),
332 r: Repeater::ZeroOrMore,
333 greedy: false,
334 }))
335 } else {
336 try!(self.c(&Expr::Repeat {
337 e: Box::new(Expr::AnyChar),
338 r: Repeater::ZeroOrMore,
339 greedy: false,
340 }))
341 })
342 }
343
344 fn c_literal(&mut self, chars: &[char], casei: bool) -> Result {
345 debug_assert!(!chars.is_empty());
346 let mut chars: Box<Iterator<Item=&char>> =
347 if self.compiled.is_reverse {
348 Box::new(chars.iter().rev())
349 } else {
350 Box::new(chars.iter())
351 };
352 let first = *chars.next().expect("non-empty literal");
353 let Patch { mut hole, entry } = try!(self.c_char(first, casei));
354 for &c in chars {
355 let p = try!(self.c_char(c, casei));
356 self.fill(hole, p.entry);
357 hole = p.hole;
358 }
359 Ok(Patch { hole: hole, entry: entry })
360 }
361
362 fn c_char(&mut self, c: char, casei: bool) -> Result {
363 if casei {
364 self.c_class(&CharClass::new(vec![
365 ClassRange { start: c, end: c },
366 ]).case_fold())
367 } else {
368 self.c_class(&[ClassRange { start: c, end: c }])
369 }
370 }
371
372 fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
373 assert!(!ranges.is_empty());
374 if self.compiled.uses_bytes() {
375 CompileClass {
376 c: self,
377 ranges: ranges,
378 }.compile()
379 } else {
380 let ranges: Vec<(char, char)> =
381 ranges.iter().map(|r| (r.start, r.end)).collect();
382 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
383 self.push_hole(InstHole::Char { c: ranges[0].0 })
384 } else {
385 self.push_hole(InstHole::Ranges { ranges: ranges })
386 };
387 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
388 }
389 }
390
391 fn c_bytes(&mut self, bytes: &[u8], casei: bool) -> Result {
392 debug_assert!(!bytes.is_empty());
393 let mut bytes: Box<Iterator<Item=&u8>> =
394 if self.compiled.is_reverse {
395 Box::new(bytes.iter().rev())
396 } else {
397 Box::new(bytes.iter())
398 };
399 let first = *bytes.next().expect("non-empty literal");
400 let Patch { mut hole, entry } = try!(self.c_byte(first, casei));
401 for &b in bytes {
402 let p = try!(self.c_byte(b, casei));
403 self.fill(hole, p.entry);
404 hole = p.hole;
405 }
406 Ok(Patch { hole: hole, entry: entry })
407 }
408
409 fn c_byte(&mut self, b: u8, casei: bool) -> Result {
410 if casei {
411 self.c_class_bytes(&ByteClass::new(vec![
412 ByteRange { start: b, end: b },
413 ]).case_fold())
414 } else {
415 self.c_class_bytes(&[ByteRange { start: b, end: b }])
416 }
417 }
418
419 fn c_class_bytes(&mut self, ranges: &[ByteRange]) -> Result {
420 debug_assert!(!ranges.is_empty());
421
422 let first_split_entry = self.insts.len();
423 let mut holes = vec![];
424 let mut prev_hole = Hole::None;
425 for r in &ranges[0..ranges.len() - 1] {
426 self.fill_to_next(prev_hole);
427 let split = self.push_split_hole();
428 let next = self.insts.len();
429 self.byte_classes.set_range(r.start, r.end);
430 holes.push(self.push_hole(InstHole::Bytes {
431 start: r.start, end: r.end,
432 }));
433 prev_hole = self.fill_split(split, Some(next), None);
434 }
435 let next = self.insts.len();
436 let r = &ranges[ranges.len() - 1];
437 self.byte_classes.set_range(r.start, r.end);
438 holes.push(self.push_hole(InstHole::Bytes {
439 start: r.start, end: r.end,
440 }));
441 self.fill(prev_hole, next);
442 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
443 }
444
445 fn c_empty_look(&mut self, look: EmptyLook) -> Result {
446 let hole = self.push_hole(InstHole::EmptyLook { look: look });
447 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
448 }
449
450 fn c_concat<'a, I>(&mut self, exprs: I) -> Result
451 where I: IntoIterator<Item=&'a Expr> {
452 let mut exprs = exprs.into_iter();
453 let first = match exprs.next() {
454 Some(expr) => expr,
455 None => {
456 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
457 }
458 };
459 let Patch { mut hole, entry } = try!(self.c(first));
460 for e in exprs {
461 let p = try!(self.c(e));
462 self.fill(hole, p.entry);
463 hole = p.hole;
464 }
465 Ok(Patch { hole: hole, entry: entry })
466 }
467
468 fn c_alternate(&mut self, exprs: &[Expr]) -> Result {
469 debug_assert!(
470 exprs.len() >= 2, "alternates must have at least 2 exprs");
471
472 // Initial entry point is always the first split.
473 let first_split_entry = self.insts.len();
474
475 // Save up all of the holes from each alternate. They will all get
476 // patched to point to the same location.
477 let mut holes = vec![];
478
479 let mut prev_hole = Hole::None;
480 for e in &exprs[0..exprs.len() - 1] {
481 self.fill_to_next(prev_hole);
482 let split = self.push_split_hole();
483 let Patch { hole, entry } = try!(self.c(e));
484 holes.push(hole);
485 prev_hole = self.fill_split(split, Some(entry), None);
486 }
487 let Patch { hole, entry } = try!(self.c(&exprs[exprs.len() - 1]));
488 holes.push(hole);
489 self.fill(prev_hole, entry);
490 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
491 }
492
493 fn c_repeat(
494 &mut self,
495 expr: &Expr,
496 kind: Repeater,
497 greedy: bool,
498 ) -> Result {
499 match kind {
500 Repeater::ZeroOrOne => self.c_repeat_zero_or_one(expr, greedy),
501 Repeater::ZeroOrMore => self.c_repeat_zero_or_more(expr, greedy),
502 Repeater::OneOrMore => self.c_repeat_one_or_more(expr, greedy),
503 Repeater::Range { min, max: None } => {
504 self.c_repeat_range_min_or_more(expr, greedy, min)
505 }
506 Repeater::Range { min, max: Some(max) } => {
507 self.c_repeat_range(expr, greedy, min, max)
508 }
509 }
510 }
511
512 fn c_repeat_zero_or_one(
513 &mut self,
514 expr: &Expr,
515 greedy: bool,
516 ) -> Result {
517 let split_entry = self.insts.len();
518 let split = self.push_split_hole();
519 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
520
521 let split_hole = if greedy {
522 self.fill_split(split, Some(entry_rep), None)
523 } else {
524 self.fill_split(split, None, Some(entry_rep))
525 };
526 let holes = vec![hole_rep, split_hole];
527 Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
528 }
529
530 fn c_repeat_zero_or_more(
531 &mut self,
532 expr: &Expr,
533 greedy: bool,
534 ) -> Result {
535 let split_entry = self.insts.len();
536 let split = self.push_split_hole();
537 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
538
539 self.fill(hole_rep, split_entry);
540 let split_hole = if greedy {
541 self.fill_split(split, Some(entry_rep), None)
542 } else {
543 self.fill_split(split, None, Some(entry_rep))
544 };
545 Ok(Patch { hole: split_hole, entry: split_entry })
546 }
547
548 fn c_repeat_one_or_more(
549 &mut self,
550 expr: &Expr,
551 greedy: bool,
552 ) -> Result {
553 let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
554 self.fill_to_next(hole_rep);
555 let split = self.push_split_hole();
556
557 let split_hole = if greedy {
558 self.fill_split(split, Some(entry_rep), None)
559 } else {
560 self.fill_split(split, None, Some(entry_rep))
561 };
562 Ok(Patch { hole: split_hole, entry: entry_rep })
563 }
564
565 fn c_repeat_range_min_or_more(
566 &mut self,
567 expr: &Expr,
568 greedy: bool,
569 min: u32,
570 ) -> Result {
571 let min = u32_to_usize(min);
572 let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
573 let patch_rep = try!(self.c_repeat_zero_or_more(expr, greedy));
574 self.fill(patch_concat.hole, patch_rep.entry);
575 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
576 }
577
578 fn c_repeat_range(
579 &mut self,
580 expr: &Expr,
581 greedy: bool,
582 min: u32,
583 max: u32,
584 ) -> Result {
585 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
586 let patch_concat = try!(self.c_concat(iter::repeat(expr).take(min)));
587 let initial_entry = patch_concat.entry;
588 if min == max {
589 return Ok(patch_concat);
590 }
591 // It is much simpler to compile, e.g., `a{2,5}` as:
592 //
593 // aaa?a?a?
594 //
595 // But you end up with a sequence of instructions like this:
596 //
597 // 0: 'a'
598 // 1: 'a',
599 // 2: split(3, 4)
600 // 3: 'a'
601 // 4: split(5, 6)
602 // 5: 'a'
603 // 6: split(7, 8)
604 // 7: 'a'
605 // 8: MATCH
606 //
607 // This is *incredibly* inefficient because the splits end
608 // up forming a chain, which has to be resolved everything a
609 // transition is followed.
610 let mut holes = vec![];
611 let mut prev_hole = patch_concat.hole;
612 for _ in min..max {
613 self.fill_to_next(prev_hole);
614 let split = self.push_split_hole();
615 let Patch { hole, entry } = try!(self.c(expr));
616 prev_hole = hole;
617 if greedy {
618 holes.push(self.fill_split(split, Some(entry), None));
619 } else {
620 holes.push(self.fill_split(split, None, Some(entry)));
621 }
622 }
623 holes.push(prev_hole);
624 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
625 }
626
627 fn fill(&mut self, hole: Hole, goto: InstPtr) {
628 match hole {
629 Hole::None => {}
630 Hole::One(pc) => {
631 self.insts[pc].fill(goto);
632 }
633 Hole::Many(holes) => {
634 for hole in holes {
635 self.fill(hole, goto);
636 }
637 }
638 }
639 }
640
641 fn fill_to_next(&mut self, hole: Hole) {
642 let next = self.insts.len();
643 self.fill(hole, next);
644 }
645
646 fn fill_split(
647 &mut self,
648 hole: Hole,
649 goto1: Option<InstPtr>,
650 goto2: Option<InstPtr>,
651 ) -> Hole {
652 match hole {
653 Hole::None => Hole::None,
654 Hole::One(pc) => {
655 match (goto1, goto2) {
656 (Some(goto1), Some(goto2)) => {
657 self.insts[pc].fill_split(goto1, goto2);
658 Hole::None
659 }
660 (Some(goto1), None) => {
661 self.insts[pc].half_fill_split_goto1(goto1);
662 Hole::One(pc)
663 }
664 (None, Some(goto2)) => {
665 self.insts[pc].half_fill_split_goto2(goto2);
666 Hole::One(pc)
667 }
668 (None, None) => unreachable!("at least one of the split \
669 holes must be filled"),
670 }
671 }
672 Hole::Many(holes) => {
673 let mut new_holes = vec![];
674 for hole in holes {
675 new_holes.push(self.fill_split(hole, goto1, goto2));
676 }
677 if new_holes.is_empty() {
678 Hole::None
679 } else if new_holes.len() == 1 {
680 new_holes.pop().unwrap()
681 } else {
682 Hole::Many(new_holes)
683 }
684 }
685 }
686 }
687
688 fn push_compiled(&mut self, inst: Inst) {
689 self.insts.push(MaybeInst::Compiled(inst));
690 }
691
692 fn push_hole(&mut self, inst: InstHole) -> Hole {
693 let hole = self.insts.len();
694 self.insts.push(MaybeInst::Uncompiled(inst));
695 Hole::One(hole)
696 }
697
698 fn push_split_hole(&mut self) -> Hole {
699 let hole = self.insts.len();
700 self.insts.push(MaybeInst::Split);
701 Hole::One(hole)
702 }
703
704 fn check_size(&self) -> result::Result<(), Error> {
705 use std::mem::size_of;
706
707 if self.insts.len() * size_of::<Inst>() > self.size_limit {
708 Err(Error::CompiledTooBig(self.size_limit))
709 } else {
710 Ok(())
711 }
712 }
713 }
714
715 #[derive(Debug)]
716 enum Hole {
717 None,
718 One(InstPtr),
719 Many(Vec<Hole>),
720 }
721
722 #[derive(Clone, Debug)]
723 enum MaybeInst {
724 Compiled(Inst),
725 Uncompiled(InstHole),
726 Split,
727 Split1(InstPtr),
728 Split2(InstPtr),
729 }
730
731 impl MaybeInst {
732 fn fill(&mut self, goto: InstPtr) {
733 let filled = match *self {
734 MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
735 MaybeInst::Split1(goto1) => {
736 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
737 }
738 MaybeInst::Split2(goto2) => {
739 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
740 }
741 _ => unreachable!("not all instructions were compiled! \
742 found uncompiled instruction: {:?}", self),
743 };
744 *self = MaybeInst::Compiled(filled);
745 }
746
747 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
748 let filled = match *self {
749 MaybeInst::Split => {
750 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
751 }
752 _ => unreachable!("must be called on Split instruction, \
753 instead it was called on: {:?}", self),
754 };
755 *self = MaybeInst::Compiled(filled);
756 }
757
758 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
759 let half_filled = match *self {
760 MaybeInst::Split => goto1,
761 _ => unreachable!("must be called on Split instruction, \
762 instead it was called on: {:?}", self),
763 };
764 *self = MaybeInst::Split1(half_filled);
765 }
766
767 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
768 let half_filled = match *self {
769 MaybeInst::Split => goto2,
770 _ => unreachable!("must be called on Split instruction, \
771 instead it was called on: {:?}", self),
772 };
773 *self = MaybeInst::Split2(half_filled);
774 }
775
776 fn unwrap(self) -> Inst {
777 match self {
778 MaybeInst::Compiled(inst) => inst,
779 _ => unreachable!("must be called on a compiled instruction, \
780 instead it was called on: {:?}", self),
781 }
782 }
783 }
784
785 #[derive(Clone, Debug)]
786 enum InstHole {
787 Save { slot: usize },
788 EmptyLook { look: EmptyLook },
789 Char { c: char },
790 Ranges { ranges: Vec<(char, char)> },
791 Bytes { start: u8, end: u8 },
792 }
793
794 impl InstHole {
795 fn fill(&self, goto: InstPtr) -> Inst {
796 match *self {
797 InstHole::Save { slot } => Inst::Save(InstSave {
798 goto: goto,
799 slot: slot,
800 }),
801 InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
802 goto: goto,
803 look: look,
804 }),
805 InstHole::Char { c } => Inst::Char(InstChar {
806 goto: goto,
807 c: c,
808 }),
809 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
810 goto: goto,
811 ranges: ranges.clone(),
812 }),
813 InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
814 goto: goto,
815 start: start,
816 end: end,
817 }),
818 }
819 }
820 }
821
822 struct CompileClass<'a, 'b> {
823 c: &'a mut Compiler,
824 ranges: &'b [ClassRange],
825 }
826
827 impl<'a, 'b> CompileClass<'a, 'b> {
828 fn compile(mut self) -> Result {
829 let mut holes = vec![];
830 let mut initial_entry = None;
831 let mut last_split = Hole::None;
832 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
833 self.c.suffix_cache.clear();
834
835 for (i, ref range) in self.ranges.iter().enumerate() {
836 let is_last_range = i + 1 == self.ranges.len();
837 utf8_seqs.reset(range.start, range.end);
838 let mut it = (&mut utf8_seqs).peekable();
839 loop {
840 let utf8_seq = match it.next() {
841 None => break,
842 Some(utf8_seq) => utf8_seq,
843 };
844 if is_last_range && it.peek().is_none() {
845 let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
846 holes.push(hole);
847 self.c.fill(last_split, entry);
848 last_split = Hole::None;
849 if initial_entry.is_none() {
850 initial_entry = Some(entry);
851 }
852 } else {
853 if initial_entry.is_none() {
854 initial_entry = Some(self.c.insts.len());
855 }
856 self.c.fill_to_next(last_split);
857 last_split = self.c.push_split_hole();
858 let Patch { hole, entry } = try!(self.c_utf8_seq(&utf8_seq));
859 holes.push(hole);
860 last_split = self.c.fill_split(last_split, Some(entry), None);
861 }
862 }
863 }
864 self.c.utf8_seqs = Some(utf8_seqs);
865 Ok(Patch {
866 hole: Hole::Many(holes),
867 entry: initial_entry.unwrap(),
868 })
869 }
870
871 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
872 if self.c.compiled.is_reverse {
873 self.c_utf8_seq_(seq)
874 } else {
875 self.c_utf8_seq_(seq.into_iter().rev())
876 }
877 }
878
879 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
880 where I: IntoIterator<Item=&'r Utf8Range> {
881 // The initial instruction for each UTF-8 sequence should be the same.
882 let mut from_inst = ::std::usize::MAX;
883 let mut last_hole = Hole::None;
884 for byte_range in seq {
885 let key = SuffixCacheKey {
886 from_inst: from_inst,
887 start: byte_range.start,
888 end: byte_range.end,
889 };
890 {
891 let pc = self.c.insts.len();
892 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
893 from_inst = cached_pc;
894 continue;
895 }
896 }
897 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
898 if from_inst == ::std::usize::MAX {
899 last_hole = self.c.push_hole(InstHole::Bytes {
900 start: byte_range.start,
901 end: byte_range.end,
902 });
903 } else {
904 self.c.push_compiled(Inst::Bytes(InstBytes {
905 goto: from_inst,
906 start: byte_range.start,
907 end: byte_range.end,
908 }));
909 }
910 from_inst = self.c.insts.len().checked_sub(1).unwrap();
911 debug_assert!(from_inst < ::std::usize::MAX);
912 }
913 debug_assert!(from_inst < ::std::usize::MAX);
914 Ok(Patch { hole: last_hole, entry: from_inst })
915 }
916 }
917
918 /// SuffixCache is a simple bounded hash map for caching suffix entries in
919 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
920 /// The set of byte ranges looks like this:
921 ///
922 /// [0-7F]
923 /// [C2-DF][80-BF]
924 /// [E0][A0-BF][80-BF]
925 /// [E1-EC][80-BF][80-BF]
926 /// [ED][80-9F][80-BF]
927 /// [EE-EF][80-BF][80-BF]
928 ///
929 /// Each line above translates to one alternate in the compiled regex program.
930 /// However, all but one of the alternates end in the same suffix, which is
931 /// a waste of an instruction. The suffix cache facilitates reusing them across
932 /// alternates.
933 ///
934 /// Note that a HashMap could be trivially used for this, but we don't need its
935 /// overhead. Some small bounded space (LRU style) is more than enough.
936 struct SuffixCache {
937 table: Vec<SuffixCacheEntry>,
938 // Every time the cache is cleared, we increment the version number instead
939 // of actually zeroing memory. Since we store a copy of the current version
940 // in every element, all we need to do is make sure to invalidate any stale
941 // entries upon access. This saves quite a bit of time!
942 version: usize,
943 }
944
945 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
946 struct SuffixCacheEntry {
947 key: SuffixCacheKey,
948 pc: InstPtr,
949 version: usize,
950 }
951
952 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
953 struct SuffixCacheKey {
954 from_inst: InstPtr,
955 start: u8,
956 end: u8,
957 }
958
959 impl SuffixCache {
960 fn new(size: usize) -> Self {
961 SuffixCache {
962 table: vec![SuffixCacheEntry::default(); size],
963 version: 0,
964 }
965 }
966
967 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
968 let h = self.hash(&key);
969 let e = self.table[h];
970 if e.key == key && e.version == self.version {
971 Some(e.pc)
972 } else {
973 self.table[h] = SuffixCacheEntry {
974 key: key,
975 pc: pc,
976 version: self.version,
977 };
978 None
979 }
980 }
981
982 fn clear(&mut self) {
983 self.version += 1;
984 }
985
986 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
987 // Basic FNV-1a hash as described:
988 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
989 const FNV_PRIME: u64 = 1099511628211;
990 let mut h = 14695981039346656037;
991 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
992 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
993 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
994 (h as usize) % self.table.len()
995 }
996 }
997
998 struct ByteClassSet([bool; 256]);
999
1000 impl ByteClassSet {
1001 fn new() -> Self {
1002 ByteClassSet([false; 256])
1003 }
1004
1005 fn set_range(&mut self, start: u8, end: u8) {
1006 debug_assert!(start <= end);
1007 if start > 0 {
1008 self.0[start as usize - 1] = true;
1009 }
1010 self.0[end as usize] = true;
1011 }
1012
1013 fn set_word_boundary(&mut self) {
1014 // We need to mark all ranges of bytes whose pairs result in
1015 // evaluating \b differently.
1016 let iswb = is_word_byte;
1017 let mut b1: u16 = 0;
1018 let mut b2: u16;
1019 while b1 <= 255 {
1020 b2 = b1 + 1;
1021 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1022 b2 += 1;
1023 }
1024 self.set_range(b1 as u8, (b2 - 1) as u8);
1025 b1 = b2;
1026 }
1027 }
1028
1029 fn byte_classes(&self) -> Vec<u8> {
1030 // N.B. If you're debugging the DFA, it's useful to simply return
1031 // `(0..256).collect()`, which effectively removes the byte classes
1032 // and makes the transitions easier to read.
1033 // (0usize..256).map(|x| x as u8).collect()
1034 let mut byte_classes = vec![0; 256];
1035 let mut class = 0u8;
1036 for i in 0..256 {
1037 byte_classes[i] = class;
1038 if self.0[i] {
1039 class = class.checked_add(1).unwrap();
1040 }
1041 }
1042 byte_classes
1043 }
1044 }
1045
1046 fn u32_to_usize(n: u32) -> usize {
1047 if (n as u64) > (::std::usize::MAX as u64) {
1048 panic!("BUG: {} is too big to be pointer sized", n)
1049 }
1050 n as usize
1051 }
1052
1053 #[cfg(test)]
1054 mod tests {
1055 use super::ByteClassSet;
1056
1057 #[test]
1058 fn byte_classes() {
1059 let mut set = ByteClassSet::new();
1060 set.set_range(b'a', b'z');
1061 let classes = set.byte_classes();
1062 assert_eq!(classes[0], 0);
1063 assert_eq!(classes[1], 0);
1064 assert_eq!(classes[2], 0);
1065 assert_eq!(classes[b'a' as usize - 1], 0);
1066 assert_eq!(classes[b'a' as usize], 1);
1067 assert_eq!(classes[b'm' as usize], 1);
1068 assert_eq!(classes[b'z' as usize], 1);
1069 assert_eq!(classes[b'z' as usize + 1], 2);
1070 assert_eq!(classes[254], 2);
1071 assert_eq!(classes[255], 2);
1072
1073 let mut set = ByteClassSet::new();
1074 set.set_range(0, 2);
1075 set.set_range(4, 6);
1076 let classes = set.byte_classes();
1077 assert_eq!(classes[0], 0);
1078 assert_eq!(classes[1], 0);
1079 assert_eq!(classes[2], 0);
1080 assert_eq!(classes[3], 1);
1081 assert_eq!(classes[4], 2);
1082 assert_eq!(classes[5], 2);
1083 assert_eq!(classes[6], 2);
1084 assert_eq!(classes[7], 3);
1085 assert_eq!(classes[255], 3);
1086 }
1087 }