]> git.proxmox.com Git - rustc.git/blob - vendor/regex/src/compile.rs
New upstream version 1.61.0+dfsg1
[rustc.git] / vendor / regex / src / compile.rs
1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11 use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14 };
15
16 use crate::Error;
17
/// The result of compiling one sub-expression: the patch describing its
/// entry point and unfilled jump targets, or a compilation error.
type Result = result::Result<Patch, Error>;
/// Like `Result`, but `Ok(None)` means the sub-expression compiled to no
/// instructions at all (e.g., an empty expression), so there is no
/// meaningful entry point.
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
#[derive(Debug)]
struct Patch {
    /// Outstanding jump target(s) in the emitted instructions that must be
    /// backpatched by the caller once the following instruction's address
    /// is known.
    hole: Hole,
    /// The entry point (instruction index) of the compiled sub-expression.
    entry: InstPtr,
}
26
/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    // The instructions emitted so far; some may still be unpatched holes.
    insts: Vec<MaybeInst>,
    // The program being built (flags, captures, byte classes, ...).
    compiled: Program,
    // Maps a capture group's name to its capture index.
    capture_name_idx: HashMap<String, usize>,
    // Number of top-level expressions being compiled; > 1 for regex sets.
    num_exprs: usize,
    // Approximate cap, in bytes, on the size of the compiled program.
    size_limit: usize,
    // Cache used when lowering Unicode classes to byte automata, so that
    // common UTF-8 suffix instructions can be shared.
    suffix_cache: SuffixCache,
    // Reusable scratch space for generating UTF-8 sequences. `None` only
    // while a class compilation has temporarily taken ownership of it.
    utf8_seqs: Option<Utf8Sequences>,
    // Accumulates the byte-class partition consumed by the lazy DFA.
    byte_classes: ByteClassSet,
    // This keeps track of extra bytes allocated while compiling the regex
    // program. Currently, this corresponds to two things. First is the heap
    // memory allocated by Unicode character classes ('InstRanges'). Second is
    // a "fake" amount of memory used by empty sub-expressions, so that enough
    // empty sub-expressions will ultimately trigger the compiler to bail
    // because of a size limit restriction. (That empty sub-expressions don't
    // add to heap memory usage is more-or-less an implementation detail.) In
    // the second case, if we don't bail, then an excessively large repetition
    // on an empty sub-expression can result in the compiler using a very large
    // amount of CPU time.
    extra_inst_bytes: usize,
}
53
impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            // Default size limit: 10 MB.
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }

    /// The size of the resulting program is limited by size_limit. If
    /// the program approximately exceeds the given size (in bytes), then
    /// compilation will stop and return an error.
    pub fn size_limit(mut self, size_limit: usize) -> Self {
        self.size_limit = size_limit;
        self
    }

    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, e.g., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor `Ranges` instructions are produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }

    /// When disabled, the program compiled may match arbitrary bytes.
    ///
    /// When enabled (the default), all compiled programs exclusively match
    /// valid UTF-8 bytes.
    pub fn only_utf8(mut self, yes: bool) -> Self {
        self.compiled.only_utf8 = yes;
        self
    }

    /// When set, the machine returned is suitable for use in the DFA matching
    /// engine.
    ///
    /// In particular, this ensures that if the regex is not anchored in the
    /// beginning, then a preceding `.*?` is included in the program. (The NFA
    /// based engines handle the preceding `.*?` explicitly, which is difficult
    /// or impossible in the DFA engine.)
    pub fn dfa(mut self, yes: bool) -> Self {
        self.compiled.is_dfa = yes;
        self
    }

    /// When set, the machine returned is suitable for matching text in
    /// reverse. In particular, all concatenations are flipped.
    pub fn reverse(mut self, yes: bool) -> Self {
        self.compiled.is_reverse = yes;
        self
    }

    /// Compile a regular expression given its AST.
    ///
    /// The compiler is guaranteed to succeed unless the program exceeds the
    /// specified size limit. If the size limit is exceeded, then compilation
    /// stops and returns an error.
    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
        debug_assert!(!exprs.is_empty());
        self.num_exprs = exprs.len();
        if exprs.len() == 1 {
            self.compile_one(&exprs[0])
        } else {
            // More than one expression: compile as a regex set.
            self.compile_many(exprs)
        }
    }

    /// Compile a single expression into a complete program.
    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
        // If we're compiling a forward DFA and we aren't anchored, then
        // add a `.*?` before the first capture group.
        // Other matching engines handle this by baking the logic into the
        // matching engine itself.
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        self.compiled.is_anchored_start = expr.is_anchored_start();
        self.compiled.is_anchored_end = expr.is_anchored_end();
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        }
        // Slot 0 is the implicit capture group for the whole match.
        self.compiled.captures = vec![None];
        let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
        if self.compiled.needs_dotstar() {
            self.fill(dotstar_patch.hole, patch.entry);
        } else {
            self.compiled.start = patch.entry;
        }
        self.fill_to_next(patch.hole);
        self.compiled.matches = vec![self.insts.len()];
        self.push_compiled(Inst::Match(0));
        self.compile_finish()
    }

    /// Compile a regex set: each expression gets its own `Match(i)`
    /// instruction, with a chain of splits selecting between alternatives.
    fn compile_many(
        mut self,
        exprs: &[Hir],
    ) -> result::Result<Program, Error> {
        debug_assert!(exprs.len() > 1);

        // The set is anchored only if every member is anchored.
        self.compiled.is_anchored_start =
            exprs.iter().all(|e| e.is_anchored_start());
        self.compiled.is_anchored_end =
            exprs.iter().all(|e| e.is_anchored_end());
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        } else {
            self.compiled.start = 0; // first instruction is always split
        }
        self.fill_to_next(dotstar_patch.hole);

        // Compile all but the last expression, each preceded by a split
        // whose second branch leads to the next alternative.
        let mut prev_hole = Hole::None;
        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } =
                self.c_capture(0, expr)?.unwrap_or(self.next_inst());
            self.fill_to_next(hole);
            self.compiled.matches.push(self.insts.len());
            self.push_compiled(Inst::Match(i));
            prev_hole = self.fill_split(split, Some(entry), None);
        }
        // The last expression needs no trailing split.
        let i = exprs.len() - 1;
        let Patch { hole, entry } =
            self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
        self.fill(prev_hole, entry);
        self.fill_to_next(hole);
        self.compiled.matches.push(self.insts.len());
        self.push_compiled(Inst::Match(i));
        self.compile_finish()
    }

    /// Finalize compilation: resolve all `MaybeInst`s into real
    /// instructions and hand over auxiliary tables to the program.
    fn compile_finish(mut self) -> result::Result<Program, Error> {
        // `unwrap` panics only if a hole was left unfilled, which would be
        // a compiler bug.
        self.compiled.insts =
            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
        self.compiled.byte_classes = self.byte_classes.byte_classes();
        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
        Ok(self.compiled)
    }

    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions are going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So compiling some
    /// expression, e, we would end up with a situation that looked like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^             ^
    ///                     |              \             /
    ///                   entry             \           /
    ///                                         hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                entry1        hole1   entry2        hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use crate::prog;
        use regex_syntax::hir::HirKind::*;

        // Bail early if the program has already grown past the size limit.
        self.check_size()?;
        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
            Literal(hir::Literal::Byte(b)) => {
                // Byte literals only appear in byte-based programs.
                assert!(self.compiled.uses_bytes());
                self.c_byte(b)
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    // A Unicode program can only take a byte class if every
                    // byte is ASCII, in which case it is converted to an
                    // equivalent class of chars.
                    assert!(cls.is_all_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            // For reverse programs, start/end assertions are swapped, since
            // the input is scanned backwards.
            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            Anchor(hir::Anchor::StartText) => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            WordBoundary(hir::WordBoundary::Unicode) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // We also make sure that all ASCII bytes are in a different
                // class from non-ASCII bytes. Otherwise, it's possible for
                // ASCII bytes to get lumped into the same class as non-ASCII
                // bytes. This in turn may cause the lazy DFA to falsely start
                // when it sees an ASCII byte that maps to a byte class with
                // non-ASCII bytes. This ensures that never happens.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::WordBoundary)
            }
            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // See comments above for why we set the ASCII range here.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
            }
            WordBoundary(hir::WordBoundary::Ascii) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
            }
            WordBoundary(hir::WordBoundary::AsciiNegate) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
            }
            Group(ref g) => match g.kind {
                hir::GroupKind::NonCapturing => self.c(&g.hir),
                hir::GroupKind::CaptureIndex(index) => {
                    if index as usize >= self.compiled.captures.len() {
                        self.compiled.captures.push(None);
                    }
                    // Each capture group uses two slots (start and end).
                    self.c_capture(2 * index as usize, &g.hir)
                }
                hir::GroupKind::CaptureName { index, ref name } => {
                    if index as usize >= self.compiled.captures.len() {
                        let n = name.to_string();
                        self.compiled.captures.push(Some(n.clone()));
                        self.capture_name_idx.insert(n, index as usize);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
            },
            Concat(ref es) => {
                if self.compiled.is_reverse {
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }

    /// Compile an empty expression: no instructions are emitted, but the
    /// program's accounted size still grows (see below).
    fn c_empty(&mut self) -> ResultOrEmpty {
        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
        // See: CVE-2022-24713
        //
        // Since 'empty' sub-expressions don't increase the size of
        // the actual compiled object, we "fake" an increase in its
        // size so that our 'check_size_limit' routine will eventually
        // stop compilation if there are too many empty sub-expressions
        // (e.g., via a large repetition).
        self.extra_inst_bytes += std::mem::size_of::<Inst>();
        Ok(None)
    }

    /// Compile `expr` wrapped in a pair of `Save` instructions writing to
    /// `first_slot` and `first_slot + 1`, unless captures are unused.
    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
        if self.num_exprs > 1 || self.compiled.is_dfa {
            // Don't ever compile Save instructions for regex sets because
            // they are never used. They are also never used in DFA programs
            // because DFAs can't handle captures.
            self.c(expr)
        } else {
            let entry = self.insts.len();
            let hole = self.push_hole(InstHole::Save { slot: first_slot });
            let patch = self.c(expr)?.unwrap_or(self.next_inst());
            self.fill(hole, patch.entry);
            self.fill_to_next(patch.hole);
            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
            Ok(Some(Patch { hole: hole, entry: entry }))
        }
    }

    /// Compile a non-greedy `.*?` prefix used by unanchored DFA programs.
    /// `Hir::any(true)` permits matching arbitrary bytes; `Hir::any(false)`
    /// is restricted to valid UTF-8.
    fn c_dotstar(&mut self) -> Result {
        Ok(if !self.compiled.only_utf8() {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(true)),
            }))?
            .unwrap()
        } else {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(false)),
            }))?
            .unwrap()
        })
    }

    /// Compile a single Unicode scalar value.
    fn c_char(&mut self, c: char) -> ResultOrEmpty {
        if self.compiled.uses_bytes() {
            if c.is_ascii() {
                // ASCII chars fit in a single Bytes instruction.
                let b = c as u8;
                let hole =
                    self.push_hole(InstHole::Bytes { start: b, end: b });
                self.byte_classes.set_range(b, b);
                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
            } else {
                // Non-ASCII chars go through the UTF-8 class machinery.
                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
            }
        } else {
            let hole = self.push_hole(InstHole::Char { c: c });
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    /// Compile a class of Unicode scalar value ranges.
    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
        use std::mem::size_of;

        assert!(!ranges.is_empty());
        if self.compiled.uses_bytes() {
            // Lower to a UTF-8 byte automaton.
            Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
        } else {
            let ranges: Vec<(char, char)> =
                ranges.iter().map(|r| (r.start(), r.end())).collect();
            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                // Single-char class degenerates to a Char instruction.
                self.push_hole(InstHole::Char { c: ranges[0].0 })
            } else {
                // Account for the heap memory used by the ranges, which is
                // part of the size-limit check.
                self.extra_inst_bytes +=
                    ranges.len() * (size_of::<char>() * 2);
                self.push_hole(InstHole::Ranges { ranges: ranges })
            };
            Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
        }
    }

    /// Compile a single byte literal.
    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
    }

    /// Compile a class of byte ranges as a chain of splits, one branch per
    /// range, all of whose exits become the patch's holes.
    fn c_class_bytes(
        &mut self,
        ranges: &[hir::ClassBytesRange],
    ) -> ResultOrEmpty {
        debug_assert!(!ranges.is_empty());

        let first_split_entry = self.insts.len();
        let mut holes = vec![];
        let mut prev_hole = Hole::None;
        for r in &ranges[0..ranges.len() - 1] {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let next = self.insts.len();
            self.byte_classes.set_range(r.start(), r.end());
            holes.push(self.push_hole(InstHole::Bytes {
                start: r.start(),
                end: r.end(),
            }));
            prev_hole = self.fill_split(split, Some(next), None);
        }
        // The last range needs no trailing split.
        let next = self.insts.len();
        let r = &ranges[ranges.len() - 1];
        self.byte_classes.set_range(r.start(), r.end());
        holes.push(
            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
        );
        self.fill(prev_hole, next);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    /// Compile a zero-width assertion.
    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
        let hole = self.push_hole(InstHole::EmptyLook { look: look });
        Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
    }

    /// Compile a concatenation by chaining each expression's hole to the
    /// next expression's entry point.
    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
    where
        I: IntoIterator<Item = &'a Hir>,
    {
        let mut exprs = exprs.into_iter();
        // Find the first expression that emits instructions; everything
        // before it compiled to nothing.
        let Patch { mut hole, entry } = loop {
            match exprs.next() {
                None => return self.c_empty(),
                Some(e) => {
                    if let Some(p) = self.c(e)? {
                        break p;
                    }
                }
            }
        };
        for e in exprs {
            if let Some(p) = self.c(e)? {
                self.fill(hole, p.entry);
                hole = p.hole;
            }
        }
        Ok(Some(Patch { hole: hole, entry: entry }))
    }

    /// Compile an alternation as a chain of splits, with special care for
    /// branches that compile to no instructions.
    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
        debug_assert!(
            exprs.len() >= 2,
            "alternates must have at least 2 exprs"
        );

        // Initial entry point is always the first split.
        let first_split_entry = self.insts.len();

        // Save up all of the holes from each alternate. They will all get
        // patched to point to the same location.
        let mut holes = vec![];

        // true indicates that the hole is a split where we want to fill
        // the second branch.
        let mut prev_hole = (Hole::None, false);
        for e in &exprs[0..exprs.len() - 1] {
            if prev_hole.1 {
                let next = self.insts.len();
                self.fill_split(prev_hole.0, None, Some(next));
            } else {
                self.fill_to_next(prev_hole.0);
            }
            let split = self.push_split_hole();
            if let Some(Patch { hole, entry }) = self.c(e)? {
                holes.push(hole);
                prev_hole = (self.fill_split(split, Some(entry), None), false);
            } else {
                // An empty branch: the split's first branch becomes one of
                // the exit holes, and its second branch leads onward.
                let (split1, split2) = split.dup_one();
                holes.push(split1);
                prev_hole = (split2, true);
            }
        }
        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
            holes.push(hole);
            if prev_hole.1 {
                self.fill_split(prev_hole.0, None, Some(entry));
            } else {
                self.fill(prev_hole.0, entry);
            }
        } else {
            // We ignore prev_hole.1. When it's true, it means we have two
            // empty branches both pushing prev_hole.0 into holes, so both
            // branches will go to the same place anyway.
            holes.push(prev_hole.0);
        }
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    /// Dispatch a repetition to the appropriate specialized compiler.
    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
        use regex_syntax::hir::RepetitionKind::*;
        match rep.kind {
            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
            Range(hir::RepetitionRange::Exactly(min_max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
            }
            Range(hir::RepetitionRange::AtLeast(min)) => {
                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
            }
            Range(hir::RepetitionRange::Bounded(min, max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
            }
        }
    }

    /// Compile `e?`: a split whose preferred branch depends on greediness.
    fn c_repeat_zero_or_one(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            // The inner expression emitted nothing: drop the useless split.
            None => return self.pop_split_hole(),
        };
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        let holes = vec![hole_rep, split_hole];
        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
    }

    /// Compile `e*`: split around the body, with the body looping back to
    /// the split.
    fn c_repeat_zero_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };

        // Loop: the body's exit jumps back to the split.
        self.fill(hole_rep, split_entry);
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
    }

    /// Compile `e+`: the body followed by a split that loops back to the
    /// body's entry.
    fn c_repeat_one_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return Ok(None),
        };
        self.fill_to_next(hole_rep);
        let split = self.push_split_hole();

        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
    }

    /// Compile `e{min,}` as `min` copies of `e` followed by `e*`.
    fn c_repeat_range_min_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
    ) -> ResultOrEmpty {
        let min = u32_to_usize(min);
        // Using next_inst() is ok, because we can't return it (concat would
        // have to return Some(_) while c_repeat_range_min_or_more returns
        // None).
        let patch_concat = self
            .c_concat(iter::repeat(expr).take(min))?
            .unwrap_or(self.next_inst());
        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
            self.fill(patch_concat.hole, patch_rep.entry);
            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
        } else {
            Ok(None)
        }
    }

    /// Compile `e{min,max}` as `min` mandatory copies followed by
    /// `max - min` optional copies.
    fn c_repeat_range(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> ResultOrEmpty {
        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
        debug_assert!(min <= max);
        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
        if min == max {
            return Ok(patch_concat);
        }
        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
        // max at this point).
        let patch_concat = patch_concat.unwrap_or(self.next_inst());
        let initial_entry = patch_concat.entry;
        // It is much simpler to compile, e.g., `a{2,5}` as:
        //
        //     aaa?a?a?
        //
        // But you end up with a sequence of instructions like this:
        //
        //     0: 'a'
        //     1: 'a',
        //     2: split(3, 4)
        //     3: 'a'
        //     4: split(5, 6)
        //     5: 'a'
        //     6: split(7, 8)
        //     7: 'a'
        //     8: MATCH
        //
        // This is *incredibly* inefficient because the splits end
        // up forming a chain, which has to be resolved every time a
        // transition is followed.
        let mut holes = vec![];
        let mut prev_hole = patch_concat.hole;
        for _ in min..max {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } = match self.c(expr)? {
                Some(p) => p,
                None => return self.pop_split_hole(),
            };
            prev_hole = hole;
            if greedy {
                holes.push(self.fill_split(split, Some(entry), None));
            } else {
                holes.push(self.fill_split(split, None, Some(entry)));
            }
        }
        holes.push(prev_hole);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
    }

    /// Can be used as a default value for the c_* functions when the call to
    /// c_function is followed by inserting at least one instruction that is
    /// always executed after the ones written by the c* function.
    fn next_inst(&self) -> Patch {
        Patch { hole: Hole::None, entry: self.insts.len() }
    }

    /// Backpatch every slot in `hole` to jump to `goto`.
    fn fill(&mut self, hole: Hole, goto: InstPtr) {
        match hole {
            Hole::None => {}
            Hole::One(pc) => {
                self.insts[pc].fill(goto);
            }
            Hole::Many(holes) => {
                for hole in holes {
                    self.fill(hole, goto);
                }
            }
        }
    }

    /// Backpatch `hole` to jump to the next instruction to be emitted.
    fn fill_to_next(&mut self, hole: Hole) {
        let next = self.insts.len();
        self.fill(hole, next);
    }

    /// Fill one or both branches of the split instruction(s) in `hole`,
    /// returning a hole for any branch that remains unfilled.
    fn fill_split(
        &mut self,
        hole: Hole,
        goto1: Option<InstPtr>,
        goto2: Option<InstPtr>,
    ) -> Hole {
        match hole {
            Hole::None => Hole::None,
            Hole::One(pc) => match (goto1, goto2) {
                (Some(goto1), Some(goto2)) => {
                    self.insts[pc].fill_split(goto1, goto2);
                    Hole::None
                }
                (Some(goto1), None) => {
                    self.insts[pc].half_fill_split_goto1(goto1);
                    Hole::One(pc)
                }
                (None, Some(goto2)) => {
                    self.insts[pc].half_fill_split_goto2(goto2);
                    Hole::One(pc)
                }
                (None, None) => unreachable!(
                    "at least one of the split \
                     holes must be filled"
                ),
            },
            Hole::Many(holes) => {
                let mut new_holes = vec![];
                for hole in holes {
                    new_holes.push(self.fill_split(hole, goto1, goto2));
                }
                if new_holes.is_empty() {
                    Hole::None
                } else if new_holes.len() == 1 {
                    new_holes.pop().unwrap()
                } else {
                    Hole::Many(new_holes)
                }
            }
        }
    }

    /// Append a fully compiled instruction.
    fn push_compiled(&mut self, inst: Inst) {
        self.insts.push(MaybeInst::Compiled(inst));
    }

    /// Append an instruction that still needs its `goto` patched, and
    /// return the hole referring to it.
    fn push_hole(&mut self, inst: InstHole) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Uncompiled(inst));
        Hole::One(hole)
    }

    /// Append an empty split instruction and return the hole referring to it.
    fn push_split_hole(&mut self) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Split);
        Hole::One(hole)
    }

    /// Discard the most recently pushed split; used when the expression
    /// under the split turned out to compile to nothing.
    fn pop_split_hole(&mut self) -> ResultOrEmpty {
        self.insts.pop();
        Ok(None)
    }

    /// Error if the program's approximate size exceeds the configured limit.
    /// This includes `extra_inst_bytes`, which is how excessive repetition
    /// of empty sub-expressions is bounded (CVE-2022-24713).
    fn check_size(&self) -> result::Result<(), Error> {
        use std::mem::size_of;

        let size =
            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
        if size > self.size_limit {
            Err(Error::CompiledTooBig(self.size_limit))
        } else {
            Ok(())
        }
    }
}
845
/// A `Hole` identifies the forward-jump slot(s) of partially compiled
/// instructions that must be backpatched once the address of the next
/// instruction is known.
#[derive(Debug)]
enum Hole {
    /// Nothing to patch.
    None,
    /// A single instruction at the given position awaits a jump target.
    One(InstPtr),
    /// Several holes that should all be patched to the same target.
    Many(Vec<Hole>),
}
852
853 impl Hole {
854 fn dup_one(self) -> (Self, Self) {
855 match self {
856 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
857 Hole::None | Hole::Many(_) => {
858 unreachable!("must be called on single hole")
859 }
860 }
861 }
862 }
863
/// An instruction that may still be waiting for one or both of its jump
/// targets before it is fully compiled.
#[derive(Clone, Debug)]
enum MaybeInst {
    /// A finished instruction.
    Compiled(Inst),
    /// An instruction missing only its single `goto` target.
    Uncompiled(InstHole),
    /// A split with neither branch filled in yet.
    Split,
    /// A split whose first branch is known; the second is still pending.
    Split1(InstPtr),
    /// A split whose second branch is known; the first is still pending.
    Split2(InstPtr),
}
872
impl MaybeInst {
    /// Patch this instruction with a single jump target. For `Split`, this
    /// fills the first branch only (producing `Split1`); for a half-filled
    /// split, it completes the remaining branch.
    fn fill(&mut self, goto: InstPtr) {
        let maybeinst = match *self {
            MaybeInst::Split => MaybeInst::Split1(goto),
            MaybeInst::Uncompiled(ref inst) => {
                MaybeInst::Compiled(inst.fill(goto))
            }
            MaybeInst::Split1(goto1) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto1,
                    goto2: goto,
                }))
            }
            MaybeInst::Split2(goto2) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto,
                    goto2: goto2,
                }))
            }
            // Filling an already-compiled instruction is a compiler bug.
            _ => unreachable!(
                "not all instructions were compiled! \
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    /// Complete a fresh `Split` with both branches at once.
    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => {
                Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
            }
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    /// Fill only the first branch of a fresh `Split`.
    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    /// Fill only the second branch of a fresh `Split`.
    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    /// Extract the finished instruction; panics if any hole remains.
    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}
950
/// A partially built instruction: all of an instruction's payload except
/// the `goto` pointer, which is supplied later via `fill`.
#[derive(Clone, Debug)]
enum InstHole {
    Save { slot: usize },
    EmptyLook { look: EmptyLook },
    Char { c: char },
    Ranges { ranges: Vec<(char, char)> },
    Bytes { start: u8, end: u8 },
}
959
960 impl InstHole {
961 fn fill(&self, goto: InstPtr) -> Inst {
962 match *self {
963 InstHole::Save { slot } => {
964 Inst::Save(InstSave { goto: goto, slot: slot })
965 }
966 InstHole::EmptyLook { look } => {
967 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
968 }
969 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
970 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
971 goto: goto,
972 ranges: ranges.clone().into_boxed_slice(),
973 }),
974 InstHole::Bytes { start, end } => {
975 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
976 }
977 }
978 }
979 }
980
/// Helper that lowers a class of Unicode scalar-value ranges into a
/// byte-based automaton by walking the UTF-8 encodings of the ranges.
struct CompileClass<'a, 'b> {
    // The compiler whose instruction list is appended to.
    c: &'a mut Compiler,
    // The (non-empty) scalar value ranges to compile.
    ranges: &'b [hir::ClassUnicodeRange],
}
985
986 impl<'a, 'b> CompileClass<'a, 'b> {
    /// Compile all of the ranges into a byte automaton: each UTF-8 sequence
    /// becomes one alternative, chained together with split instructions.
    /// The very last sequence is treated specially since it needs no split.
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        // Temporarily take ownership of the compiler's scratch sequence
        // generator; it is restored before returning.
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    // Final sequence of the final range: no split needed;
                    // route the previous split (if any) straight here.
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        // Return the scratch generator for reuse by later classes.
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }
1027
1028 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1029 if self.c.compiled.is_reverse {
1030 self.c_utf8_seq_(seq)
1031 } else {
1032 self.c_utf8_seq_(seq.into_iter().rev())
1033 }
1034 }
1035
1036 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1037 where
1038 I: IntoIterator<Item = &'r Utf8Range>,
1039 {
1040 // The initial instruction for each UTF-8 sequence should be the same.
1041 let mut from_inst = ::std::usize::MAX;
1042 let mut last_hole = Hole::None;
1043 for byte_range in seq {
1044 let key = SuffixCacheKey {
1045 from_inst: from_inst,
1046 start: byte_range.start,
1047 end: byte_range.end,
1048 };
1049 {
1050 let pc = self.c.insts.len();
1051 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1052 from_inst = cached_pc;
1053 continue;
1054 }
1055 }
1056 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1057 if from_inst == ::std::usize::MAX {
1058 last_hole = self.c.push_hole(InstHole::Bytes {
1059 start: byte_range.start,
1060 end: byte_range.end,
1061 });
1062 } else {
1063 self.c.push_compiled(Inst::Bytes(InstBytes {
1064 goto: from_inst,
1065 start: byte_range.start,
1066 end: byte_range.end,
1067 }));
1068 }
1069 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1070 debug_assert!(from_inst < ::std::usize::MAX);
1071 }
1072 debug_assert!(from_inst < ::std::usize::MAX);
1073 Ok(Patch { hole: last_hole, entry: from_inst })
1074 }
1075 }
1076
1077 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1078 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1079 /// The set of byte ranges looks like this:
1080 ///
1081 /// [0-7F]
1082 /// [C2-DF][80-BF]
1083 /// [E0][A0-BF][80-BF]
1084 /// [E1-EC][80-BF][80-BF]
1085 /// [ED][80-9F][80-BF]
1086 /// [EE-EF][80-BF][80-BF]
1087 ///
1088 /// Each line above translates to one alternate in the compiled regex program.
1089 /// However, all but one of the alternates end in the same suffix, which is
1090 /// a waste of an instruction. The suffix cache facilitates reusing them across
1091 /// alternates.
1092 ///
1093 /// Note that a HashMap could be trivially used for this, but we don't need its
1094 /// overhead. Some small bounded space (LRU style) is more than enough.
1095 ///
1096 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1097 /// except it uses hashes as original indices and then compares full keys for
1098 /// validation against `dense` array.
#[derive(Debug)]
struct SuffixCache {
    // Maps a key's hash bucket to an index into `dense`. Slots may be
    // stale; they are validated against the full key stored in `dense`.
    sparse: Box<[usize]>,
    // The authoritative list of cached entries, cleared before each class
    // compilation (see `clear`).
    dense: Vec<SuffixCacheEntry>,
}
1104
/// A cached suffix: the full key plus the program counter of the
/// instruction that was compiled for it.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}
1110
/// Identifies one byte-range instruction together with its jump target.
/// Two byte-range instructions with equal keys are interchangeable, so
/// they can be shared between alternates.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    // The instruction this byte range jumps to after matching.
    from_inst: InstPtr,
    // Inclusive start of the byte range.
    start: u8,
    // Inclusive end of the byte range.
    end: u8,
}
1117
1118 impl SuffixCache {
1119 fn new(size: usize) -> Self {
1120 SuffixCache {
1121 sparse: vec![0usize; size].into(),
1122 dense: Vec::with_capacity(size),
1123 }
1124 }
1125
1126 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1127 let hash = self.hash(&key);
1128 let pos = &mut self.sparse[hash];
1129 if let Some(entry) = self.dense.get(*pos) {
1130 if entry.key == key {
1131 return Some(entry.pc);
1132 }
1133 }
1134 *pos = self.dense.len();
1135 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1136 None
1137 }
1138
1139 fn clear(&mut self) {
1140 self.dense.clear();
1141 }
1142
1143 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1144 // Basic FNV-1a hash as described:
1145 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1146 const FNV_PRIME: u64 = 1099511628211;
1147 let mut h = 14695981039346656037;
1148 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1149 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1150 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1151 (h as usize) % self.sparse.len()
1152 }
1153 }
1154
/// Records equivalence-class boundaries between byte values: index `b` is
/// `true` when byte `b` ends a class, i.e. `b` and `b + 1` may be treated
/// differently by the compiled program.
struct ByteClassSet([bool; 256]);
1156
1157 impl ByteClassSet {
1158 fn new() -> Self {
1159 ByteClassSet([false; 256])
1160 }
1161
1162 fn set_range(&mut self, start: u8, end: u8) {
1163 debug_assert!(start <= end);
1164 if start > 0 {
1165 self.0[start as usize - 1] = true;
1166 }
1167 self.0[end as usize] = true;
1168 }
1169
1170 fn set_word_boundary(&mut self) {
1171 // We need to mark all ranges of bytes whose pairs result in
1172 // evaluating \b differently.
1173 let iswb = is_word_byte;
1174 let mut b1: u16 = 0;
1175 let mut b2: u16;
1176 while b1 <= 255 {
1177 b2 = b1 + 1;
1178 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1179 b2 += 1;
1180 }
1181 self.set_range(b1 as u8, (b2 - 1) as u8);
1182 b1 = b2;
1183 }
1184 }
1185
1186 fn byte_classes(&self) -> Vec<u8> {
1187 // N.B. If you're debugging the DFA, it's useful to simply return
1188 // `(0..256).collect()`, which effectively removes the byte classes
1189 // and makes the transitions easier to read.
1190 // (0usize..256).map(|x| x as u8).collect()
1191 let mut byte_classes = vec![0; 256];
1192 let mut class = 0u8;
1193 let mut i = 0;
1194 loop {
1195 byte_classes[i] = class as u8;
1196 if i >= 255 {
1197 break;
1198 }
1199 if self.0[i] {
1200 class = class.checked_add(1).unwrap();
1201 }
1202 i += 1;
1203 }
1204 byte_classes
1205 }
1206 }
1207
impl fmt::Debug for ByteClassSet {
    // NOTE(review): presumably hand-written because `[bool; 256]` did not
    // implement `Debug` on the crate's minimum supported Rust version; the
    // `&&self.0[..]` slice coercion sidesteps that — confirm against MSRV.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
    }
}
1213
/// Convert a `u32` to `usize`, panicking if the value does not fit.
///
/// On platforms where `usize` is at least 32 bits the guard is trivially
/// true and compiles to nothing; it only matters for 16-bit targets.
// TODO Use `std::convert::TryFrom` once it's stable.
fn u32_to_usize(n: u32) -> usize {
    assert!(
        (n as u64) <= (::std::usize::MAX as u64),
        "BUG: {} is too big to be pointer sized",
        n
    );
    n as usize
}
1223
#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    // A single range splits the byte space into three classes: bytes
    // below the range, the range itself, and bytes above it. Two disjoint
    // ranges (with a gap) yield four classes.
    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[b'a' as usize - 1], 0);
        assert_eq!(classes[b'a' as usize], 1);
        assert_eq!(classes[b'm' as usize], 1);
        assert_eq!(classes[b'z' as usize], 1);
        assert_eq!(classes[b'z' as usize + 1], 2);
        assert_eq!(classes[254], 2);
        assert_eq!(classes[255], 2);

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[3], 1);
        assert_eq!(classes[4], 2);
        assert_eq!(classes[5], 2);
        assert_eq!(classes[6], 2);
        assert_eq!(classes[7], 3);
        assert_eq!(classes[255], 3);
    }

    // Marking every byte as its own range must not overflow the class
    // counter and still produces a map covering all 256 byte values.
    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::new();
        for i in 0..256u16 {
            set.set_range(i as u8, i as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}