]> git.proxmox.com Git - rustc.git/blob - vendor/regex/src/compile.rs
New upstream version 1.61.0+dfsg1
[rustc.git] / vendor / regex / src / compile.rs
1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11 use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14 };
15
16 use crate::Error;
17
/// The result of compiling one sub-expression: the patch describing its
/// entry point and unfilled jump targets, or a compilation error.
type Result = result::Result<Patch, Error>;
/// Like `Result`, but `Ok(None)` means the sub-expression compiled to no
/// instructions at all (e.g., an empty expression), so there is no
/// meaningful entry point.
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
#[derive(Debug)]
struct Patch {
    /// Outstanding jump target(s) in the emitted instructions that must be
    /// backpatched by the caller once the following instruction's address
    /// is known.
    hole: Hole,
    /// The entry point (instruction index) of the compiled sub-expression.
    entry: InstPtr,
}
26
/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    // The instructions emitted so far; some may still be unpatched holes.
    insts: Vec<MaybeInst>,
    // The program being built (flags, captures, byte classes, ...).
    compiled: Program,
    // Maps a capture group's name to its capture index.
    capture_name_idx: HashMap<String, usize>,
    // Number of top-level expressions being compiled; > 1 for regex sets.
    num_exprs: usize,
    // Approximate cap, in bytes, on the size of the compiled program.
    size_limit: usize,
    // Cache used when lowering Unicode classes to byte automata, so that
    // common UTF-8 suffix instructions can be shared.
    suffix_cache: SuffixCache,
    // Reusable scratch space for generating UTF-8 sequences. `None` only
    // while a class compilation has temporarily taken ownership of it.
    utf8_seqs: Option<Utf8Sequences>,
    // Accumulates the byte-class partition consumed by the lazy DFA.
    byte_classes: ByteClassSet,
    // This keeps track of extra bytes allocated while compiling the regex
    // program. Currently, this corresponds to two things. First is the heap
    // memory allocated by Unicode character classes ('InstRanges'). Second is
    // a "fake" amount of memory used by empty sub-expressions, so that enough
    // empty sub-expressions will ultimately trigger the compiler to bail
    // because of a size limit restriction. (That empty sub-expressions don't
    // add to heap memory usage is more-or-less an implementation detail.) In
    // the second case, if we don't bail, then an excessively large repetition
    // on an empty sub-expression can result in the compiler using a very large
    // amount of CPU time.
    extra_inst_bytes: usize,
}
53
impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            // Default size limit: 10 MB.
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }

    /// The size of the resulting program is limited by size_limit. If
    /// the program approximately exceeds the given size (in bytes), then
    /// compilation will stop and return an error.
    pub fn size_limit(mut self, size_limit: usize) -> Self {
        self.size_limit = size_limit;
        self
    }

    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, e.g., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor `Ranges` instructions are produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }

    /// When disabled, the program compiled may match arbitrary bytes.
    ///
    /// When enabled (the default), all compiled programs exclusively match
    /// valid UTF-8 bytes.
    pub fn only_utf8(mut self, yes: bool) -> Self {
        self.compiled.only_utf8 = yes;
        self
    }

    /// When set, the machine returned is suitable for use in the DFA matching
    /// engine.
    ///
    /// In particular, this ensures that if the regex is not anchored in the
    /// beginning, then a preceding `.*?` is included in the program. (The NFA
    /// based engines handle the preceding `.*?` explicitly, which is difficult
    /// or impossible in the DFA engine.)
    pub fn dfa(mut self, yes: bool) -> Self {
        self.compiled.is_dfa = yes;
        self
    }

    /// When set, the machine returned is suitable for matching text in
    /// reverse. In particular, all concatenations are flipped.
    pub fn reverse(mut self, yes: bool) -> Self {
        self.compiled.is_reverse = yes;
        self
    }

    /// Compile a regular expression given its AST.
    ///
    /// The compiler is guaranteed to succeed unless the program exceeds the
    /// specified size limit. If the size limit is exceeded, then compilation
    /// stops and returns an error.
    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
        debug_assert!(!exprs.is_empty());
        self.num_exprs = exprs.len();
        if exprs.len() == 1 {
            self.compile_one(&exprs[0])
        } else {
            // More than one expression: compile as a regex set.
            self.compile_many(exprs)
        }
    }

    /// Compile a single expression into a complete program.
    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
        // If we're compiling a forward DFA and we aren't anchored, then
        // add a `.*?` before the first capture group.
        // Other matching engines handle this by baking the logic into the
        // matching engine itself.
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        self.compiled.is_anchored_start = expr.is_anchored_start();
        self.compiled.is_anchored_end = expr.is_anchored_end();
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        }
        // Slot 0 is the implicit capture group for the whole match.
        self.compiled.captures = vec![None];
        let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
        if self.compiled.needs_dotstar() {
            self.fill(dotstar_patch.hole, patch.entry);
        } else {
            self.compiled.start = patch.entry;
        }
        self.fill_to_next(patch.hole);
        self.compiled.matches = vec![self.insts.len()];
        self.push_compiled(Inst::Match(0));
        self.compile_finish()
    }

    /// Compile a regex set: each expression gets its own `Match(i)`
    /// instruction, with a chain of splits selecting between alternatives.
    fn compile_many(
        mut self,
        exprs: &[Hir],
    ) -> result::Result<Program, Error> {
        debug_assert!(exprs.len() > 1);

        // The set is anchored only if every member is anchored.
        self.compiled.is_anchored_start =
            exprs.iter().all(|e| e.is_anchored_start());
        self.compiled.is_anchored_end =
            exprs.iter().all(|e| e.is_anchored_end());
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        } else {
            self.compiled.start = 0; // first instruction is always split
        }
        self.fill_to_next(dotstar_patch.hole);

        // Compile all but the last expression, each preceded by a split
        // whose second branch leads to the next alternative.
        let mut prev_hole = Hole::None;
        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } =
                self.c_capture(0, expr)?.unwrap_or(self.next_inst());
            self.fill_to_next(hole);
            self.compiled.matches.push(self.insts.len());
            self.push_compiled(Inst::Match(i));
            prev_hole = self.fill_split(split, Some(entry), None);
        }
        // The last expression needs no trailing split.
        let i = exprs.len() - 1;
        let Patch { hole, entry } =
            self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
        self.fill(prev_hole, entry);
        self.fill_to_next(hole);
        self.compiled.matches.push(self.insts.len());
        self.push_compiled(Inst::Match(i));
        self.compile_finish()
    }

    /// Finalize compilation: resolve all `MaybeInst`s into real
    /// instructions and hand over auxiliary tables to the program.
    fn compile_finish(mut self) -> result::Result<Program, Error> {
        // `unwrap` panics only if a hole was left unfilled, which would be
        // a compiler bug.
        self.compiled.insts =
            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
        self.compiled.byte_classes = self.byte_classes.byte_classes();
        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
        Ok(self.compiled)
    }

    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions are going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So compiling some
    /// expression, e, we would end up with a situation that looked like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^             ^
    ///                     |              \             /
    ///                   entry             \           /
    ///                                         hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                entry1        hole1   entry2        hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use crate::prog;
        use regex_syntax::hir::HirKind::*;

        // Bail early if the program has already grown past the size limit.
        self.check_size()?;
        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
            Literal(hir::Literal::Byte(b)) => {
                // Byte literals only appear in byte-based programs.
                assert!(self.compiled.uses_bytes());
                self.c_byte(b)
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    // A Unicode program can only take a byte class if every
                    // byte is ASCII, in which case it is converted to an
                    // equivalent class of chars.
                    assert!(cls.is_all_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            // For reverse programs, start/end assertions are swapped, since
            // the input is scanned backwards.
            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            Anchor(hir::Anchor::StartText) => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            WordBoundary(hir::WordBoundary::Unicode) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // We also make sure that all ASCII bytes are in a different
                // class from non-ASCII bytes. Otherwise, it's possible for
                // ASCII bytes to get lumped into the same class as non-ASCII
                // bytes. This in turn may cause the lazy DFA to falsely start
                // when it sees an ASCII byte that maps to a byte class with
                // non-ASCII bytes. This ensures that never happens.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::WordBoundary)
            }
            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // See comments above for why we set the ASCII range here.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
            }
            WordBoundary(hir::WordBoundary::Ascii) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
            }
            WordBoundary(hir::WordBoundary::AsciiNegate) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
            }
            Group(ref g) => match g.kind {
                hir::GroupKind::NonCapturing => self.c(&g.hir),
                hir::GroupKind::CaptureIndex(index) => {
                    if index as usize >= self.compiled.captures.len() {
                        self.compiled.captures.push(None);
                    }
                    // Each capture group uses two slots (start and end).
                    self.c_capture(2 * index as usize, &g.hir)
                }
                hir::GroupKind::CaptureName { index, ref name } => {
                    if index as usize >= self.compiled.captures.len() {
                        let n = name.to_string();
                        self.compiled.captures.push(Some(n.clone()));
                        self.capture_name_idx.insert(n, index as usize);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
            },
            Concat(ref es) => {
                if self.compiled.is_reverse {
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }

    /// Compile an empty expression: no instructions are emitted, but the
    /// program's accounted size still grows (see below).
    fn c_empty(&mut self) -> ResultOrEmpty {
        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
        // See: CVE-2022-24713
        //
        // Since 'empty' sub-expressions don't increase the size of
        // the actual compiled object, we "fake" an increase in its
        // size so that our 'check_size_limit' routine will eventually
        // stop compilation if there are too many empty sub-expressions
        // (e.g., via a large repetition).
        self.extra_inst_bytes += std::mem::size_of::<Inst>();
        Ok(None)
    }

    /// Compile `expr` wrapped in a pair of `Save` instructions writing to
    /// `first_slot` and `first_slot + 1`, unless captures are unused.
    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
        if self.num_exprs > 1 || self.compiled.is_dfa {
            // Don't ever compile Save instructions for regex sets because
            // they are never used. They are also never used in DFA programs
            // because DFAs can't handle captures.
            self.c(expr)
        } else {
            let entry = self.insts.len();
            let hole = self.push_hole(InstHole::Save { slot: first_slot });
            let patch = self.c(expr)?.unwrap_or(self.next_inst());
            self.fill(hole, patch.entry);
            self.fill_to_next(patch.hole);
            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
            Ok(Some(Patch { hole: hole, entry: entry }))
        }
    }

    /// Compile a non-greedy `.*?` prefix used by unanchored DFA programs.
    /// `Hir::any(true)` permits matching arbitrary bytes; `Hir::any(false)`
    /// is restricted to valid UTF-8.
    fn c_dotstar(&mut self) -> Result {
        Ok(if !self.compiled.only_utf8() {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(true)),
            }))?
            .unwrap()
        } else {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(false)),
            }))?
            .unwrap()
        })
    }

    /// Compile a single Unicode scalar value.
    fn c_char(&mut self, c: char) -> ResultOrEmpty {
        if self.compiled.uses_bytes() {
            if c.is_ascii() {
                // ASCII chars fit in a single Bytes instruction.
                let b = c as u8;
                let hole =
                    self.push_hole(InstHole::Bytes { start: b, end: b });
                self.byte_classes.set_range(b, b);
                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
            } else {
                // Non-ASCII chars go through the UTF-8 class machinery.
                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
            }
        } else {
            let hole = self.push_hole(InstHole::Char { c: c });
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    /// Compile a class of Unicode scalar value ranges.
    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
        use std::mem::size_of;

        assert!(!ranges.is_empty());
        if self.compiled.uses_bytes() {
            // Lower to a UTF-8 byte automaton.
            Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
        } else {
            let ranges: Vec<(char, char)> =
                ranges.iter().map(|r| (r.start(), r.end())).collect();
            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                // Single-char class degenerates to a Char instruction.
                self.push_hole(InstHole::Char { c: ranges[0].0 })
            } else {
                // Account for the heap memory used by the ranges, which is
                // part of the size-limit check.
                self.extra_inst_bytes +=
                    ranges.len() * (size_of::<char>() * 2);
                self.push_hole(InstHole::Ranges { ranges: ranges })
            };
            Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
        }
    }

    /// Compile a single byte literal.
    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
    }

    /// Compile a class of byte ranges as a chain of splits, one branch per
    /// range, all of whose exits become the patch's holes.
    fn c_class_bytes(
        &mut self,
        ranges: &[hir::ClassBytesRange],
    ) -> ResultOrEmpty {
        debug_assert!(!ranges.is_empty());

        let first_split_entry = self.insts.len();
        let mut holes = vec![];
        let mut prev_hole = Hole::None;
        for r in &ranges[0..ranges.len() - 1] {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let next = self.insts.len();
            self.byte_classes.set_range(r.start(), r.end());
            holes.push(self.push_hole(InstHole::Bytes {
                start: r.start(),
                end: r.end(),
            }));
            prev_hole = self.fill_split(split, Some(next), None);
        }
        // The last range needs no trailing split.
        let next = self.insts.len();
        let r = &ranges[ranges.len() - 1];
        self.byte_classes.set_range(r.start(), r.end());
        holes.push(
            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
        );
        self.fill(prev_hole, next);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    /// Compile a zero-width assertion.
    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
        let hole = self.push_hole(InstHole::EmptyLook { look: look });
        Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
    }

    /// Compile a concatenation by chaining each expression's hole to the
    /// next expression's entry point.
    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
    where
        I: IntoIterator<Item = &'a Hir>,
    {
        let mut exprs = exprs.into_iter();
        // Find the first expression that emits instructions; everything
        // before it compiled to nothing.
        let Patch { mut hole, entry } = loop {
            match exprs.next() {
                None => return self.c_empty(),
                Some(e) => {
                    if let Some(p) = self.c(e)? {
                        break p;
                    }
                }
            }
        };
        for e in exprs {
            if let Some(p) = self.c(e)? {
                self.fill(hole, p.entry);
                hole = p.hole;
            }
        }
        Ok(Some(Patch { hole: hole, entry: entry }))
    }

    /// Compile an alternation as a chain of splits, with special care for
    /// branches that compile to no instructions.
    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
        debug_assert!(
            exprs.len() >= 2,
            "alternates must have at least 2 exprs"
        );

        // Initial entry point is always the first split.
        let first_split_entry = self.insts.len();

        // Save up all of the holes from each alternate. They will all get
        // patched to point to the same location.
        let mut holes = vec![];

        // true indicates that the hole is a split where we want to fill
        // the second branch.
        let mut prev_hole = (Hole::None, false);
        for e in &exprs[0..exprs.len() - 1] {
            if prev_hole.1 {
                let next = self.insts.len();
                self.fill_split(prev_hole.0, None, Some(next));
            } else {
                self.fill_to_next(prev_hole.0);
            }
            let split = self.push_split_hole();
            if let Some(Patch { hole, entry }) = self.c(e)? {
                holes.push(hole);
                prev_hole = (self.fill_split(split, Some(entry), None), false);
            } else {
                // An empty branch: the split's first branch becomes one of
                // the exit holes, and its second branch leads onward.
                let (split1, split2) = split.dup_one();
                holes.push(split1);
                prev_hole = (split2, true);
            }
        }
        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
            holes.push(hole);
            if prev_hole.1 {
                self.fill_split(prev_hole.0, None, Some(entry));
            } else {
                self.fill(prev_hole.0, entry);
            }
        } else {
            // We ignore prev_hole.1. When it's true, it means we have two
            // empty branches both pushing prev_hole.0 into holes, so both
            // branches will go to the same place anyway.
            holes.push(prev_hole.0);
        }
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    /// Dispatch a repetition to the appropriate specialized compiler.
    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
        use regex_syntax::hir::RepetitionKind::*;
        match rep.kind {
            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
            Range(hir::RepetitionRange::Exactly(min_max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
            }
            Range(hir::RepetitionRange::AtLeast(min)) => {
                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
            }
            Range(hir::RepetitionRange::Bounded(min, max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
            }
        }
    }

    /// Compile `e?`: a split whose preferred branch depends on greediness.
    fn c_repeat_zero_or_one(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            // The inner expression emitted nothing: drop the useless split.
            None => return self.pop_split_hole(),
        };
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        let holes = vec![hole_rep, split_hole];
        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
    }

    /// Compile `e*`: split around the body, with the body looping back to
    /// the split.
    fn c_repeat_zero_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };

        // Loop: the body's exit jumps back to the split.
        self.fill(hole_rep, split_entry);
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
    }

    /// Compile `e+`: the body followed by a split that loops back to the
    /// body's entry.
    fn c_repeat_one_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return Ok(None),
        };
        self.fill_to_next(hole_rep);
        let split = self.push_split_hole();

        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
    }

    /// Compile `e{min,}` as `min` copies of `e` followed by `e*`.
    fn c_repeat_range_min_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
    ) -> ResultOrEmpty {
        let min = u32_to_usize(min);
        // Using next_inst() is ok, because we can't return it (concat would
        // have to return Some(_) while c_repeat_range_min_or_more returns
        // None).
        let patch_concat = self
            .c_concat(iter::repeat(expr).take(min))?
            .unwrap_or(self.next_inst());
        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
            self.fill(patch_concat.hole, patch_rep.entry);
            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
        } else {
            Ok(None)
        }
    }

    /// Compile `e{min,max}` as `min` mandatory copies followed by
    /// `max - min` optional copies.
    fn c_repeat_range(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> ResultOrEmpty {
        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
        debug_assert!(min <= max);
        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
        if min == max {
            return Ok(patch_concat);
        }
        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
        // max at this point).
        let patch_concat = patch_concat.unwrap_or(self.next_inst());
        let initial_entry = patch_concat.entry;
        // It is much simpler to compile, e.g., `a{2,5}` as:
        //
        //     aaa?a?a?
        //
        // But you end up with a sequence of instructions like this:
        //
        //     0: 'a'
        //     1: 'a',
        //     2: split(3, 4)
        //     3: 'a'
        //     4: split(5, 6)
        //     5: 'a'
        //     6: split(7, 8)
        //     7: 'a'
        //     8: MATCH
        //
        // This is *incredibly* inefficient because the splits end
        // up forming a chain, which has to be resolved every time a
        // transition is followed.
        let mut holes = vec![];
        let mut prev_hole = patch_concat.hole;
        for _ in min..max {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } = match self.c(expr)? {
                Some(p) => p,
                None => return self.pop_split_hole(),
            };
            prev_hole = hole;
            if greedy {
                holes.push(self.fill_split(split, Some(entry), None));
            } else {
                holes.push(self.fill_split(split, None, Some(entry)));
            }
        }
        holes.push(prev_hole);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
    }

    /// Can be used as a default value for the c_* functions when the call to
    /// c_function is followed by inserting at least one instruction that is
    /// always executed after the ones written by the c* function.
    fn next_inst(&self) -> Patch {
        Patch { hole: Hole::None, entry: self.insts.len() }
    }

    /// Backpatch every slot in `hole` to jump to `goto`.
    fn fill(&mut self, hole: Hole, goto: InstPtr) {
        match hole {
            Hole::None => {}
            Hole::One(pc) => {
                self.insts[pc].fill(goto);
            }
            Hole::Many(holes) => {
                for hole in holes {
                    self.fill(hole, goto);
                }
            }
        }
    }

    /// Backpatch `hole` to jump to the next instruction to be emitted.
    fn fill_to_next(&mut self, hole: Hole) {
        let next = self.insts.len();
        self.fill(hole, next);
    }

    /// Fill one or both branches of the split instruction(s) in `hole`,
    /// returning a hole for any branch that remains unfilled.
    fn fill_split(
        &mut self,
        hole: Hole,
        goto1: Option<InstPtr>,
        goto2: Option<InstPtr>,
    ) -> Hole {
        match hole {
            Hole::None => Hole::None,
            Hole::One(pc) => match (goto1, goto2) {
                (Some(goto1), Some(goto2)) => {
                    self.insts[pc].fill_split(goto1, goto2);
                    Hole::None
                }
                (Some(goto1), None) => {
                    self.insts[pc].half_fill_split_goto1(goto1);
                    Hole::One(pc)
                }
                (None, Some(goto2)) => {
                    self.insts[pc].half_fill_split_goto2(goto2);
                    Hole::One(pc)
                }
                (None, None) => unreachable!(
                    "at least one of the split \
                     holes must be filled"
                ),
            },
            Hole::Many(holes) => {
                let mut new_holes = vec![];
                for hole in holes {
                    new_holes.push(self.fill_split(hole, goto1, goto2));
                }
                if new_holes.is_empty() {
                    Hole::None
                } else if new_holes.len() == 1 {
                    new_holes.pop().unwrap()
                } else {
                    Hole::Many(new_holes)
                }
            }
        }
    }

    /// Append a fully compiled instruction.
    fn push_compiled(&mut self, inst: Inst) {
        self.insts.push(MaybeInst::Compiled(inst));
    }

    /// Append an instruction that still needs its `goto` patched, and
    /// return the hole referring to it.
    fn push_hole(&mut self, inst: InstHole) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Uncompiled(inst));
        Hole::One(hole)
    }

    /// Append an empty split instruction and return the hole referring to it.
    fn push_split_hole(&mut self) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Split);
        Hole::One(hole)
    }

    /// Discard the most recently pushed split; used when the expression
    /// under the split turned out to compile to nothing.
    fn pop_split_hole(&mut self) -> ResultOrEmpty {
        self.insts.pop();
        Ok(None)
    }

    /// Error if the program's approximate size exceeds the configured limit.
    /// This includes `extra_inst_bytes`, which is how excessive repetition
    /// of empty sub-expressions is bounded (CVE-2022-24713).
    fn check_size(&self) -> result::Result<(), Error> {
        use std::mem::size_of;

        let size =
            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
        if size > self.size_limit {
            Err(Error::CompiledTooBig(self.size_limit))
        } else {
            Ok(())
        }
    }
}
845
/// A `Hole` identifies the forward-jump slot(s) of partially compiled
/// instructions that must be backpatched once the address of the next
/// instruction is known.
#[derive(Debug)]
enum Hole {
    /// Nothing to patch.
    None,
    /// A single instruction at the given position awaits a jump target.
    One(InstPtr),
    /// Several holes that should all be patched to the same target.
    Many(Vec<Hole>),
}
852
853 impl Hole {
854 fn dup_one(self) -> (Self, Self) {
855 match self {
856 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
857 Hole::None | Hole::Many(_) => {
858 unreachable!("must be called on single hole")
859 }
860 }
861 }
862 }
863
/// An instruction that may still be waiting for one or both of its jump
/// targets before it is fully compiled.
#[derive(Clone, Debug)]
enum MaybeInst {
    /// A finished instruction.
    Compiled(Inst),
    /// An instruction missing only its single `goto` target.
    Uncompiled(InstHole),
    /// A split with neither branch filled in yet.
    Split,
    /// A split whose first branch is known; the second is still pending.
    Split1(InstPtr),
    /// A split whose second branch is known; the first is still pending.
    Split2(InstPtr),
}
872
impl MaybeInst {
    /// Patch this instruction with a single jump target. For `Split`, this
    /// fills the first branch only (producing `Split1`); for a half-filled
    /// split, it completes the remaining branch.
    fn fill(&mut self, goto: InstPtr) {
        let maybeinst = match *self {
            MaybeInst::Split => MaybeInst::Split1(goto),
            MaybeInst::Uncompiled(ref inst) => {
                MaybeInst::Compiled(inst.fill(goto))
            }
            MaybeInst::Split1(goto1) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto1,
                    goto2: goto,
                }))
            }
            MaybeInst::Split2(goto2) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto,
                    goto2: goto2,
                }))
            }
            // Filling an already-compiled instruction is a compiler bug.
            _ => unreachable!(
                "not all instructions were compiled! \
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    /// Complete a fresh `Split` with both branches at once.
    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => {
                Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
            }
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    /// Fill only the first branch of a fresh `Split`.
    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    /// Fill only the second branch of a fresh `Split`.
    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    /// Extract the finished instruction; panics if any hole remains.
    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}
950
/// A partially built instruction: all of an instruction's payload except
/// the `goto` pointer, which is supplied later via `fill`.
#[derive(Clone, Debug)]
enum InstHole {
    Save { slot: usize },
    EmptyLook { look: EmptyLook },
    Char { c: char },
    Ranges { ranges: Vec<(char, char)> },
    Bytes { start: u8, end: u8 },
}
959
960 impl InstHole {
961 fn fill(&self, goto: InstPtr) -> Inst {
962 match *self {
963 InstHole::Save { slot } => {
964 Inst::Save(InstSave { goto: goto, slot: slot })
965 }
966 InstHole::EmptyLook { look } => {
967 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
968 }
969 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
970 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
971 goto: goto,
972 ranges: ranges.clone().into_boxed_slice(),
973 }),
974 InstHole::Bytes { start, end } => {
975 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
976 }
977 }
978 }
979 }
980
/// Helper that lowers a class of Unicode scalar-value ranges into a
/// byte-based automaton by walking the UTF-8 encodings of the ranges.
struct CompileClass<'a, 'b> {
    // The compiler whose instruction list is appended to.
    c: &'a mut Compiler,
    // The (non-empty) scalar value ranges to compile.
    ranges: &'b [hir::ClassUnicodeRange],
}
985
986 impl<'a, 'b> CompileClass<'a, 'b> {
    /// Compile all of the ranges into a byte automaton: each UTF-8 sequence
    /// becomes one alternative, chained together with split instructions.
    /// The very last sequence is treated specially since it needs no split.
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        // Temporarily take ownership of the compiler's scratch sequence
        // generator; it is restored before returning.
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    // Final sequence of the final range: no split needed;
                    // route the previous split (if any) straight here.
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        // Return the scratch generator for reuse by later classes.
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }
1027
1028 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1029 if self.c.compiled.is_reverse {
1030 self.c_utf8_seq_(seq)
1031 } else {
1032 self.c_utf8_seq_(seq.into_iter().rev())
1033 }
1034 }
1035
1036 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1037 where
1038 I: IntoIterator<Item = &'r Utf8Range>,
1039 {
1040 // The initial instruction for each UTF-8 sequence should be the same.
1041 let mut from_inst = ::std::usize::MAX;
1042 let mut last_hole = Hole::None;
1043 for byte_range in seq {
1044 let key = SuffixCacheKey {
1045 from_inst: from_inst,
1046 start: byte_range.start,
1047 end: byte_range.end,
1048 };
1049 {
1050 let pc = self.c.insts.len();
1051 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1052 from_inst = cached_pc;
1053 continue;
1054 }
1055 }
1056 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1057 if from_inst == ::std::usize::MAX {
1058 last_hole = self.c.push_hole(InstHole::Bytes {
1059 start: byte_range.start,
1060 end: byte_range.end,
1061 });
1062 } else {
1063 self.c.push_compiled(Inst::Bytes(InstBytes {
1064 goto: from_inst,
1065 start: byte_range.start,
1066 end: byte_range.end,
1067 }));
1068 }
1069 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1070 debug_assert!(from_inst < ::std::usize::MAX);
1071 }
1072 debug_assert!(from_inst < ::std::usize::MAX);
1073 Ok(Patch { hole: last_hole, entry: from_inst })
1074 }
1075 }
1076
1077 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1078 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1079 /// The set of byte ranges looks like this:
1080 ///
1081 /// [0-7F]
1082 /// [C2-DF][80-BF]
1083 /// [E0][A0-BF][80-BF]
1084 /// [E1-EC][80-BF][80-BF]
1085 /// [ED][80-9F][80-BF]
1086 /// [EE-EF][80-BF][80-BF]
1087 ///
1088 /// Each line above translates to one alternate in the compiled regex program.
1089 /// However, all but one of the alternates end in the same suffix, which is
1090 /// a waste of an instruction. The suffix cache facilitates reusing them across
1091 /// alternates.
1092 ///
1093 /// Note that a HashMap could be trivially used for this, but we don't need its
1094 /// overhead. Some small bounded space (LRU style) is more than enough.
1095 ///
1096 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1097 /// except it uses hashes as original indices and then compares full keys for
1098 /// validation against `dense` array.
#[derive(Debug)]
struct SuffixCache {
    // Maps a key's hash bucket to an index into `dense`. Slots may be
    // stale; they are validated against the full key stored in `dense`.
    sparse: Box<[usize]>,
    // The authoritative list of cached entries, cleared before each class
    // compilation (see `clear`).
    dense: Vec<SuffixCacheEntry>,
}
1104
/// A cached suffix: the full key plus the program counter of the
/// instruction that was compiled for it.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}
1110
/// Identifies one byte-range instruction together with its jump target.
/// Two byte-range instructions with equal keys are interchangeable, so
/// they can be shared between alternates.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    // The instruction this byte range jumps to after matching.
    from_inst: InstPtr,
    // Inclusive start of the byte range.
    start: u8,
    // Inclusive end of the byte range.
    end: u8,
}
1117
1118 impl SuffixCache {
1119 fn new(size: usize) -> Self {
1120 SuffixCache {
1121 sparse: vec![0usize; size].into(),
1122 dense: Vec::with_capacity(size),
1123 }
1124 }
1125
1126 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1127 let hash = self.hash(&key);
1128 let pos = &mut self.sparse[hash];
1129 if let Some(entry) = self.dense.get(*pos) {
1130 if entry.key == key {
1131 return Some(entry.pc);
1132 }
1133 }
1134 *pos = self.dense.len();
1135 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1136 None
1137 }
1138
1139 fn clear(&mut self) {
1140 self.dense.clear();
1141 }
1142
1143 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1144 // Basic FNV-1a hash as described:
1145 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1146 const FNV_PRIME: u64 = 1099511628211;
1147 let mut h = 14695981039346656037;
1148 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1149 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1150 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1151 (h as usize) % self.sparse.len()
1152 }
1153 }
1154
/// Records equivalence-class boundaries between byte values: index `b` is
/// `true` when byte `b` ends a class, i.e. `b` and `b + 1` may be treated
/// differently by the compiled program.
struct ByteClassSet([bool; 256]);
1156
1157 impl ByteClassSet {
1158 fn new() -> Self {
1159 ByteClassSet([false; 256])
1160 }
1161
1162 fn set_range(&mut self, start: u8, end: u8) {
1163 debug_assert!(start <= end);
1164 if start > 0 {
1165 self.0[start as usize - 1] = true;
1166 }
1167 self.0[end as usize] = true;
1168 }
1169
1170 fn set_word_boundary(&mut self) {
1171 // We need to mark all ranges of bytes whose pairs result in
1172 // evaluating \b differently.
1173 let iswb = is_word_byte;
1174 let mut b1: u16 = 0;
1175 let mut b2: u16;
1176 while b1 <= 255 {
1177 b2 = b1 + 1;
1178 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1179 b2 += 1;
1180 }
1181 self.set_range(b1 as u8, (b2 - 1) as u8);
1182 b1 = b2;
1183 }
1184 }
1185
1186 fn byte_classes(&self) -> Vec<u8> {
1187 // N.B. If you're debugging the DFA, it's useful to simply return
1188 // `(0..256).collect()`, which effectively removes the byte classes
1189 // and makes the transitions easier to read.
1190 // (0usize..256).map(|x| x as u8).collect()
1191 let mut byte_classes = vec![0; 256];
1192 let mut class = 0u8;
1193 let mut i = 0;
1194 loop {
1195 byte_classes[i] = class as u8;
1196 if i >= 255 {
1197 break;
1198 }
1199 if self.0[i] {
1200 class = class.checked_add(1).unwrap();
1201 }
1202 i += 1;
1203 }
1204 byte_classes
1205 }
1206 }
1207
impl fmt::Debug for ByteClassSet {
    // NOTE(review): presumably hand-written because `[bool; 256]` did not
    // implement `Debug` on the crate's minimum supported Rust version; the
    // `&&self.0[..]` slice coercion sidesteps that — confirm against MSRV.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
    }
}
1213
/// Convert a `u32` to `usize`, panicking if the value does not fit.
///
/// On platforms where `usize` is at least 32 bits the guard is trivially
/// true and compiles to nothing; it only matters for 16-bit targets.
// TODO Use `std::convert::TryFrom` once it's stable.
fn u32_to_usize(n: u32) -> usize {
    assert!(
        (n as u64) <= (::std::usize::MAX as u64),
        "BUG: {} is too big to be pointer sized",
        n
    );
    n as usize
}
1223
#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    // A single range splits the byte space into three classes: bytes
    // below the range, the range itself, and bytes above it. Two disjoint
    // ranges (with a gap) yield four classes.
    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[b'a' as usize - 1], 0);
        assert_eq!(classes[b'a' as usize], 1);
        assert_eq!(classes[b'm' as usize], 1);
        assert_eq!(classes[b'z' as usize], 1);
        assert_eq!(classes[b'z' as usize + 1], 2);
        assert_eq!(classes[254], 2);
        assert_eq!(classes[255], 2);

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[3], 1);
        assert_eq!(classes[4], 2);
        assert_eq!(classes[5], 2);
        assert_eq!(classes[6], 2);
        assert_eq!(classes[7], 3);
        assert_eq!(classes[255], 3);
    }

    // Marking every byte as its own range must not overflow the class
    // counter and still produces a map covering all 256 byte values.
    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::new();
        for i in 0..256u16 {
            set.set_range(i as u8, i as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}