vendor/regex-1.4.3/src/prog.rs

   1 use std::cmp::Ordering;
   2 use std::collections::HashMap;
   3 use std::fmt;
   4 use std::mem;
   5 use std::ops::Deref;
   6 use std::slice;
   7 use std::sync::Arc;
   8
   9 use input::Char;
  10 use literal::LiteralSearcher;
  11
  12 /// `InstPtr` represents the index of an instruction in a regex program.
  13 pub type InstPtr = usize;
  14
  15 /// Program is a sequence of instructions and various facts about thos
  16 /// instructions.
  17 #[derive(Clone)]
  18 pub struct Program {
  19     /// A sequence of instructions that represents an NFA.
  20     pub insts: Vec<Inst>,
  21     /// Pointers to each Match instruction in the sequence.
  22     ///
  23     /// This is always length 1 unless this program represents a regex set.
  24     pub matches: Vec<InstPtr>,
  25     /// The ordered sequence of all capture groups extracted from the AST.
  26     /// Unnamed groups are `None`.
  27     pub captures: Vec<Option<String>>,
  28     /// Pointers to all named capture groups into `captures`.
  29     pub capture_name_idx: Arc<HashMap<String, usize>>,
  30     /// A pointer to the start instruction. This can vary depending on how
  31     /// the program was compiled. For example, programs for use with the DFA
  32     /// engine have a `.*?` inserted at the beginning of unanchored regular
  33     /// expressions. The actual starting point of the program is after the
  34     /// `.*?`.
  35     pub start: InstPtr,
  36     /// A set of equivalence classes for discriminating bytes in the compiled
  37     /// program.
  38     pub byte_classes: Vec<u8>,
  39     /// When true, this program can only match valid UTF-8.
  40     pub only_utf8: bool,
  41     /// When true, this program uses byte range instructions instead of Unicode
  42     /// range instructions.
  43     pub is_bytes: bool,
  44     /// When true, the program is compiled for DFA matching. For example, this
  45     /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
  46     /// regexes.
  47     pub is_dfa: bool,
  48     /// When true, the program matches text in reverse (for use only in the
  49     /// DFA).
  50     pub is_reverse: bool,
  51     /// Whether the regex must match from the start of the input.
  52     pub is_anchored_start: bool,
  53     /// Whether the regex must match at the end of the input.
  54     pub is_anchored_end: bool,
  55     /// Whether this program contains a Unicode word boundary instruction.
  56     pub has_unicode_word_boundary: bool,
  57     /// A possibly empty machine for very quickly matching prefix literals.
  58     pub prefixes: LiteralSearcher,
  59     /// A limit on the size of the cache that the DFA is allowed to use while
  60     /// matching.
  61     ///
  62     /// The cache limit specifies approximately how much space we're willing to
  63     /// give to the state cache. Once the state cache exceeds the size, it is
  64     /// wiped and all states must be re-computed.
  65     ///
  66     /// Note that this value does not impact correctness. It can be set to 0
  67     /// and the DFA will run just fine. (It will only ever store exactly one
  68     /// state in the cache, and will likely run very slowly, but it will work.)
  69     ///
  70     /// Also note that this limit is *per thread of execution*. That is,
  71     /// if the same regex is used to search text across multiple threads
  72     /// simultaneously, then the DFA cache is not shared. Instead, copies are
  73     /// made.
  74     pub dfa_size_limit: usize,
  75 }
  76
  77 impl Program {
  78     /// Creates an empty instruction sequence. Fields are given default
  79     /// values.
  80     pub fn new() -> Self {
  81         Program {
  82             insts: vec![],
  83             matches: vec![],
  84             captures: vec![],
  85             capture_name_idx: Arc::new(HashMap::new()),
  86             start: 0,
  87             byte_classes: vec![0; 256],
  88             only_utf8: true,
  89             is_bytes: false,
  90             is_dfa: false,
  91             is_reverse: false,
  92             is_anchored_start: false,
  93             is_anchored_end: false,
  94             has_unicode_word_boundary: false,
  95             prefixes: LiteralSearcher::empty(),
  96             dfa_size_limit: 2 * (1 << 20),
  97         }
  98     }
  99
 100     /// If pc is an index to a no-op instruction (like Save), then return the
 101     /// next pc that is not a no-op instruction.
 102     pub fn skip(&self, mut pc: usize) -> usize {
 103         loop {
 104             match self[pc] {
 105                 Inst::Save(ref i) => pc = i.goto,
 106                 _ => return pc,
 107             }
 108         }
 109     }
 110
 111     /// Return true if and only if an execution engine at instruction `pc` will
 112     /// always lead to a match.
 113     pub fn leads_to_match(&self, pc: usize) -> bool {
 114         if self.matches.len() > 1 {
 115             // If we have a regex set, then we have more than one ending
 116             // state, so leading to one of those states is generally
 117             // meaningless.
 118             return false;
 119         }
 120         match self[self.skip(pc)] {
 121             Inst::Match(_) => true,
 122             _ => false,
 123         }
 124     }
 125
 126     /// Returns true if the current configuration demands that an implicit
 127     /// `.*?` be prepended to the instruction sequence.
 128     pub fn needs_dotstar(&self) -> bool {
 129         self.is_dfa && !self.is_reverse && !self.is_anchored_start
 130     }
 131
 132     /// Returns true if this program uses Byte instructions instead of
 133     /// Char/Range instructions.
 134     pub fn uses_bytes(&self) -> bool {
 135         self.is_bytes || self.is_dfa
 136     }
 137
 138     /// Returns true if this program exclusively matches valid UTF-8 bytes.
 139     ///
 140     /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
 141     pub fn only_utf8(&self) -> bool {
 142         self.only_utf8
 143     }
 144
 145     /// Return the approximate heap usage of this instruction sequence in
 146     /// bytes.
 147     pub fn approximate_size(&self) -> usize {
 148         // The only instruction that uses heap space is Ranges (for
 149         // Unicode codepoint programs) to store non-overlapping codepoint
 150         // ranges. To keep this operation constant time, we ignore them.
 151         (self.len() * mem::size_of::<Inst>())
 152             + (self.matches.len() * mem::size_of::<InstPtr>())
 153             + (self.captures.len() * mem::size_of::<Option<String>>())
 154             + (self.capture_name_idx.len()
 155                 * (mem::size_of::<String>() + mem::size_of::<usize>()))
 156             + (self.byte_classes.len() * mem::size_of::<u8>())
 157             + self.prefixes.approximate_size()
 158     }
 159 }
 160
 161 impl Deref for Program {
 162     type Target = [Inst];
 163
 164     #[cfg_attr(feature = "perf-inline", inline(always))]
 165     fn deref(&self) -> &Self::Target {
 166         &*self.insts
 167     }
 168 }
 169
 170 impl fmt::Debug for Program {
 171     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 172         use self::Inst::*;
 173
 174         fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
 175             if goto == cur + 1 {
 176                 fmtd
 177             } else {
 178                 format!("{} (goto: {})", fmtd, goto)
 179             }
 180         }
 181
 182         fn visible_byte(b: u8) -> String {
 183             use std::ascii::escape_default;
 184             let escaped = escape_default(b).collect::<Vec<u8>>();
 185             String::from_utf8_lossy(&escaped).into_owned()
 186         }
 187
 188         for (pc, inst) in self.iter().enumerate() {
 189             match *inst {
 190                 Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
 191                 Save(ref inst) => {
 192                     let s = format!("{:04} Save({})", pc, inst.slot);
 193                     write!(f, "{}", with_goto(pc, inst.goto, s))?;
 194                 }
 195                 Split(ref inst) => {
 196                     write!(
 197                         f,
 198                         "{:04} Split({}, {})",
 199                         pc, inst.goto1, inst.goto2
 200                     )?;
 201                 }
 202                 EmptyLook(ref inst) => {
 203                     let s = format!("{:?}", inst.look);
 204                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
 205                 }
 206                 Char(ref inst) => {
 207                     let s = format!("{:?}", inst.c);
 208                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
 209                 }
 210                 Ranges(ref inst) => {
 211                     let ranges = inst
 212                         .ranges
 213                         .iter()
 214                         .map(|r| format!("{:?}-{:?}", r.0, r.1))
 215                         .collect::<Vec<String>>()
 216                         .join(", ");
 217                     write!(
 218                         f,
 219                         "{:04} {}",
 220                         pc,
 221                         with_goto(pc, inst.goto, ranges)
 222                     )?;
 223                 }
 224                 Bytes(ref inst) => {
 225                     let s = format!(
 226                         "Bytes({}, {})",
 227                         visible_byte(inst.start),
 228                         visible_byte(inst.end)
 229                     );
 230                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
 231                 }
 232             }
 233             if pc == self.start {
 234                 write!(f, " (start)")?;
 235             }
 236             write!(f, "\n")?;
 237         }
 238         Ok(())
 239     }
 240 }
 241
 242 impl<'a> IntoIterator for &'a Program {
 243     type Item = &'a Inst;
 244     type IntoIter = slice::Iter<'a, Inst>;
 245     fn into_iter(self) -> Self::IntoIter {
 246         self.iter()
 247     }
 248 }
 249
 250 /// Inst is an instruction code in a Regex program.
 251 ///
 252 /// Regrettably, a regex program either contains Unicode codepoint
 253 /// instructions (Char and Ranges) or it contains byte instructions (Bytes).
 254 /// A regex program can never contain both.
 255 ///
 256 /// It would be worth investigating splitting this into two distinct types and
 257 /// then figuring out how to make the matching engines polymorphic over those
 258 /// types without sacrificing performance.
 259 ///
 260 /// Other than the benefit of moving invariants into the type system, another
 261 /// benefit is the decreased size. If we remove the `Char` and `Ranges`
 262 /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
 263 /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
 264 /// variant.) Given that byte based machines are typically much bigger than
 265 /// their Unicode analogues (because they can decode UTF-8 directly), this ends
 266 /// up being a pretty significant savings.
 267 #[derive(Clone, Debug)]
 268 pub enum Inst {
 269     /// Match indicates that the program has reached a match state.
 270     ///
 271     /// The number in the match corresponds to the Nth logical regular
 272     /// expression in this program. This index is always 0 for normal regex
 273     /// programs. Values greater than 0 appear when compiling regex sets, and
 274     /// each match instruction gets its own unique value. The value corresponds
 275     /// to the Nth regex in the set.
 276     Match(usize),
 277     /// Save causes the program to save the current location of the input in
 278     /// the slot indicated by InstSave.
 279     Save(InstSave),
 280     /// Split causes the program to diverge to one of two paths in the
 281     /// program, preferring goto1 in InstSplit.
 282     Split(InstSplit),
 283     /// EmptyLook represents a zero-width assertion in a regex program. A
 284     /// zero-width assertion does not consume any of the input text.
 285     EmptyLook(InstEmptyLook),
 286     /// Char requires the regex program to match the character in InstChar at
 287     /// the current position in the input.
 288     Char(InstChar),
 289     /// Ranges requires the regex program to match the character at the current
 290     /// position in the input with one of the ranges specified in InstRanges.
 291     Ranges(InstRanges),
 292     /// Bytes is like Ranges, except it expresses a single byte range. It is
 293     /// used in conjunction with Split instructions to implement multi-byte
 294     /// character classes.
 295     Bytes(InstBytes),
 296 }
 297
 298 impl Inst {
 299     /// Returns true if and only if this is a match instruction.
 300     pub fn is_match(&self) -> bool {
 301         match *self {
 302             Inst::Match(_) => true,
 303             _ => false,
 304         }
 305     }
 306 }
 307
 308 /// Representation of the Save instruction.
 309 #[derive(Clone, Debug)]
 310 pub struct InstSave {
 311     /// The next location to execute in the program.
 312     pub goto: InstPtr,
 313     /// The capture slot (there are two slots for every capture in a regex,
 314     /// including the zeroth capture for the entire match).
 315     pub slot: usize,
 316 }
 317
 318 /// Representation of the Split instruction.
 319 #[derive(Clone, Debug)]
 320 pub struct InstSplit {
 321     /// The first instruction to try. A match resulting from following goto1
 322     /// has precedence over a match resulting from following goto2.
 323     pub goto1: InstPtr,
 324     /// The second instruction to try. A match resulting from following goto1
 325     /// has precedence over a match resulting from following goto2.
 326     pub goto2: InstPtr,
 327 }
 328
 329 /// Representation of the `EmptyLook` instruction.
 330 #[derive(Clone, Debug)]
 331 pub struct InstEmptyLook {
 332     /// The next location to execute in the program if this instruction
 333     /// succeeds.
 334     pub goto: InstPtr,
 335     /// The type of zero-width assertion to check.
 336     pub look: EmptyLook,
 337 }
 338
 339 /// The set of zero-width match instructions.
 340 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 341 pub enum EmptyLook {
 342     /// Start of line or input.
 343     StartLine,
 344     /// End of line or input.
 345     EndLine,
 346     /// Start of input.
 347     StartText,
 348     /// End of input.
 349     EndText,
 350     /// Word character on one side and non-word character on other.
 351     WordBoundary,
 352     /// Word character on both sides or non-word character on both sides.
 353     NotWordBoundary,
 354     /// ASCII word boundary.
 355     WordBoundaryAscii,
 356     /// Not ASCII word boundary.
 357     NotWordBoundaryAscii,
 358 }
 359
 360 /// Representation of the Char instruction.
 361 #[derive(Clone, Debug)]
 362 pub struct InstChar {
 363     /// The next location to execute in the program if this instruction
 364     /// succeeds.
 365     pub goto: InstPtr,
 366     /// The character to test.
 367     pub c: char,
 368 }
 369
 370 /// Representation of the Ranges instruction.
 371 #[derive(Clone, Debug)]
 372 pub struct InstRanges {
 373     /// The next location to execute in the program if this instruction
 374     /// succeeds.
 375     pub goto: InstPtr,
 376     /// The set of Unicode scalar value ranges to test.
 377     pub ranges: Vec<(char, char)>,
 378 }
 379
 380 impl InstRanges {
 381     /// Tests whether the given input character matches this instruction.
 382     pub fn matches(&self, c: Char) -> bool {
 383         // This speeds up the `match_class_unicode` benchmark by checking
 384         // some common cases quickly without binary search. e.g., Matching
 385         // a Unicode class on predominantly ASCII text.
 386         for r in self.ranges.iter().take(4) {
 387             if c < r.0 {
 388                 return false;
 389             }
 390             if c <= r.1 {
 391                 return true;
 392             }
 393         }
 394         self.ranges
 395             .binary_search_by(|r| {
 396                 if r.1 < c {
 397                     Ordering::Less
 398                 } else if r.0 > c {
 399                     Ordering::Greater
 400                 } else {
 401                     Ordering::Equal
 402                 }
 403             })
 404             .is_ok()
 405     }
 406
 407     /// Return the number of distinct characters represented by all of the
 408     /// ranges.
 409     pub fn num_chars(&self) -> usize {
 410         self.ranges
 411             .iter()
 412             .map(|&(s, e)| 1 + (e as u32) - (s as u32))
 413             .sum::<u32>() as usize
 414     }
 415 }
 416
 417 /// Representation of the Bytes instruction.
 418 #[derive(Clone, Debug)]
 419 pub struct InstBytes {
 420     /// The next location to execute in the program if this instruction
 421     /// succeeds.
 422     pub goto: InstPtr,
 423     /// The start (inclusive) of this byte range.
 424     pub start: u8,
 425     /// The end (inclusive) of this byte range.
 426     pub end: u8,
 427 }
 428
 429 impl InstBytes {
 430     /// Returns true if and only if the given byte is in this range.
 431     pub fn matches(&self, byte: u8) -> bool {
 432         self.start <= byte && byte <= self.end
 433     }
 434 }