[rustc.git] / vendor / regex / src / prog.rs

use std::cmp::Ordering;
use std::collections::HashMap;
use std::fmt;
use std::mem;
use std::ops::Deref;
use std::slice;
use std::sync::Arc;

use input::Char;
use literal::LiteralSearcher;

/// `InstPtr` represents the index of an instruction in a regex program.
pub type InstPtr = usize;

/// Program is a sequence of instructions and various facts about thos
/// instructions.
#[derive(Clone)]
pub struct Program {
    /// A sequence of instructions that represents an NFA.
    pub insts: Vec<Inst>,
    /// Pointers to each Match instruction in the sequence.
    ///
    /// This is always length 1 unless this program represents a regex set.
    pub matches: Vec<InstPtr>,
    /// The ordered sequence of all capture groups extracted from the AST.
    /// Unnamed groups are `None`.
    pub captures: Vec<Option<String>>,
    /// Pointers to all named capture groups into `captures`.
    pub capture_name_idx: Arc<HashMap<String, usize>>,
    /// A pointer to the start instruction. This can vary depending on how
    /// the program was compiled. For example, programs for use with the DFA
    /// engine have a `.*?` inserted at the beginning of unanchored regular
    /// expressions. The actual starting point of the program is after the
    /// `.*?`.
    pub start: InstPtr,
    /// A set of equivalence classes for discriminating bytes in the compiled
    /// program.
    pub byte_classes: Vec<u8>,
    /// When true, this program can only match valid UTF-8.
    pub only_utf8: bool,
    /// When true, this program uses byte range instructions instead of Unicode
    /// range instructions.
    pub is_bytes: bool,
    /// When true, the program is compiled for DFA matching. For example, this
    /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
    /// regexes.
    pub is_dfa: bool,
    /// When true, the program matches text in reverse (for use only in the
    /// DFA).
    pub is_reverse: bool,
    /// Whether the regex must match from the start of the input.
    pub is_anchored_start: bool,
    /// Whether the regex must match at the end of the input.
    pub is_anchored_end: bool,
    /// Whether this program contains a Unicode word boundary instruction.
    pub has_unicode_word_boundary: bool,
    /// A possibly empty machine for very quickly matching prefix literals.
    pub prefixes: LiteralSearcher,
    /// A limit on the size of the cache that the DFA is allowed to use while
    /// matching.
    ///
    /// The cache limit specifies approximately how much space we're willing to
    /// give to the state cache. Once the state cache exceeds the size, it is
    /// wiped and all states must be re-computed.
    ///
    /// Note that this value does not impact correctness. It can be set to 0
    /// and the DFA will run just fine. (It will only ever store exactly one
    /// state in the cache, and will likely run very slowly, but it will work.)
    ///
    /// Also note that this limit is *per thread of execution*. That is,
    /// if the same regex is used to search text across multiple threads
    /// simultaneously, then the DFA cache is not shared. Instead, copies are
    /// made.
    pub dfa_size_limit: usize,
}

impl Program {
    /// Creates an empty instruction sequence. Fields are given default
    /// values.
    pub fn new() -> Self {
        Program {
            insts: vec![],
            matches: vec![],
            captures: vec![],
            capture_name_idx: Arc::new(HashMap::new()),
            start: 0,
            byte_classes: vec![0; 256],
            only_utf8: true,
            is_bytes: false,
            is_dfa: false,
            is_reverse: false,
            is_anchored_start: false,
            is_anchored_end: false,
            has_unicode_word_boundary: false,
            prefixes: LiteralSearcher::empty(),
            dfa_size_limit: 2 * (1 << 20),
        }
    }

    /// If pc is an index to a no-op instruction (like Save), then return the
    /// next pc that is not a no-op instruction.
    pub fn skip(&self, mut pc: usize) -> usize {
        loop {
            match self[pc] {
                Inst::Save(ref i) => pc = i.goto,
                _ => return pc,
            }
        }
    }

    /// Return true if and only if an execution engine at instruction `pc` will
    /// always lead to a match.
    pub fn leads_to_match(&self, pc: usize) -> bool {
        if self.matches.len() > 1 {
            // If we have a regex set, then we have more than one ending
            // state, so leading to one of those states is generally
            // meaningless.
            return false;
        }
        match self[self.skip(pc)] {
            Inst::Match(_) => true,
            _ => false,
        }
    }

    /// Returns true if the current configuration demands that an implicit
    /// `.*?` be prepended to the instruction sequence.
    pub fn needs_dotstar(&self) -> bool {
        self.is_dfa && !self.is_reverse && !self.is_anchored_start
    }

    /// Returns true if this program uses Byte instructions instead of
    /// Char/Range instructions.
    pub fn uses_bytes(&self) -> bool {
        self.is_bytes || self.is_dfa
    }

    /// Returns true if this program exclusively matches valid UTF-8 bytes.
    ///
    /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
    pub fn only_utf8(&self) -> bool {
        self.only_utf8
    }

    /// Return the approximate heap usage of this instruction sequence in
    /// bytes.
    pub fn approximate_size(&self) -> usize {
        // The only instruction that uses heap space is Ranges (for
        // Unicode codepoint programs) to store non-overlapping codepoint
        // ranges. To keep this operation constant time, we ignore them.
        (self.len() * mem::size_of::<Inst>())
            + (self.matches.len() * mem::size_of::<InstPtr>())
            + (self.captures.len() * mem::size_of::<Option<String>>())
            + (self.capture_name_idx.len()
                * (mem::size_of::<String>() + mem::size_of::<usize>()))
            + (self.byte_classes.len() * mem::size_of::<u8>())
            + self.prefixes.approximate_size()
    }
}

impl Deref for Program {
    type Target = [Inst];

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn deref(&self) -> &Self::Target {
        &*self.insts
    }
}

impl fmt::Debug for Program {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Inst::*;

        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
            if goto == cur + 1 {
                fmtd
            } else {
                format!("{} (goto: {})", fmtd, goto)
            }
        }

        fn visible_byte(b: u8) -> String {
            use std::ascii::escape_default;
            let escaped = escape_default(b).collect::<Vec<u8>>();
            String::from_utf8_lossy(&escaped).into_owned()
        }

        for (pc, inst) in self.iter().enumerate() {
            match *inst {
                Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
                Save(ref inst) => {
                    let s = format!("{:04} Save({})", pc, inst.slot);
                    write!(f, "{}", with_goto(pc, inst.goto, s))?;
                }
                Split(ref inst) => {
                    write!(
                        f,
                        "{:04} Split({}, {})",
                        pc, inst.goto1, inst.goto2
                    )?;
                }
                EmptyLook(ref inst) => {
                    let s = format!("{:?}", inst.look);
                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
                }
                Char(ref inst) => {
                    let s = format!("{:?}", inst.c);
                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
                }
                Ranges(ref inst) => {
                    let ranges = inst
                        .ranges
                        .iter()
                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
                        .collect::<Vec<String>>()
                        .join(", ");
                    write!(
                        f,
                        "{:04} {}",
                        pc,
                        with_goto(pc, inst.goto, ranges)
                    )?;
                }
                Bytes(ref inst) => {
                    let s = format!(
                        "Bytes({}, {})",
                        visible_byte(inst.start),
                        visible_byte(inst.end)
                    );
                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
                }
            }
            if pc == self.start {
                write!(f, " (start)")?;
            }
            write!(f, "\n")?;
        }
        Ok(())
    }
}

impl<'a> IntoIterator for &'a Program {
    type Item = &'a Inst;
    type IntoIter = slice::Iter<'a, Inst>;
    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

/// Inst is an instruction code in a Regex program.
///
/// Regrettably, a regex program either contains Unicode codepoint
/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
/// A regex program can never contain both.
///
/// It would be worth investigating splitting this into two distinct types and
/// then figuring out how to make the matching engines polymorphic over those
/// types without sacrificing performance.
///
/// Other than the benefit of moving invariants into the type system, another
/// benefit is the decreased size. If we remove the `Char` and `Ranges`
/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
/// variant.) Given that byte based machines are typically much bigger than
/// their Unicode analogues (because they can decode UTF-8 directly), this ends
/// up being a pretty significant savings.
#[derive(Clone, Debug)]
pub enum Inst {
    /// Match indicates that the program has reached a match state.
    ///
    /// The number in the match corresponds to the Nth logical regular
    /// expression in this program. This index is always 0 for normal regex
    /// programs. Values greater than 0 appear when compiling regex sets, and
    /// each match instruction gets its own unique value. The value corresponds
    /// to the Nth regex in the set.
    Match(usize),
    /// Save causes the program to save the current location of the input in
    /// the slot indicated by InstSave.
    Save(InstSave),
    /// Split causes the program to diverge to one of two paths in the
    /// program, preferring goto1 in InstSplit.
    Split(InstSplit),
    /// EmptyLook represents a zero-width assertion in a regex program. A
    /// zero-width assertion does not consume any of the input text.
    EmptyLook(InstEmptyLook),
    /// Char requires the regex program to match the character in InstChar at
    /// the current position in the input.
    Char(InstChar),
    /// Ranges requires the regex program to match the character at the current
    /// position in the input with one of the ranges specified in InstRanges.
    Ranges(InstRanges),
    /// Bytes is like Ranges, except it expresses a single byte range. It is
    /// used in conjunction with Split instructions to implement multi-byte
    /// character classes.
    Bytes(InstBytes),
}

impl Inst {
    /// Returns true if and only if this is a match instruction.
    pub fn is_match(&self) -> bool {
        match *self {
            Inst::Match(_) => true,
            _ => false,
        }
    }
}

/// Representation of the Save instruction.
#[derive(Clone, Debug)]
pub struct InstSave {
    /// The next location to execute in the program.
    pub goto: InstPtr,
    /// The capture slot (there are two slots for every capture in a regex,
    /// including the zeroth capture for the entire match).
    pub slot: usize,
}

/// Representation of the Split instruction.
#[derive(Clone, Debug)]
pub struct InstSplit {
    /// The first instruction to try. A match resulting from following goto1
    /// has precedence over a match resulting from following goto2.
    pub goto1: InstPtr,
    /// The second instruction to try. A match resulting from following goto1
    /// has precedence over a match resulting from following goto2.
    pub goto2: InstPtr,
}

/// Representation of the `EmptyLook` instruction.
#[derive(Clone, Debug)]
pub struct InstEmptyLook {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The type of zero-width assertion to check.
    pub look: EmptyLook,
}

/// The set of zero-width match instructions.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EmptyLook {
    /// Start of line or input.
    StartLine,
    /// End of line or input.
    EndLine,
    /// Start of input.
    StartText,
    /// End of input.
    EndText,
    /// Word character on one side and non-word character on other.
    WordBoundary,
    /// Word character on both sides or non-word character on both sides.
    NotWordBoundary,
    /// ASCII word boundary.
    WordBoundaryAscii,
    /// Not ASCII word boundary.
    NotWordBoundaryAscii,
}

/// Representation of the Char instruction.
#[derive(Clone, Debug)]
pub struct InstChar {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The character to test.
    pub c: char,
}

/// Representation of the Ranges instruction.
#[derive(Clone, Debug)]
pub struct InstRanges {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The set of Unicode scalar value ranges to test.
    pub ranges: Box<[(char, char)]>,
}

impl InstRanges {
    /// Tests whether the given input character matches this instruction.
    pub fn matches(&self, c: Char) -> bool {
        // This speeds up the `match_class_unicode` benchmark by checking
        // some common cases quickly without binary search. e.g., Matching
        // a Unicode class on predominantly ASCII text.
        for r in self.ranges.iter().take(4) {
            if c < r.0 {
                return false;
            }
            if c <= r.1 {
                return true;
            }
        }
        self.ranges
            .binary_search_by(|r| {
                if r.1 < c {
                    Ordering::Less
                } else if r.0 > c {
                    Ordering::Greater
                } else {
                    Ordering::Equal
                }
            })
            .is_ok()
    }

    /// Return the number of distinct characters represented by all of the
    /// ranges.
    pub fn num_chars(&self) -> usize {
        self.ranges
            .iter()
            .map(|&(s, e)| 1 + (e as u32) - (s as u32))
            .sum::<u32>() as usize
    }
}

/// Representation of the Bytes instruction.
#[derive(Clone, Debug)]
pub struct InstBytes {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The start (inclusive) of this byte range.
    pub start: u8,
    /// The end (inclusive) of this byte range.
    pub end: u8,
}

impl InstBytes {
    /// Returns true if and only if the given byte is in this range.
    pub fn matches(&self, byte: u8) -> bool {
        self.start <= byte && byte <= self.end
    }
}

#[cfg(test)]
mod test {
    #[test]
    #[cfg(target_pointer_width = "64")]
    fn test_size_of_inst() {
        use std::mem::size_of;

        use super::Inst;

        assert_eq!(32, size_of::<Inst>());
    }
}
Commit	Line	Data
8bb4bdeb	1	use std::cmp::Ordering;
f9f354fc	2	use std::collections::HashMap;
8bb4bdeb	3	use std::fmt;
8bb4bdeb	4	use std::mem;
f9f354fc	5	use std::ops::Deref;
8bb4bdeb XL	6	use std::slice;
	7	use std::sync::Arc;
	8
	9	use input::Char;
0531ce1d	10	use literal::LiteralSearcher;
8bb4bdeb	11
ff7c6d11	12	/// `InstPtr` represents the index of an instruction in a regex program.
8bb4bdeb XL	13	pub type InstPtr = usize;
	14
	15	/// Program is a sequence of instructions and various facts about thos
	16	/// instructions.
	17	#[derive(Clone)]
	18	pub struct Program {
	19	/// A sequence of instructions that represents an NFA.
	20	pub insts: Vec<Inst>,
	21	/// Pointers to each Match instruction in the sequence.
	22	///
	23	/// This is always length 1 unless this program represents a regex set.
	24	pub matches: Vec<InstPtr>,
	25	/// The ordered sequence of all capture groups extracted from the AST.
	26	/// Unnamed groups are `None`.
	27	pub captures: Vec<Option<String>>,
	28	/// Pointers to all named capture groups into `captures`.
	29	pub capture_name_idx: Arc<HashMap<String, usize>>,
	30	/// A pointer to the start instruction. This can vary depending on how
	31	/// the program was compiled. For example, programs for use with the DFA
	32	/// engine have a `.*?` inserted at the beginning of unanchored regular
	33	/// expressions. The actual starting point of the program is after the
	34	/// `.*?`.
	35	pub start: InstPtr,
	36	/// A set of equivalence classes for discriminating bytes in the compiled
	37	/// program.
	38	pub byte_classes: Vec<u8>,
	39	/// When true, this program can only match valid UTF-8.
	40	pub only_utf8: bool,
	41	/// When true, this program uses byte range instructions instead of Unicode
	42	/// range instructions.
	43	pub is_bytes: bool,
	44	/// When true, the program is compiled for DFA matching. For example, this
	45	/// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
	46	/// regexes.
	47	pub is_dfa: bool,
	48	/// When true, the program matches text in reverse (for use only in the
	49	/// DFA).
	50	pub is_reverse: bool,
	51	/// Whether the regex must match from the start of the input.
	52	pub is_anchored_start: bool,
	53	/// Whether the regex must match at the end of the input.
	54	pub is_anchored_end: bool,
	55	/// Whether this program contains a Unicode word boundary instruction.
	56	pub has_unicode_word_boundary: bool,
	57	/// A possibly empty machine for very quickly matching prefix literals.
	58	pub prefixes: LiteralSearcher,
	59	/// A limit on the size of the cache that the DFA is allowed to use while
	60	/// matching.
	61	///
	62	/// The cache limit specifies approximately how much space we're willing to
	63	/// give to the state cache. Once the state cache exceeds the size, it is
	64	/// wiped and all states must be re-computed.
	65	///
	66	/// Note that this value does not impact correctness. It can be set to 0
	67	/// and the DFA will run just fine. (It will only ever store exactly one
	68	/// state in the cache, and will likely run very slowly, but it will work.)
	69	///
	70	/// Also note that this limit is per thread of execution. That is,
	71	/// if the same regex is used to search text across multiple threads
	72	/// simultaneously, then the DFA cache is not shared. Instead, copies are
	73	/// made.
	74	pub dfa_size_limit: usize,
	75	}
	76
77	impl Program {
78	/// Creates an empty instruction sequence. Fields are given default
79	/// values.
80	pub fn new() -> Self {
81	Program {
82	insts: vec![],
83	matches: vec![],
84	captures: vec![],
85	capture_name_idx: Arc::new(HashMap::new()),
86	start: 0,
87	byte_classes: vec![0; 256],
88	only_utf8: true,
89	is_bytes: false,
90	is_dfa: false,
91	is_reverse: false,
92	is_anchored_start: false,
93	is_anchored_end: false,
94	has_unicode_word_boundary: false,
95	prefixes: LiteralSearcher::empty(),
f9f354fc	96	dfa_size_limit: 2 * (1 << 20),
8bb4bdeb XL	97	}
	98	}
	99
	100	/// If pc is an index to a no-op instruction (like Save), then return the
	101	/// next pc that is not a no-op instruction.
	102	pub fn skip(&self, mut pc: usize) -> usize {
	103	loop {
	104	match self[pc] {
	105	Inst::Save(ref i) => pc = i.goto,
	106	_ => return pc,
	107	}
	108	}
	109	}
	110
	111	/// Return true if and only if an execution engine at instruction `pc` will
	112	/// always lead to a match.
	113	pub fn leads_to_match(&self, pc: usize) -> bool {
	114	if self.matches.len() > 1 {
	115	// If we have a regex set, then we have more than one ending
	116	// state, so leading to one of those states is generally
	117	// meaningless.
	118	return false;
	119	}
	120	match self[self.skip(pc)] {
	121	Inst::Match(_) => true,
	122	_ => false,
	123	}
	124	}
	125
	126	/// Returns true if the current configuration demands that an implicit
	127	/// `.*?` be prepended to the instruction sequence.
	128	pub fn needs_dotstar(&self) -> bool {
	129	self.is_dfa && !self.is_reverse && !self.is_anchored_start
	130	}
	131
	132	/// Returns true if this program uses Byte instructions instead of
	133	/// Char/Range instructions.
	134	pub fn uses_bytes(&self) -> bool {
	135	self.is_bytes \|\| self.is_dfa
	136	}
	137
	138	/// Returns true if this program exclusively matches valid UTF-8 bytes.
	139	///
	140	/// That is, if an invalid UTF-8 byte is seen, then no match is possible.
	141	pub fn only_utf8(&self) -> bool {
	142	self.only_utf8
	143	}
	144
	145	/// Return the approximate heap usage of this instruction sequence in
	146	/// bytes.
	147	pub fn approximate_size(&self) -> usize {
	148	// The only instruction that uses heap space is Ranges (for
	149	// Unicode codepoint programs) to store non-overlapping codepoint
	150	// ranges. To keep this operation constant time, we ignore them.
	151	(self.len() * mem::size_of::<Inst>())
f9f354fc XL	152	+ (self.matches.len() * mem::size_of::<InstPtr>())
	153	+ (self.captures.len() * mem::size_of::<Option<String>>())
	154	+ (self.capture_name_idx.len()
	155	* (mem::size_of::<String>() + mem::size_of::<usize>()))
	156	+ (self.byte_classes.len() * mem::size_of::<u8>())
	157	+ self.prefixes.approximate_size()
8bb4bdeb XL	158	}
	159	}
	160
	161	impl Deref for Program {
	162	type Target = [Inst];
	163
f9f354fc	164	#[cfg_attr(feature = "perf-inline", inline(always))]
8bb4bdeb XL	165	fn deref(&self) -> &Self::Target {
	166	&*self.insts
	167	}
	168	}
	169
	170	impl fmt::Debug for Program {
	171	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	172	use self::Inst::*;
	173
	174	fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
	175	if goto == cur + 1 {
	176	fmtd
	177	} else {
	178	format!("{} (goto: {})", fmtd, goto)
	179	}
	180	}
	181
	182	fn visible_byte(b: u8) -> String {
	183	use std::ascii::escape_default;
	184	let escaped = escape_default(b).collect::<Vec<u8>>();
	185	String::from_utf8_lossy(&escaped).into_owned()
	186	}
	187
	188	for (pc, inst) in self.iter().enumerate() {
	189	match *inst {
f9f354fc	190	Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
8bb4bdeb XL	191	Save(ref inst) => {
8bb4bdeb XL	192	let s = format!("{:04} Save({})", pc, inst.slot);
94b46f34	193	write!(f, "{}", with_goto(pc, inst.goto, s))?;
8bb4bdeb XL	194	}
8bb4bdeb XL	195	Split(ref inst) => {
94b46f34	196	write!(
f9f354fc XL	197	f,
	198	"{:04} Split({}, {})",
	199	pc, inst.goto1, inst.goto2
	200	)?;
8bb4bdeb XL	201	}
	202	EmptyLook(ref inst) => {
	203	let s = format!("{:?}", inst.look);
94b46f34	204	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb XL	205	}
	206	Char(ref inst) => {
	207	let s = format!("{:?}", inst.c);
94b46f34	208	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb XL	209	}
8bb4bdeb XL	210	Ranges(ref inst) => {
f9f354fc XL	211	let ranges = inst
f9f354fc XL	212	.ranges
8bb4bdeb XL	213	.iter()
	214	.map(\|r\| format!("{:?}-{:?}", r.0, r.1))
	215	.collect::<Vec<String>>()
	216	.join(", ");
94b46f34	217	write!(
f9f354fc XL	218	f,
	219	"{:04} {}",
	220	pc,
	221	with_goto(pc, inst.goto, ranges)
	222	)?;
8bb4bdeb XL	223	}
	224	Bytes(ref inst) => {
	225	let s = format!(
	226	"Bytes({}, {})",
	227	visible_byte(inst.start),
f9f354fc XL	228	visible_byte(inst.end)
f9f354fc XL	229	);
94b46f34	230	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb XL	231	}
	232	}
	233	if pc == self.start {
94b46f34	234	write!(f, " (start)")?;
8bb4bdeb	235	}
94b46f34	236	write!(f, "\n")?;
8bb4bdeb XL	237	}
	238	Ok(())
	239	}
	240	}
	241
	242	impl<'a> IntoIterator for &'a Program {
	243	type Item = &'a Inst;
	244	type IntoIter = slice::Iter<'a, Inst>;
f9f354fc XL	245	fn into_iter(self) -> Self::IntoIter {
	246	self.iter()
	247	}
8bb4bdeb XL	248	}
	249
	250	/// Inst is an instruction code in a Regex program.
	251	///
	252	/// Regrettably, a regex program either contains Unicode codepoint
	253	/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
	254	/// A regex program can never contain both.
	255	///
	256	/// It would be worth investigating splitting this into two distinct types and
	257	/// then figuring out how to make the matching engines polymorphic over those
	258	/// types without sacrificing performance.
	259	///
	260	/// Other than the benefit of moving invariants into the type system, another
	261	/// benefit is the decreased size. If we remove the `Char` and `Ranges`
cdc7bbd5 XL	262	/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
cdc7bbd5 XL	263	/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
8bb4bdeb XL	264	/// variant.) Given that byte based machines are typically much bigger than
	265	/// their Unicode analogues (because they can decode UTF-8 directly), this ends
	266	/// up being a pretty significant savings.
	267	#[derive(Clone, Debug)]
	268	pub enum Inst {
	269	/// Match indicates that the program has reached a match state.
	270	///
	271	/// The number in the match corresponds to the Nth logical regular
	272	/// expression in this program. This index is always 0 for normal regex
	273	/// programs. Values greater than 0 appear when compiling regex sets, and
	274	/// each match instruction gets its own unique value. The value corresponds
	275	/// to the Nth regex in the set.
	276	Match(usize),
	277	/// Save causes the program to save the current location of the input in
	278	/// the slot indicated by InstSave.
	279	Save(InstSave),
	280	/// Split causes the program to diverge to one of two paths in the
	281	/// program, preferring goto1 in InstSplit.
	282	Split(InstSplit),
	283	/// EmptyLook represents a zero-width assertion in a regex program. A
	284	/// zero-width assertion does not consume any of the input text.
	285	EmptyLook(InstEmptyLook),
	286	/// Char requires the regex program to match the character in InstChar at
	287	/// the current position in the input.
	288	Char(InstChar),
	289	/// Ranges requires the regex program to match the character at the current
	290	/// position in the input with one of the ranges specified in InstRanges.
	291	Ranges(InstRanges),
	292	/// Bytes is like Ranges, except it expresses a single byte range. It is
	293	/// used in conjunction with Split instructions to implement multi-byte
	294	/// character classes.
	295	Bytes(InstBytes),
	296	}
	297
	298	impl Inst {
	299	/// Returns true if and only if this is a match instruction.
	300	pub fn is_match(&self) -> bool {
	301	match *self {
	302	Inst::Match(_) => true,
	303	_ => false,
	304	}
	305	}
	306	}
	307
	308	/// Representation of the Save instruction.
	309	#[derive(Clone, Debug)]
	310	pub struct InstSave {
	311	/// The next location to execute in the program.
	312	pub goto: InstPtr,
	313	/// The capture slot (there are two slots for every capture in a regex,
	314	/// including the zeroth capture for the entire match).
	315	pub slot: usize,
	316	}
	317
	318	/// Representation of the Split instruction.
	319	#[derive(Clone, Debug)]
	320	pub struct InstSplit {
	321	/// The first instruction to try. A match resulting from following goto1
	322	/// has precedence over a match resulting from following goto2.
	323	pub goto1: InstPtr,
	324	/// The second instruction to try. A match resulting from following goto1
	325	/// has precedence over a match resulting from following goto2.
	326	pub goto2: InstPtr,
	327	}
328
ff7c6d11	329	/// Representation of the `EmptyLook` instruction.
8bb4bdeb XL	330	#[derive(Clone, Debug)]
	331	pub struct InstEmptyLook {
	332	/// The next location to execute in the program if this instruction
	333	/// succeeds.
	334	pub goto: InstPtr,
	335	/// The type of zero-width assertion to check.
	336	pub look: EmptyLook,
	337	}
	338
	339	/// The set of zero-width match instructions.
	340	#[derive(Clone, Copy, Debug, PartialEq, Eq)]
	341	pub enum EmptyLook {
	342	/// Start of line or input.
	343	StartLine,
	344	/// End of line or input.
	345	EndLine,
	346	/// Start of input.
	347	StartText,
	348	/// End of input.
	349	EndText,
	350	/// Word character on one side and non-word character on other.
	351	WordBoundary,
	352	/// Word character on both sides or non-word character on both sides.
	353	NotWordBoundary,
	354	/// ASCII word boundary.
	355	WordBoundaryAscii,
	356	/// Not ASCII word boundary.
	357	NotWordBoundaryAscii,
	358	}
	359
	360	/// Representation of the Char instruction.
	361	#[derive(Clone, Debug)]
	362	pub struct InstChar {
	363	/// The next location to execute in the program if this instruction
	364	/// succeeds.
	365	pub goto: InstPtr,
	366	/// The character to test.
	367	pub c: char,
	368	}
	369
	370	/// Representation of the Ranges instruction.
	371	#[derive(Clone, Debug)]
	372	pub struct InstRanges {
	373	/// The next location to execute in the program if this instruction
	374	/// succeeds.
	375	pub goto: InstPtr,
	376	/// The set of Unicode scalar value ranges to test.
cdc7bbd5	377	pub ranges: Box<[(char, char)]>,
8bb4bdeb XL	378	}
	379
	380	impl InstRanges {
	381	/// Tests whether the given input character matches this instruction.
	382	pub fn matches(&self, c: Char) -> bool {
	383	// This speeds up the `match_class_unicode` benchmark by checking
	384	// some common cases quickly without binary search. e.g., Matching
	385	// a Unicode class on predominantly ASCII text.
	386	for r in self.ranges.iter().take(4) {
	387	if c < r.0 {
	388	return false;
	389	}
	390	if c <= r.1 {
	391	return true;
	392	}
	393	}
f9f354fc XL	394	self.ranges
	395	.binary_search_by(\|r\| {
	396	if r.1 < c {
	397	Ordering::Less
	398	} else if r.0 > c {
	399	Ordering::Greater
	400	} else {
	401	Ordering::Equal
	402	}
	403	})
	404	.is_ok()
8bb4bdeb XL	405	}
	406
	407	/// Return the number of distinct characters represented by all of the
	408	/// ranges.
	409	pub fn num_chars(&self) -> usize {
f9f354fc XL	410	self.ranges
f9f354fc XL	411	.iter()
8bb4bdeb	412	.map(\|&(s, e)\| 1 + (e as u32) - (s as u32))
f9f354fc	413	.sum::<u32>() as usize
8bb4bdeb XL	414	}
	415	}
	416
	417	/// Representation of the Bytes instruction.
	418	#[derive(Clone, Debug)]
	419	pub struct InstBytes {
	420	/// The next location to execute in the program if this instruction
	421	/// succeeds.
	422	pub goto: InstPtr,
	423	/// The start (inclusive) of this byte range.
	424	pub start: u8,
	425	/// The end (inclusive) of this byte range.
	426	pub end: u8,
	427	}
	428
	429	impl InstBytes {
	430	/// Returns true if and only if the given byte is in this range.
	431	pub fn matches(&self, byte: u8) -> bool {
	432	self.start <= byte && byte <= self.end
	433	}
	434	}
cdc7bbd5 XL	435
	436	#[cfg(test)]
	437	mod test {
	438	#[test]
	439	#[cfg(target_pointer_width = "64")]
	440	fn test_size_of_inst() {
	441	use std::mem::size_of;
	442
	443	use super::Inst;
	444
	445	assert_eq!(32, size_of::<Inst>());
	446	}
	447	}