[rustc.git] / vendor / regex-0.2.11 / src / prog.rs

use std::collections::HashMap;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Deref;
use std::mem;
use std::slice;
use std::sync::Arc;

use input::Char;
use literal::LiteralSearcher;

/// `InstPtr` represents the index of an instruction in a regex program.
pub type InstPtr = usize;

/// Program is a sequence of instructions and various facts about thos
/// instructions.
#[derive(Clone)]
pub struct Program {
    /// A sequence of instructions that represents an NFA.
    pub insts: Vec<Inst>,
    /// Pointers to each Match instruction in the sequence.
    ///
    /// This is always length 1 unless this program represents a regex set.
    pub matches: Vec<InstPtr>,
    /// The ordered sequence of all capture groups extracted from the AST.
    /// Unnamed groups are `None`.
    pub captures: Vec<Option<String>>,
    /// Pointers to all named capture groups into `captures`.
    pub capture_name_idx: Arc<HashMap<String, usize>>,
    /// A pointer to the start instruction. This can vary depending on how
    /// the program was compiled. For example, programs for use with the DFA
    /// engine have a `.*?` inserted at the beginning of unanchored regular
    /// expressions. The actual starting point of the program is after the
    /// `.*?`.
    pub start: InstPtr,
    /// A set of equivalence classes for discriminating bytes in the compiled
    /// program.
    pub byte_classes: Vec<u8>,
    /// When true, this program can only match valid UTF-8.
    pub only_utf8: bool,
    /// When true, this program uses byte range instructions instead of Unicode
    /// range instructions.
    pub is_bytes: bool,
    /// When true, the program is compiled for DFA matching. For example, this
    /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
    /// regexes.
    pub is_dfa: bool,
    /// When true, the program matches text in reverse (for use only in the
    /// DFA).
    pub is_reverse: bool,
    /// Whether the regex must match from the start of the input.
    pub is_anchored_start: bool,
    /// Whether the regex must match at the end of the input.
    pub is_anchored_end: bool,
    /// Whether this program contains a Unicode word boundary instruction.
    pub has_unicode_word_boundary: bool,
    /// A possibly empty machine for very quickly matching prefix literals.
    pub prefixes: LiteralSearcher,
    /// A limit on the size of the cache that the DFA is allowed to use while
    /// matching.
    ///
    /// The cache limit specifies approximately how much space we're willing to
    /// give to the state cache. Once the state cache exceeds the size, it is
    /// wiped and all states must be re-computed.
    ///
    /// Note that this value does not impact correctness. It can be set to 0
    /// and the DFA will run just fine. (It will only ever store exactly one
    /// state in the cache, and will likely run very slowly, but it will work.)
    ///
    /// Also note that this limit is *per thread of execution*. That is,
    /// if the same regex is used to search text across multiple threads
    /// simultaneously, then the DFA cache is not shared. Instead, copies are
    /// made.
    pub dfa_size_limit: usize,
}

impl Program {
    /// Creates an empty instruction sequence. Fields are given default
    /// values.
    pub fn new() -> Self {
        Program {
            insts: vec![],
            matches: vec![],
            captures: vec![],
            capture_name_idx: Arc::new(HashMap::new()),
            start: 0,
            byte_classes: vec![0; 256],
            only_utf8: true,
            is_bytes: false,
            is_dfa: false,
            is_reverse: false,
            is_anchored_start: false,
            is_anchored_end: false,
            has_unicode_word_boundary: false,
            prefixes: LiteralSearcher::empty(),
            dfa_size_limit: 2 * (1<<20),
        }
    }

    /// If pc is an index to a no-op instruction (like Save), then return the
    /// next pc that is not a no-op instruction.
    pub fn skip(&self, mut pc: usize) -> usize {
        loop {
            match self[pc] {
                Inst::Save(ref i) => pc = i.goto,
                _ => return pc,
            }
        }
    }

    /// Return true if and only if an execution engine at instruction `pc` will
    /// always lead to a match.
    pub fn leads_to_match(&self, pc: usize) -> bool {
        if self.matches.len() > 1 {
            // If we have a regex set, then we have more than one ending
            // state, so leading to one of those states is generally
            // meaningless.
            return false;
        }
        match self[self.skip(pc)] {
            Inst::Match(_) => true,
            _ => false,
        }
    }

    /// Returns true if the current configuration demands that an implicit
    /// `.*?` be prepended to the instruction sequence.
    pub fn needs_dotstar(&self) -> bool {
        self.is_dfa && !self.is_reverse && !self.is_anchored_start
    }

    /// Returns true if this program uses Byte instructions instead of
    /// Char/Range instructions.
    pub fn uses_bytes(&self) -> bool {
        self.is_bytes || self.is_dfa
    }

    /// Returns true if this program exclusively matches valid UTF-8 bytes.
    ///
    /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
    pub fn only_utf8(&self) -> bool {
        self.only_utf8
    }

    /// Return the approximate heap usage of this instruction sequence in
    /// bytes.
    pub fn approximate_size(&self) -> usize {
        // The only instruction that uses heap space is Ranges (for
        // Unicode codepoint programs) to store non-overlapping codepoint
        // ranges. To keep this operation constant time, we ignore them.
        (self.len() * mem::size_of::<Inst>())
        + (self.matches.len() * mem::size_of::<InstPtr>())
        + (self.captures.len() * mem::size_of::<Option<String>>())
        + (self.capture_name_idx.len() *
           (mem::size_of::<String>() + mem::size_of::<usize>()))
        + (self.byte_classes.len() * mem::size_of::<u8>())
        + self.prefixes.approximate_size()
    }
}

impl Deref for Program {
    type Target = [Inst];

    fn deref(&self) -> &Self::Target {
        &*self.insts
    }
}

impl fmt::Debug for Program {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Inst::*;

        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
            if goto == cur + 1 {
                fmtd
            } else {
                format!("{} (goto: {})", fmtd, goto)
            }
        }

        fn visible_byte(b: u8) -> String {
            use std::ascii::escape_default;
            let escaped = escape_default(b).collect::<Vec<u8>>();
            String::from_utf8_lossy(&escaped).into_owned()
        }

        for (pc, inst) in self.iter().enumerate() {
            match *inst {
                Match(slot) => {
                    try!(write!(f, "{:04} Match({:?})", pc, slot))
                }
                Save(ref inst) => {
                    let s = format!("{:04} Save({})", pc, inst.slot);
                    try!(write!(f, "{}", with_goto(pc, inst.goto, s)));
                }
                Split(ref inst) => {
                    try!(write!(f, "{:04} Split({}, {})",
                                pc, inst.goto1, inst.goto2));
                }
                EmptyLook(ref inst) => {
                    let s = format!("{:?}", inst.look);
                    try!(write!(f, "{:04} {}",
                                pc, with_goto(pc, inst.goto, s)));
                }
                Char(ref inst) => {
                    let s = format!("{:?}", inst.c);
                    try!(write!(f, "{:04} {}",
                                pc, with_goto(pc, inst.goto, s)));
                }
                Ranges(ref inst) => {
                    let ranges = inst.ranges
                        .iter()
                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
                        .collect::<Vec<String>>()
                        .join(", ");
                    try!(write!(f, "{:04} {}",
                                pc, with_goto(pc, inst.goto, ranges)));
                }
                Bytes(ref inst) => {
                    let s = format!(
                        "Bytes({}, {})",
                        visible_byte(inst.start),
                        visible_byte(inst.end));
                    try!(write!(f, "{:04} {}",
                                pc, with_goto(pc, inst.goto, s)));
                }
            }
            if pc == self.start {
                try!(write!(f, " (start)"));
            }
            try!(write!(f, "\n"));
        }
        Ok(())
    }
}

impl<'a> IntoIterator for &'a Program {
    type Item = &'a Inst;
    type IntoIter = slice::Iter<'a, Inst>;
    fn into_iter(self) -> Self::IntoIter { self.iter() }
}

/// Inst is an instruction code in a Regex program.
///
/// Regrettably, a regex program either contains Unicode codepoint
/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
/// A regex program can never contain both.
///
/// It would be worth investigating splitting this into two distinct types and
/// then figuring out how to make the matching engines polymorphic over those
/// types without sacrificing performance.
///
/// Other than the benefit of moving invariants into the type system, another
/// benefit is the decreased size. If we remove the `Char` and `Ranges`
/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
/// variant.) Given that byte based machines are typically much bigger than
/// their Unicode analogues (because they can decode UTF-8 directly), this ends
/// up being a pretty significant savings.
#[derive(Clone, Debug)]
pub enum Inst {
    /// Match indicates that the program has reached a match state.
    ///
    /// The number in the match corresponds to the Nth logical regular
    /// expression in this program. This index is always 0 for normal regex
    /// programs. Values greater than 0 appear when compiling regex sets, and
    /// each match instruction gets its own unique value. The value corresponds
    /// to the Nth regex in the set.
    Match(usize),
    /// Save causes the program to save the current location of the input in
    /// the slot indicated by InstSave.
    Save(InstSave),
    /// Split causes the program to diverge to one of two paths in the
    /// program, preferring goto1 in InstSplit.
    Split(InstSplit),
    /// EmptyLook represents a zero-width assertion in a regex program. A
    /// zero-width assertion does not consume any of the input text.
    EmptyLook(InstEmptyLook),
    /// Char requires the regex program to match the character in InstChar at
    /// the current position in the input.
    Char(InstChar),
    /// Ranges requires the regex program to match the character at the current
    /// position in the input with one of the ranges specified in InstRanges.
    Ranges(InstRanges),
    /// Bytes is like Ranges, except it expresses a single byte range. It is
    /// used in conjunction with Split instructions to implement multi-byte
    /// character classes.
    Bytes(InstBytes),
}

impl Inst {
    /// Returns true if and only if this is a match instruction.
    pub fn is_match(&self) -> bool {
        match *self {
            Inst::Match(_) => true,
            _ => false,
        }
    }
}

/// Representation of the Save instruction.
#[derive(Clone, Debug)]
pub struct InstSave {
    /// The next location to execute in the program.
    pub goto: InstPtr,
    /// The capture slot (there are two slots for every capture in a regex,
    /// including the zeroth capture for the entire match).
    pub slot: usize,
}

/// Representation of the Split instruction.
#[derive(Clone, Debug)]
pub struct InstSplit {
    /// The first instruction to try. A match resulting from following goto1
    /// has precedence over a match resulting from following goto2.
    pub goto1: InstPtr,
    /// The second instruction to try. A match resulting from following goto1
    /// has precedence over a match resulting from following goto2.
    pub goto2: InstPtr,
}

/// Representation of the `EmptyLook` instruction.
#[derive(Clone, Debug)]
pub struct InstEmptyLook {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The type of zero-width assertion to check.
    pub look: EmptyLook,
}

/// The set of zero-width match instructions.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EmptyLook {
    /// Start of line or input.
    StartLine,
    /// End of line or input.
    EndLine,
    /// Start of input.
    StartText,
    /// End of input.
    EndText,
    /// Word character on one side and non-word character on other.
    WordBoundary,
    /// Word character on both sides or non-word character on both sides.
    NotWordBoundary,
    /// ASCII word boundary.
    WordBoundaryAscii,
    /// Not ASCII word boundary.
    NotWordBoundaryAscii,
}

/// Representation of the Char instruction.
#[derive(Clone, Debug)]
pub struct InstChar {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The character to test.
    pub c: char,
}

/// Representation of the Ranges instruction.
#[derive(Clone, Debug)]
pub struct InstRanges {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The set of Unicode scalar value ranges to test.
    pub ranges: Vec<(char, char)>,
}

impl InstRanges {
    /// Tests whether the given input character matches this instruction.
    pub fn matches(&self, c: Char) -> bool {
        // This speeds up the `match_class_unicode` benchmark by checking
        // some common cases quickly without binary search. e.g., Matching
        // a Unicode class on predominantly ASCII text.
        for r in self.ranges.iter().take(4) {
            if c < r.0 {
                return false;
            }
            if c <= r.1 {
                return true;
            }
        }
        self.ranges.binary_search_by(|r| {
            if r.1 < c {
                Ordering::Less
            } else if r.0 > c {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        }).is_ok()
    }

    /// Return the number of distinct characters represented by all of the
    /// ranges.
    pub fn num_chars(&self) -> usize {
        self.ranges.iter()
            .map(|&(s, e)| 1 + (e as u32) - (s as u32))
            .fold(0, |acc, len| acc + len)
            as usize
    }
}

/// Representation of the Bytes instruction.
#[derive(Clone, Debug)]
pub struct InstBytes {
    /// The next location to execute in the program if this instruction
    /// succeeds.
    pub goto: InstPtr,
    /// The start (inclusive) of this byte range.
    pub start: u8,
    /// The end (inclusive) of this byte range.
    pub end: u8,
}

impl InstBytes {
    /// Returns true if and only if the given byte is in this range.
    pub fn matches(&self, byte: u8) -> bool {
        self.start <= byte && byte <= self.end
    }
}
Commit	Line	Data
94b46f34 XL	1	use std::collections::HashMap;
	2	use std::cmp::Ordering;
	3	use std::fmt;
	4	use std::ops::Deref;
	5	use std::mem;
	6	use std::slice;
	7	use std::sync::Arc;
	8
	9	use input::Char;
	10	use literal::LiteralSearcher;
	11
	12	/// `InstPtr` represents the index of an instruction in a regex program.
	13	pub type InstPtr = usize;
	14
	15	/// Program is a sequence of instructions and various facts about thos
	16	/// instructions.
	17	#[derive(Clone)]
	18	pub struct Program {
	19	/// A sequence of instructions that represents an NFA.
	20	pub insts: Vec<Inst>,
	21	/// Pointers to each Match instruction in the sequence.
	22	///
	23	/// This is always length 1 unless this program represents a regex set.
	24	pub matches: Vec<InstPtr>,
	25	/// The ordered sequence of all capture groups extracted from the AST.
	26	/// Unnamed groups are `None`.
	27	pub captures: Vec<Option<String>>,
	28	/// Pointers to all named capture groups into `captures`.
	29	pub capture_name_idx: Arc<HashMap<String, usize>>,
	30	/// A pointer to the start instruction. This can vary depending on how
	31	/// the program was compiled. For example, programs for use with the DFA
	32	/// engine have a `.*?` inserted at the beginning of unanchored regular
	33	/// expressions. The actual starting point of the program is after the
	34	/// `.*?`.
	35	pub start: InstPtr,
	36	/// A set of equivalence classes for discriminating bytes in the compiled
	37	/// program.
	38	pub byte_classes: Vec<u8>,
	39	/// When true, this program can only match valid UTF-8.
	40	pub only_utf8: bool,
	41	/// When true, this program uses byte range instructions instead of Unicode
	42	/// range instructions.
	43	pub is_bytes: bool,
	44	/// When true, the program is compiled for DFA matching. For example, this
	45	/// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
	46	/// regexes.
	47	pub is_dfa: bool,
	48	/// When true, the program matches text in reverse (for use only in the
	49	/// DFA).
	50	pub is_reverse: bool,
	51	/// Whether the regex must match from the start of the input.
	52	pub is_anchored_start: bool,
	53	/// Whether the regex must match at the end of the input.
	54	pub is_anchored_end: bool,
	55	/// Whether this program contains a Unicode word boundary instruction.
	56	pub has_unicode_word_boundary: bool,
	57	/// A possibly empty machine for very quickly matching prefix literals.
	58	pub prefixes: LiteralSearcher,
	59	/// A limit on the size of the cache that the DFA is allowed to use while
	60	/// matching.
	61	///
	62	/// The cache limit specifies approximately how much space we're willing to
	63	/// give to the state cache. Once the state cache exceeds the size, it is
	64	/// wiped and all states must be re-computed.
65	///
66	/// Note that this value does not impact correctness. It can be set to 0
67	/// and the DFA will run just fine. (It will only ever store exactly one
68	/// state in the cache, and will likely run very slowly, but it will work.)
69	///
70	/// Also note that this limit is per thread of execution. That is,
71	/// if the same regex is used to search text across multiple threads
72	/// simultaneously, then the DFA cache is not shared. Instead, copies are
73	/// made.
74	pub dfa_size_limit: usize,
75	}
76
77	impl Program {
78	/// Creates an empty instruction sequence. Fields are given default
79	/// values.
80	pub fn new() -> Self {
81	Program {
82	insts: vec![],
83	matches: vec![],
84	captures: vec![],
85	capture_name_idx: Arc::new(HashMap::new()),
86	start: 0,
87	byte_classes: vec![0; 256],
88	only_utf8: true,
89	is_bytes: false,
90	is_dfa: false,
91	is_reverse: false,
92	is_anchored_start: false,
93	is_anchored_end: false,
94	has_unicode_word_boundary: false,
95	prefixes: LiteralSearcher::empty(),
96	dfa_size_limit: 2 * (1<<20),
97	}
98	}
99
100	/// If pc is an index to a no-op instruction (like Save), then return the
101	/// next pc that is not a no-op instruction.
102	pub fn skip(&self, mut pc: usize) -> usize {
103	loop {
104	match self[pc] {
105	Inst::Save(ref i) => pc = i.goto,
106	_ => return pc,
107	}
108	}
109	}
110
111	/// Return true if and only if an execution engine at instruction `pc` will
112	/// always lead to a match.
113	pub fn leads_to_match(&self, pc: usize) -> bool {
114	if self.matches.len() > 1 {
115	// If we have a regex set, then we have more than one ending
116	// state, so leading to one of those states is generally
117	// meaningless.
118	return false;
119	}
120	match self[self.skip(pc)] {
121	Inst::Match(_) => true,
122	_ => false,
123	}
124	}
125
126	/// Returns true if the current configuration demands that an implicit
127	/// `.*?` be prepended to the instruction sequence.
128	pub fn needs_dotstar(&self) -> bool {
129	self.is_dfa && !self.is_reverse && !self.is_anchored_start
130	}
131
132	/// Returns true if this program uses Byte instructions instead of
133	/// Char/Range instructions.
134	pub fn uses_bytes(&self) -> bool {
135	self.is_bytes \|\| self.is_dfa
136	}
137
138	/// Returns true if this program exclusively matches valid UTF-8 bytes.
139	///
140	/// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141	pub fn only_utf8(&self) -> bool {
142	self.only_utf8
143	}
144
145	/// Return the approximate heap usage of this instruction sequence in
146	/// bytes.
147	pub fn approximate_size(&self) -> usize {
148	// The only instruction that uses heap space is Ranges (for
149	// Unicode codepoint programs) to store non-overlapping codepoint
150	// ranges. To keep this operation constant time, we ignore them.
151	(self.len() * mem::size_of::<Inst>())
152	+ (self.matches.len() * mem::size_of::<InstPtr>())
153	+ (self.captures.len() * mem::size_of::<Option<String>>())
154	+ (self.capture_name_idx.len() *
155	(mem::size_of::<String>() + mem::size_of::<usize>()))
156	+ (self.byte_classes.len() * mem::size_of::<u8>())
157	+ self.prefixes.approximate_size()
158	}
159	}
160
161	impl Deref for Program {
162	type Target = [Inst];
163
164	fn deref(&self) -> &Self::Target {
165	&*self.insts
166	}
167	}
168
169	impl fmt::Debug for Program {
170	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
171	use self::Inst::*;
172
173	fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
174	if goto == cur + 1 {
175	fmtd
176	} else {
177	format!("{} (goto: {})", fmtd, goto)
178	}
179	}
180
181	fn visible_byte(b: u8) -> String {
182	use std::ascii::escape_default;
183	let escaped = escape_default(b).collect::<Vec<u8>>();
184	String::from_utf8_lossy(&escaped).into_owned()
185	}
186
187	for (pc, inst) in self.iter().enumerate() {
188	match *inst {
189	Match(slot) => {
190	try!(write!(f, "{:04} Match({:?})", pc, slot))
191	}
192	Save(ref inst) => {
193	let s = format!("{:04} Save({})", pc, inst.slot);
194	try!(write!(f, "{}", with_goto(pc, inst.goto, s)));
195	}
196	Split(ref inst) => {
197	try!(write!(f, "{:04} Split({}, {})",
198	pc, inst.goto1, inst.goto2));
199	}
200	EmptyLook(ref inst) => {
201	let s = format!("{:?}", inst.look);
202	try!(write!(f, "{:04} {}",
203	pc, with_goto(pc, inst.goto, s)));
204	}
205	Char(ref inst) => {
206	let s = format!("{:?}", inst.c);
207	try!(write!(f, "{:04} {}",
208	pc, with_goto(pc, inst.goto, s)));
209	}
210	Ranges(ref inst) => {
211	let ranges = inst.ranges
212	.iter()
213	.map(\|r\| format!("{:?}-{:?}", r.0, r.1))
214	.collect::<Vec<String>>()
215	.join(", ");
216	try!(write!(f, "{:04} {}",
217	pc, with_goto(pc, inst.goto, ranges)));
218	}
219	Bytes(ref inst) => {
220	let s = format!(
221	"Bytes({}, {})",
222	visible_byte(inst.start),
223	visible_byte(inst.end));
224	try!(write!(f, "{:04} {}",
225	pc, with_goto(pc, inst.goto, s)));
226	}
227	}
228	if pc == self.start {
229	try!(write!(f, " (start)"));
230	}
231	try!(write!(f, "\n"));
232	}
233	Ok(())
234	}
235	}
236
237	impl<'a> IntoIterator for &'a Program {
238	type Item = &'a Inst;
239	type IntoIter = slice::Iter<'a, Inst>;
240	fn into_iter(self) -> Self::IntoIter { self.iter() }
241	}
242
243	/// Inst is an instruction code in a Regex program.
244	///
245	/// Regrettably, a regex program either contains Unicode codepoint
246	/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
247	/// A regex program can never contain both.
248	///
249	/// It would be worth investigating splitting this into two distinct types and
250	/// then figuring out how to make the matching engines polymorphic over those
251	/// types without sacrificing performance.
252	///
253	/// Other than the benefit of moving invariants into the type system, another
254	/// benefit is the decreased size. If we remove the `Char` and `Ranges`
255	/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
256	/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
257	/// variant.) Given that byte based machines are typically much bigger than
258	/// their Unicode analogues (because they can decode UTF-8 directly), this ends
259	/// up being a pretty significant savings.
260	#[derive(Clone, Debug)]
261	pub enum Inst {
262	/// Match indicates that the program has reached a match state.
263	///
264	/// The number in the match corresponds to the Nth logical regular
265	/// expression in this program. This index is always 0 for normal regex
266	/// programs. Values greater than 0 appear when compiling regex sets, and
267	/// each match instruction gets its own unique value. The value corresponds
268	/// to the Nth regex in the set.
269	Match(usize),
270	/// Save causes the program to save the current location of the input in
271	/// the slot indicated by InstSave.
272	Save(InstSave),
273	/// Split causes the program to diverge to one of two paths in the
274	/// program, preferring goto1 in InstSplit.
275	Split(InstSplit),
276	/// EmptyLook represents a zero-width assertion in a regex program. A
277	/// zero-width assertion does not consume any of the input text.
278	EmptyLook(InstEmptyLook),
279	/// Char requires the regex program to match the character in InstChar at
280	/// the current position in the input.
281	Char(InstChar),
282	/// Ranges requires the regex program to match the character at the current
283	/// position in the input with one of the ranges specified in InstRanges.
284	Ranges(InstRanges),
285	/// Bytes is like Ranges, except it expresses a single byte range. It is
286	/// used in conjunction with Split instructions to implement multi-byte
287	/// character classes.
288	Bytes(InstBytes),
289	}
290
291	impl Inst {
292	/// Returns true if and only if this is a match instruction.
293	pub fn is_match(&self) -> bool {
294	match *self {
295	Inst::Match(_) => true,
296	_ => false,
297	}
298	}
299	}
300
301	/// Representation of the Save instruction.
302	#[derive(Clone, Debug)]
303	pub struct InstSave {
304	/// The next location to execute in the program.
305	pub goto: InstPtr,
306	/// The capture slot (there are two slots for every capture in a regex,
307	/// including the zeroth capture for the entire match).
308	pub slot: usize,
309	}
310
311	/// Representation of the Split instruction.
312	#[derive(Clone, Debug)]
313	pub struct InstSplit {
314	/// The first instruction to try. A match resulting from following goto1
315	/// has precedence over a match resulting from following goto2.
316	pub goto1: InstPtr,
317	/// The second instruction to try. A match resulting from following goto1
318	/// has precedence over a match resulting from following goto2.
319	pub goto2: InstPtr,
320	}
321
322	/// Representation of the `EmptyLook` instruction.
323	#[derive(Clone, Debug)]
324	pub struct InstEmptyLook {
325	/// The next location to execute in the program if this instruction
326	/// succeeds.
327	pub goto: InstPtr,
328	/// The type of zero-width assertion to check.
329	pub look: EmptyLook,
330	}
331
332	/// The set of zero-width match instructions.
333	#[derive(Clone, Copy, Debug, PartialEq, Eq)]
334	pub enum EmptyLook {
335	/// Start of line or input.
336	StartLine,
337	/// End of line or input.
338	EndLine,
339	/// Start of input.
340	StartText,
341	/// End of input.
342	EndText,
343	/// Word character on one side and non-word character on other.
344	WordBoundary,
345	/// Word character on both sides or non-word character on both sides.
346	NotWordBoundary,
347	/// ASCII word boundary.
348	WordBoundaryAscii,
349	/// Not ASCII word boundary.
350	NotWordBoundaryAscii,
351	}
352
353	/// Representation of the Char instruction.
354	#[derive(Clone, Debug)]
355	pub struct InstChar {
356	/// The next location to execute in the program if this instruction
357	/// succeeds.
358	pub goto: InstPtr,
359	/// The character to test.
360	pub c: char,
361	}
362
363	/// Representation of the Ranges instruction.
364	#[derive(Clone, Debug)]
365	pub struct InstRanges {
366	/// The next location to execute in the program if this instruction
367	/// succeeds.
368	pub goto: InstPtr,
369	/// The set of Unicode scalar value ranges to test.
370	pub ranges: Vec<(char, char)>,
371	}
372
373	impl InstRanges {
374	/// Tests whether the given input character matches this instruction.
375	pub fn matches(&self, c: Char) -> bool {
376	// This speeds up the `match_class_unicode` benchmark by checking
377	// some common cases quickly without binary search. e.g., Matching
378	// a Unicode class on predominantly ASCII text.
379	for r in self.ranges.iter().take(4) {
380	if c < r.0 {
381	return false;
382	}
383	if c <= r.1 {
384	return true;
385	}
386	}
387	self.ranges.binary_search_by(\|r\| {
388	if r.1 < c {
389	Ordering::Less
390	} else if r.0 > c {
391	Ordering::Greater
392	} else {
393	Ordering::Equal
394	}
395	}).is_ok()
396	}
397
398	/// Return the number of distinct characters represented by all of the
399	/// ranges.
400	pub fn num_chars(&self) -> usize {
401	self.ranges.iter()
402	.map(\|&(s, e)\| 1 + (e as u32) - (s as u32))
403	.fold(0, \|acc, len\| acc + len)
404	as usize
405	}
406	}
407
408	/// Representation of the Bytes instruction.
409	#[derive(Clone, Debug)]
410	pub struct InstBytes {
411	/// The next location to execute in the program if this instruction
412	/// succeeds.
413	pub goto: InstPtr,
414	/// The start (inclusive) of this byte range.
415	pub start: u8,
416	/// The end (inclusive) of this byte range.
417	pub end: u8,
418	}
419
420	impl InstBytes {
421	/// Returns true if and only if the given byte is in this range.
422	pub fn matches(&self, byte: u8) -> bool {
423	self.start <= byte && byte <= self.end
424	}
425	}