]> git.proxmox.com Git - rustc.git/blame - vendor/regex-0.2.11/src/prog.rs
New upstream version 1.33.0+dfsg1
[rustc.git] / vendor / regex-0.2.11 / src / prog.rs
CommitLineData
94b46f34
XL
1use std::collections::HashMap;
2use std::cmp::Ordering;
3use std::fmt;
4use std::ops::Deref;
5use std::mem;
6use std::slice;
7use std::sync::Arc;
8
9use input::Char;
10use literal::LiteralSearcher;
11
12/// `InstPtr` represents the index of an instruction in a regex program.
13pub type InstPtr = usize;
14
15/// Program is a sequence of instructions and various facts about thos
16/// instructions.
17#[derive(Clone)]
18pub struct Program {
19 /// A sequence of instructions that represents an NFA.
20 pub insts: Vec<Inst>,
21 /// Pointers to each Match instruction in the sequence.
22 ///
23 /// This is always length 1 unless this program represents a regex set.
24 pub matches: Vec<InstPtr>,
25 /// The ordered sequence of all capture groups extracted from the AST.
26 /// Unnamed groups are `None`.
27 pub captures: Vec<Option<String>>,
28 /// Pointers to all named capture groups into `captures`.
29 pub capture_name_idx: Arc<HashMap<String, usize>>,
30 /// A pointer to the start instruction. This can vary depending on how
31 /// the program was compiled. For example, programs for use with the DFA
32 /// engine have a `.*?` inserted at the beginning of unanchored regular
33 /// expressions. The actual starting point of the program is after the
34 /// `.*?`.
35 pub start: InstPtr,
36 /// A set of equivalence classes for discriminating bytes in the compiled
37 /// program.
38 pub byte_classes: Vec<u8>,
39 /// When true, this program can only match valid UTF-8.
40 pub only_utf8: bool,
41 /// When true, this program uses byte range instructions instead of Unicode
42 /// range instructions.
43 pub is_bytes: bool,
44 /// When true, the program is compiled for DFA matching. For example, this
45 /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
46 /// regexes.
47 pub is_dfa: bool,
48 /// When true, the program matches text in reverse (for use only in the
49 /// DFA).
50 pub is_reverse: bool,
51 /// Whether the regex must match from the start of the input.
52 pub is_anchored_start: bool,
53 /// Whether the regex must match at the end of the input.
54 pub is_anchored_end: bool,
55 /// Whether this program contains a Unicode word boundary instruction.
56 pub has_unicode_word_boundary: bool,
57 /// A possibly empty machine for very quickly matching prefix literals.
58 pub prefixes: LiteralSearcher,
59 /// A limit on the size of the cache that the DFA is allowed to use while
60 /// matching.
61 ///
62 /// The cache limit specifies approximately how much space we're willing to
63 /// give to the state cache. Once the state cache exceeds the size, it is
64 /// wiped and all states must be re-computed.
65 ///
66 /// Note that this value does not impact correctness. It can be set to 0
67 /// and the DFA will run just fine. (It will only ever store exactly one
68 /// state in the cache, and will likely run very slowly, but it will work.)
69 ///
70 /// Also note that this limit is *per thread of execution*. That is,
71 /// if the same regex is used to search text across multiple threads
72 /// simultaneously, then the DFA cache is not shared. Instead, copies are
73 /// made.
74 pub dfa_size_limit: usize,
75}
76
77impl Program {
78 /// Creates an empty instruction sequence. Fields are given default
79 /// values.
80 pub fn new() -> Self {
81 Program {
82 insts: vec![],
83 matches: vec![],
84 captures: vec![],
85 capture_name_idx: Arc::new(HashMap::new()),
86 start: 0,
87 byte_classes: vec![0; 256],
88 only_utf8: true,
89 is_bytes: false,
90 is_dfa: false,
91 is_reverse: false,
92 is_anchored_start: false,
93 is_anchored_end: false,
94 has_unicode_word_boundary: false,
95 prefixes: LiteralSearcher::empty(),
96 dfa_size_limit: 2 * (1<<20),
97 }
98 }
99
100 /// If pc is an index to a no-op instruction (like Save), then return the
101 /// next pc that is not a no-op instruction.
102 pub fn skip(&self, mut pc: usize) -> usize {
103 loop {
104 match self[pc] {
105 Inst::Save(ref i) => pc = i.goto,
106 _ => return pc,
107 }
108 }
109 }
110
111 /// Return true if and only if an execution engine at instruction `pc` will
112 /// always lead to a match.
113 pub fn leads_to_match(&self, pc: usize) -> bool {
114 if self.matches.len() > 1 {
115 // If we have a regex set, then we have more than one ending
116 // state, so leading to one of those states is generally
117 // meaningless.
118 return false;
119 }
120 match self[self.skip(pc)] {
121 Inst::Match(_) => true,
122 _ => false,
123 }
124 }
125
126 /// Returns true if the current configuration demands that an implicit
127 /// `.*?` be prepended to the instruction sequence.
128 pub fn needs_dotstar(&self) -> bool {
129 self.is_dfa && !self.is_reverse && !self.is_anchored_start
130 }
131
132 /// Returns true if this program uses Byte instructions instead of
133 /// Char/Range instructions.
134 pub fn uses_bytes(&self) -> bool {
135 self.is_bytes || self.is_dfa
136 }
137
138 /// Returns true if this program exclusively matches valid UTF-8 bytes.
139 ///
140 /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141 pub fn only_utf8(&self) -> bool {
142 self.only_utf8
143 }
144
145 /// Return the approximate heap usage of this instruction sequence in
146 /// bytes.
147 pub fn approximate_size(&self) -> usize {
148 // The only instruction that uses heap space is Ranges (for
149 // Unicode codepoint programs) to store non-overlapping codepoint
150 // ranges. To keep this operation constant time, we ignore them.
151 (self.len() * mem::size_of::<Inst>())
152 + (self.matches.len() * mem::size_of::<InstPtr>())
153 + (self.captures.len() * mem::size_of::<Option<String>>())
154 + (self.capture_name_idx.len() *
155 (mem::size_of::<String>() + mem::size_of::<usize>()))
156 + (self.byte_classes.len() * mem::size_of::<u8>())
157 + self.prefixes.approximate_size()
158 }
159}
160
161impl Deref for Program {
162 type Target = [Inst];
163
164 fn deref(&self) -> &Self::Target {
165 &*self.insts
166 }
167}
168
169impl fmt::Debug for Program {
170 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
171 use self::Inst::*;
172
173 fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
174 if goto == cur + 1 {
175 fmtd
176 } else {
177 format!("{} (goto: {})", fmtd, goto)
178 }
179 }
180
181 fn visible_byte(b: u8) -> String {
182 use std::ascii::escape_default;
183 let escaped = escape_default(b).collect::<Vec<u8>>();
184 String::from_utf8_lossy(&escaped).into_owned()
185 }
186
187 for (pc, inst) in self.iter().enumerate() {
188 match *inst {
189 Match(slot) => {
190 try!(write!(f, "{:04} Match({:?})", pc, slot))
191 }
192 Save(ref inst) => {
193 let s = format!("{:04} Save({})", pc, inst.slot);
194 try!(write!(f, "{}", with_goto(pc, inst.goto, s)));
195 }
196 Split(ref inst) => {
197 try!(write!(f, "{:04} Split({}, {})",
198 pc, inst.goto1, inst.goto2));
199 }
200 EmptyLook(ref inst) => {
201 let s = format!("{:?}", inst.look);
202 try!(write!(f, "{:04} {}",
203 pc, with_goto(pc, inst.goto, s)));
204 }
205 Char(ref inst) => {
206 let s = format!("{:?}", inst.c);
207 try!(write!(f, "{:04} {}",
208 pc, with_goto(pc, inst.goto, s)));
209 }
210 Ranges(ref inst) => {
211 let ranges = inst.ranges
212 .iter()
213 .map(|r| format!("{:?}-{:?}", r.0, r.1))
214 .collect::<Vec<String>>()
215 .join(", ");
216 try!(write!(f, "{:04} {}",
217 pc, with_goto(pc, inst.goto, ranges)));
218 }
219 Bytes(ref inst) => {
220 let s = format!(
221 "Bytes({}, {})",
222 visible_byte(inst.start),
223 visible_byte(inst.end));
224 try!(write!(f, "{:04} {}",
225 pc, with_goto(pc, inst.goto, s)));
226 }
227 }
228 if pc == self.start {
229 try!(write!(f, " (start)"));
230 }
231 try!(write!(f, "\n"));
232 }
233 Ok(())
234 }
235}
236
237impl<'a> IntoIterator for &'a Program {
238 type Item = &'a Inst;
239 type IntoIter = slice::Iter<'a, Inst>;
240 fn into_iter(self) -> Self::IntoIter { self.iter() }
241}
242
243/// Inst is an instruction code in a Regex program.
244///
245/// Regrettably, a regex program either contains Unicode codepoint
246/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
247/// A regex program can never contain both.
248///
249/// It would be worth investigating splitting this into two distinct types and
250/// then figuring out how to make the matching engines polymorphic over those
251/// types without sacrificing performance.
252///
253/// Other than the benefit of moving invariants into the type system, another
254/// benefit is the decreased size. If we remove the `Char` and `Ranges`
255/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
256/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
257/// variant.) Given that byte based machines are typically much bigger than
258/// their Unicode analogues (because they can decode UTF-8 directly), this ends
259/// up being a pretty significant savings.
260#[derive(Clone, Debug)]
261pub enum Inst {
262 /// Match indicates that the program has reached a match state.
263 ///
264 /// The number in the match corresponds to the Nth logical regular
265 /// expression in this program. This index is always 0 for normal regex
266 /// programs. Values greater than 0 appear when compiling regex sets, and
267 /// each match instruction gets its own unique value. The value corresponds
268 /// to the Nth regex in the set.
269 Match(usize),
270 /// Save causes the program to save the current location of the input in
271 /// the slot indicated by InstSave.
272 Save(InstSave),
273 /// Split causes the program to diverge to one of two paths in the
274 /// program, preferring goto1 in InstSplit.
275 Split(InstSplit),
276 /// EmptyLook represents a zero-width assertion in a regex program. A
277 /// zero-width assertion does not consume any of the input text.
278 EmptyLook(InstEmptyLook),
279 /// Char requires the regex program to match the character in InstChar at
280 /// the current position in the input.
281 Char(InstChar),
282 /// Ranges requires the regex program to match the character at the current
283 /// position in the input with one of the ranges specified in InstRanges.
284 Ranges(InstRanges),
285 /// Bytes is like Ranges, except it expresses a single byte range. It is
286 /// used in conjunction with Split instructions to implement multi-byte
287 /// character classes.
288 Bytes(InstBytes),
289}
290
291impl Inst {
292 /// Returns true if and only if this is a match instruction.
293 pub fn is_match(&self) -> bool {
294 match *self {
295 Inst::Match(_) => true,
296 _ => false,
297 }
298 }
299}
300
301/// Representation of the Save instruction.
302#[derive(Clone, Debug)]
303pub struct InstSave {
304 /// The next location to execute in the program.
305 pub goto: InstPtr,
306 /// The capture slot (there are two slots for every capture in a regex,
307 /// including the zeroth capture for the entire match).
308 pub slot: usize,
309}
310
311/// Representation of the Split instruction.
312#[derive(Clone, Debug)]
313pub struct InstSplit {
314 /// The first instruction to try. A match resulting from following goto1
315 /// has precedence over a match resulting from following goto2.
316 pub goto1: InstPtr,
317 /// The second instruction to try. A match resulting from following goto1
318 /// has precedence over a match resulting from following goto2.
319 pub goto2: InstPtr,
320}
321
322/// Representation of the `EmptyLook` instruction.
323#[derive(Clone, Debug)]
324pub struct InstEmptyLook {
325 /// The next location to execute in the program if this instruction
326 /// succeeds.
327 pub goto: InstPtr,
328 /// The type of zero-width assertion to check.
329 pub look: EmptyLook,
330}
331
332/// The set of zero-width match instructions.
333#[derive(Clone, Copy, Debug, PartialEq, Eq)]
334pub enum EmptyLook {
335 /// Start of line or input.
336 StartLine,
337 /// End of line or input.
338 EndLine,
339 /// Start of input.
340 StartText,
341 /// End of input.
342 EndText,
343 /// Word character on one side and non-word character on other.
344 WordBoundary,
345 /// Word character on both sides or non-word character on both sides.
346 NotWordBoundary,
347 /// ASCII word boundary.
348 WordBoundaryAscii,
349 /// Not ASCII word boundary.
350 NotWordBoundaryAscii,
351}
352
353/// Representation of the Char instruction.
354#[derive(Clone, Debug)]
355pub struct InstChar {
356 /// The next location to execute in the program if this instruction
357 /// succeeds.
358 pub goto: InstPtr,
359 /// The character to test.
360 pub c: char,
361}
362
363/// Representation of the Ranges instruction.
364#[derive(Clone, Debug)]
365pub struct InstRanges {
366 /// The next location to execute in the program if this instruction
367 /// succeeds.
368 pub goto: InstPtr,
369 /// The set of Unicode scalar value ranges to test.
370 pub ranges: Vec<(char, char)>,
371}
372
373impl InstRanges {
374 /// Tests whether the given input character matches this instruction.
375 pub fn matches(&self, c: Char) -> bool {
376 // This speeds up the `match_class_unicode` benchmark by checking
377 // some common cases quickly without binary search. e.g., Matching
378 // a Unicode class on predominantly ASCII text.
379 for r in self.ranges.iter().take(4) {
380 if c < r.0 {
381 return false;
382 }
383 if c <= r.1 {
384 return true;
385 }
386 }
387 self.ranges.binary_search_by(|r| {
388 if r.1 < c {
389 Ordering::Less
390 } else if r.0 > c {
391 Ordering::Greater
392 } else {
393 Ordering::Equal
394 }
395 }).is_ok()
396 }
397
398 /// Return the number of distinct characters represented by all of the
399 /// ranges.
400 pub fn num_chars(&self) -> usize {
401 self.ranges.iter()
402 .map(|&(s, e)| 1 + (e as u32) - (s as u32))
403 .fold(0, |acc, len| acc + len)
404 as usize
405 }
406}
407
408/// Representation of the Bytes instruction.
409#[derive(Clone, Debug)]
410pub struct InstBytes {
411 /// The next location to execute in the program if this instruction
412 /// succeeds.
413 pub goto: InstPtr,
414 /// The start (inclusive) of this byte range.
415 pub start: u8,
416 /// The end (inclusive) of this byte range.
417 pub end: u8,
418}
419
420impl InstBytes {
421 /// Returns true if and only if the given byte is in this range.
422 pub fn matches(&self, byte: u8) -> bool {
423 self.start <= byte && byte <= self.end
424 }
425}