]>
Commit | Line | Data |
---|---|---|
94b46f34 XL |
1 | use std::collections::HashMap; |
2 | use std::cmp::Ordering; | |
3 | use std::fmt; | |
4 | use std::ops::Deref; | |
5 | use std::mem; | |
6 | use std::slice; | |
7 | use std::sync::Arc; | |
8 | ||
9 | use input::Char; | |
10 | use literal::LiteralSearcher; | |
11 | ||
12 | /// `InstPtr` represents the index of an instruction in a regex program. | |
13 | pub type InstPtr = usize; | |
14 | ||
15 | /// Program is a sequence of instructions and various facts about thos | |
16 | /// instructions. | |
17 | #[derive(Clone)] | |
18 | pub struct Program { | |
19 | /// A sequence of instructions that represents an NFA. | |
20 | pub insts: Vec<Inst>, | |
21 | /// Pointers to each Match instruction in the sequence. | |
22 | /// | |
23 | /// This is always length 1 unless this program represents a regex set. | |
24 | pub matches: Vec<InstPtr>, | |
25 | /// The ordered sequence of all capture groups extracted from the AST. | |
26 | /// Unnamed groups are `None`. | |
27 | pub captures: Vec<Option<String>>, | |
28 | /// Pointers to all named capture groups into `captures`. | |
29 | pub capture_name_idx: Arc<HashMap<String, usize>>, | |
30 | /// A pointer to the start instruction. This can vary depending on how | |
31 | /// the program was compiled. For example, programs for use with the DFA | |
32 | /// engine have a `.*?` inserted at the beginning of unanchored regular | |
33 | /// expressions. The actual starting point of the program is after the | |
34 | /// `.*?`. | |
35 | pub start: InstPtr, | |
36 | /// A set of equivalence classes for discriminating bytes in the compiled | |
37 | /// program. | |
38 | pub byte_classes: Vec<u8>, | |
39 | /// When true, this program can only match valid UTF-8. | |
40 | pub only_utf8: bool, | |
41 | /// When true, this program uses byte range instructions instead of Unicode | |
42 | /// range instructions. | |
43 | pub is_bytes: bool, | |
44 | /// When true, the program is compiled for DFA matching. For example, this | |
45 | /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored | |
46 | /// regexes. | |
47 | pub is_dfa: bool, | |
48 | /// When true, the program matches text in reverse (for use only in the | |
49 | /// DFA). | |
50 | pub is_reverse: bool, | |
51 | /// Whether the regex must match from the start of the input. | |
52 | pub is_anchored_start: bool, | |
53 | /// Whether the regex must match at the end of the input. | |
54 | pub is_anchored_end: bool, | |
55 | /// Whether this program contains a Unicode word boundary instruction. | |
56 | pub has_unicode_word_boundary: bool, | |
57 | /// A possibly empty machine for very quickly matching prefix literals. | |
58 | pub prefixes: LiteralSearcher, | |
59 | /// A limit on the size of the cache that the DFA is allowed to use while | |
60 | /// matching. | |
61 | /// | |
62 | /// The cache limit specifies approximately how much space we're willing to | |
63 | /// give to the state cache. Once the state cache exceeds the size, it is | |
64 | /// wiped and all states must be re-computed. | |
65 | /// | |
66 | /// Note that this value does not impact correctness. It can be set to 0 | |
67 | /// and the DFA will run just fine. (It will only ever store exactly one | |
68 | /// state in the cache, and will likely run very slowly, but it will work.) | |
69 | /// | |
70 | /// Also note that this limit is *per thread of execution*. That is, | |
71 | /// if the same regex is used to search text across multiple threads | |
72 | /// simultaneously, then the DFA cache is not shared. Instead, copies are | |
73 | /// made. | |
74 | pub dfa_size_limit: usize, | |
75 | } | |
76 | ||
77 | impl Program { | |
78 | /// Creates an empty instruction sequence. Fields are given default | |
79 | /// values. | |
80 | pub fn new() -> Self { | |
81 | Program { | |
82 | insts: vec![], | |
83 | matches: vec![], | |
84 | captures: vec![], | |
85 | capture_name_idx: Arc::new(HashMap::new()), | |
86 | start: 0, | |
87 | byte_classes: vec![0; 256], | |
88 | only_utf8: true, | |
89 | is_bytes: false, | |
90 | is_dfa: false, | |
91 | is_reverse: false, | |
92 | is_anchored_start: false, | |
93 | is_anchored_end: false, | |
94 | has_unicode_word_boundary: false, | |
95 | prefixes: LiteralSearcher::empty(), | |
96 | dfa_size_limit: 2 * (1<<20), | |
97 | } | |
98 | } | |
99 | ||
100 | /// If pc is an index to a no-op instruction (like Save), then return the | |
101 | /// next pc that is not a no-op instruction. | |
102 | pub fn skip(&self, mut pc: usize) -> usize { | |
103 | loop { | |
104 | match self[pc] { | |
105 | Inst::Save(ref i) => pc = i.goto, | |
106 | _ => return pc, | |
107 | } | |
108 | } | |
109 | } | |
110 | ||
111 | /// Return true if and only if an execution engine at instruction `pc` will | |
112 | /// always lead to a match. | |
113 | pub fn leads_to_match(&self, pc: usize) -> bool { | |
114 | if self.matches.len() > 1 { | |
115 | // If we have a regex set, then we have more than one ending | |
116 | // state, so leading to one of those states is generally | |
117 | // meaningless. | |
118 | return false; | |
119 | } | |
120 | match self[self.skip(pc)] { | |
121 | Inst::Match(_) => true, | |
122 | _ => false, | |
123 | } | |
124 | } | |
125 | ||
126 | /// Returns true if the current configuration demands that an implicit | |
127 | /// `.*?` be prepended to the instruction sequence. | |
128 | pub fn needs_dotstar(&self) -> bool { | |
129 | self.is_dfa && !self.is_reverse && !self.is_anchored_start | |
130 | } | |
131 | ||
132 | /// Returns true if this program uses Byte instructions instead of | |
133 | /// Char/Range instructions. | |
134 | pub fn uses_bytes(&self) -> bool { | |
135 | self.is_bytes || self.is_dfa | |
136 | } | |
137 | ||
138 | /// Returns true if this program exclusively matches valid UTF-8 bytes. | |
139 | /// | |
140 | /// That is, if an invalid UTF-8 byte is seen, then no match is possible. | |
141 | pub fn only_utf8(&self) -> bool { | |
142 | self.only_utf8 | |
143 | } | |
144 | ||
145 | /// Return the approximate heap usage of this instruction sequence in | |
146 | /// bytes. | |
147 | pub fn approximate_size(&self) -> usize { | |
148 | // The only instruction that uses heap space is Ranges (for | |
149 | // Unicode codepoint programs) to store non-overlapping codepoint | |
150 | // ranges. To keep this operation constant time, we ignore them. | |
151 | (self.len() * mem::size_of::<Inst>()) | |
152 | + (self.matches.len() * mem::size_of::<InstPtr>()) | |
153 | + (self.captures.len() * mem::size_of::<Option<String>>()) | |
154 | + (self.capture_name_idx.len() * | |
155 | (mem::size_of::<String>() + mem::size_of::<usize>())) | |
156 | + (self.byte_classes.len() * mem::size_of::<u8>()) | |
157 | + self.prefixes.approximate_size() | |
158 | } | |
159 | } | |
160 | ||
161 | impl Deref for Program { | |
162 | type Target = [Inst]; | |
163 | ||
164 | fn deref(&self) -> &Self::Target { | |
165 | &*self.insts | |
166 | } | |
167 | } | |
168 | ||
169 | impl fmt::Debug for Program { | |
170 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
171 | use self::Inst::*; | |
172 | ||
173 | fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { | |
174 | if goto == cur + 1 { | |
175 | fmtd | |
176 | } else { | |
177 | format!("{} (goto: {})", fmtd, goto) | |
178 | } | |
179 | } | |
180 | ||
181 | fn visible_byte(b: u8) -> String { | |
182 | use std::ascii::escape_default; | |
183 | let escaped = escape_default(b).collect::<Vec<u8>>(); | |
184 | String::from_utf8_lossy(&escaped).into_owned() | |
185 | } | |
186 | ||
187 | for (pc, inst) in self.iter().enumerate() { | |
188 | match *inst { | |
189 | Match(slot) => { | |
190 | try!(write!(f, "{:04} Match({:?})", pc, slot)) | |
191 | } | |
192 | Save(ref inst) => { | |
193 | let s = format!("{:04} Save({})", pc, inst.slot); | |
194 | try!(write!(f, "{}", with_goto(pc, inst.goto, s))); | |
195 | } | |
196 | Split(ref inst) => { | |
197 | try!(write!(f, "{:04} Split({}, {})", | |
198 | pc, inst.goto1, inst.goto2)); | |
199 | } | |
200 | EmptyLook(ref inst) => { | |
201 | let s = format!("{:?}", inst.look); | |
202 | try!(write!(f, "{:04} {}", | |
203 | pc, with_goto(pc, inst.goto, s))); | |
204 | } | |
205 | Char(ref inst) => { | |
206 | let s = format!("{:?}", inst.c); | |
207 | try!(write!(f, "{:04} {}", | |
208 | pc, with_goto(pc, inst.goto, s))); | |
209 | } | |
210 | Ranges(ref inst) => { | |
211 | let ranges = inst.ranges | |
212 | .iter() | |
213 | .map(|r| format!("{:?}-{:?}", r.0, r.1)) | |
214 | .collect::<Vec<String>>() | |
215 | .join(", "); | |
216 | try!(write!(f, "{:04} {}", | |
217 | pc, with_goto(pc, inst.goto, ranges))); | |
218 | } | |
219 | Bytes(ref inst) => { | |
220 | let s = format!( | |
221 | "Bytes({}, {})", | |
222 | visible_byte(inst.start), | |
223 | visible_byte(inst.end)); | |
224 | try!(write!(f, "{:04} {}", | |
225 | pc, with_goto(pc, inst.goto, s))); | |
226 | } | |
227 | } | |
228 | if pc == self.start { | |
229 | try!(write!(f, " (start)")); | |
230 | } | |
231 | try!(write!(f, "\n")); | |
232 | } | |
233 | Ok(()) | |
234 | } | |
235 | } | |
236 | ||
237 | impl<'a> IntoIterator for &'a Program { | |
238 | type Item = &'a Inst; | |
239 | type IntoIter = slice::Iter<'a, Inst>; | |
240 | fn into_iter(self) -> Self::IntoIter { self.iter() } | |
241 | } | |
242 | ||
243 | /// Inst is an instruction code in a Regex program. | |
244 | /// | |
245 | /// Regrettably, a regex program either contains Unicode codepoint | |
246 | /// instructions (Char and Ranges) or it contains byte instructions (Bytes). | |
247 | /// A regex program can never contain both. | |
248 | /// | |
249 | /// It would be worth investigating splitting this into two distinct types and | |
250 | /// then figuring out how to make the matching engines polymorphic over those | |
251 | /// types without sacrificing performance. | |
252 | /// | |
253 | /// Other than the benefit of moving invariants into the type system, another | |
254 | /// benefit is the decreased size. If we remove the `Char` and `Ranges` | |
255 | /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to | |
256 | /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` | |
257 | /// variant.) Given that byte based machines are typically much bigger than | |
258 | /// their Unicode analogues (because they can decode UTF-8 directly), this ends | |
259 | /// up being a pretty significant savings. | |
260 | #[derive(Clone, Debug)] | |
261 | pub enum Inst { | |
262 | /// Match indicates that the program has reached a match state. | |
263 | /// | |
264 | /// The number in the match corresponds to the Nth logical regular | |
265 | /// expression in this program. This index is always 0 for normal regex | |
266 | /// programs. Values greater than 0 appear when compiling regex sets, and | |
267 | /// each match instruction gets its own unique value. The value corresponds | |
268 | /// to the Nth regex in the set. | |
269 | Match(usize), | |
270 | /// Save causes the program to save the current location of the input in | |
271 | /// the slot indicated by InstSave. | |
272 | Save(InstSave), | |
273 | /// Split causes the program to diverge to one of two paths in the | |
274 | /// program, preferring goto1 in InstSplit. | |
275 | Split(InstSplit), | |
276 | /// EmptyLook represents a zero-width assertion in a regex program. A | |
277 | /// zero-width assertion does not consume any of the input text. | |
278 | EmptyLook(InstEmptyLook), | |
279 | /// Char requires the regex program to match the character in InstChar at | |
280 | /// the current position in the input. | |
281 | Char(InstChar), | |
282 | /// Ranges requires the regex program to match the character at the current | |
283 | /// position in the input with one of the ranges specified in InstRanges. | |
284 | Ranges(InstRanges), | |
285 | /// Bytes is like Ranges, except it expresses a single byte range. It is | |
286 | /// used in conjunction with Split instructions to implement multi-byte | |
287 | /// character classes. | |
288 | Bytes(InstBytes), | |
289 | } | |
290 | ||
291 | impl Inst { | |
292 | /// Returns true if and only if this is a match instruction. | |
293 | pub fn is_match(&self) -> bool { | |
294 | match *self { | |
295 | Inst::Match(_) => true, | |
296 | _ => false, | |
297 | } | |
298 | } | |
299 | } | |
300 | ||
301 | /// Representation of the Save instruction. | |
302 | #[derive(Clone, Debug)] | |
303 | pub struct InstSave { | |
304 | /// The next location to execute in the program. | |
305 | pub goto: InstPtr, | |
306 | /// The capture slot (there are two slots for every capture in a regex, | |
307 | /// including the zeroth capture for the entire match). | |
308 | pub slot: usize, | |
309 | } | |
310 | ||
311 | /// Representation of the Split instruction. | |
312 | #[derive(Clone, Debug)] | |
313 | pub struct InstSplit { | |
314 | /// The first instruction to try. A match resulting from following goto1 | |
315 | /// has precedence over a match resulting from following goto2. | |
316 | pub goto1: InstPtr, | |
317 | /// The second instruction to try. A match resulting from following goto1 | |
318 | /// has precedence over a match resulting from following goto2. | |
319 | pub goto2: InstPtr, | |
320 | } | |
321 | ||
322 | /// Representation of the `EmptyLook` instruction. | |
323 | #[derive(Clone, Debug)] | |
324 | pub struct InstEmptyLook { | |
325 | /// The next location to execute in the program if this instruction | |
326 | /// succeeds. | |
327 | pub goto: InstPtr, | |
328 | /// The type of zero-width assertion to check. | |
329 | pub look: EmptyLook, | |
330 | } | |
331 | ||
332 | /// The set of zero-width match instructions. | |
333 | #[derive(Clone, Copy, Debug, PartialEq, Eq)] | |
334 | pub enum EmptyLook { | |
335 | /// Start of line or input. | |
336 | StartLine, | |
337 | /// End of line or input. | |
338 | EndLine, | |
339 | /// Start of input. | |
340 | StartText, | |
341 | /// End of input. | |
342 | EndText, | |
343 | /// Word character on one side and non-word character on other. | |
344 | WordBoundary, | |
345 | /// Word character on both sides or non-word character on both sides. | |
346 | NotWordBoundary, | |
347 | /// ASCII word boundary. | |
348 | WordBoundaryAscii, | |
349 | /// Not ASCII word boundary. | |
350 | NotWordBoundaryAscii, | |
351 | } | |
352 | ||
353 | /// Representation of the Char instruction. | |
354 | #[derive(Clone, Debug)] | |
355 | pub struct InstChar { | |
356 | /// The next location to execute in the program if this instruction | |
357 | /// succeeds. | |
358 | pub goto: InstPtr, | |
359 | /// The character to test. | |
360 | pub c: char, | |
361 | } | |
362 | ||
363 | /// Representation of the Ranges instruction. | |
364 | #[derive(Clone, Debug)] | |
365 | pub struct InstRanges { | |
366 | /// The next location to execute in the program if this instruction | |
367 | /// succeeds. | |
368 | pub goto: InstPtr, | |
369 | /// The set of Unicode scalar value ranges to test. | |
370 | pub ranges: Vec<(char, char)>, | |
371 | } | |
372 | ||
373 | impl InstRanges { | |
374 | /// Tests whether the given input character matches this instruction. | |
375 | pub fn matches(&self, c: Char) -> bool { | |
376 | // This speeds up the `match_class_unicode` benchmark by checking | |
377 | // some common cases quickly without binary search. e.g., Matching | |
378 | // a Unicode class on predominantly ASCII text. | |
379 | for r in self.ranges.iter().take(4) { | |
380 | if c < r.0 { | |
381 | return false; | |
382 | } | |
383 | if c <= r.1 { | |
384 | return true; | |
385 | } | |
386 | } | |
387 | self.ranges.binary_search_by(|r| { | |
388 | if r.1 < c { | |
389 | Ordering::Less | |
390 | } else if r.0 > c { | |
391 | Ordering::Greater | |
392 | } else { | |
393 | Ordering::Equal | |
394 | } | |
395 | }).is_ok() | |
396 | } | |
397 | ||
398 | /// Return the number of distinct characters represented by all of the | |
399 | /// ranges. | |
400 | pub fn num_chars(&self) -> usize { | |
401 | self.ranges.iter() | |
402 | .map(|&(s, e)| 1 + (e as u32) - (s as u32)) | |
403 | .fold(0, |acc, len| acc + len) | |
404 | as usize | |
405 | } | |
406 | } | |
407 | ||
408 | /// Representation of the Bytes instruction. | |
409 | #[derive(Clone, Debug)] | |
410 | pub struct InstBytes { | |
411 | /// The next location to execute in the program if this instruction | |
412 | /// succeeds. | |
413 | pub goto: InstPtr, | |
414 | /// The start (inclusive) of this byte range. | |
415 | pub start: u8, | |
416 | /// The end (inclusive) of this byte range. | |
417 | pub end: u8, | |
418 | } | |
419 | ||
420 | impl InstBytes { | |
421 | /// Returns true if and only if the given byte is in this range. | |
422 | pub fn matches(&self, byte: u8) -> bool { | |
423 | self.start <= byte && byte <= self.end | |
424 | } | |
425 | } |