]>
Commit | Line | Data |
---|---|---|
8bb4bdeb | 1 | use std::cmp::Ordering; |
f9f354fc | 2 | use std::collections::HashMap; |
8bb4bdeb | 3 | use std::fmt; |
8bb4bdeb | 4 | use std::mem; |
f9f354fc | 5 | use std::ops::Deref; |
8bb4bdeb XL |
6 | use std::slice; |
7 | use std::sync::Arc; | |
8 | ||
9 | use input::Char; | |
0531ce1d | 10 | use literal::LiteralSearcher; |
8bb4bdeb | 11 | |
ff7c6d11 | 12 | /// `InstPtr` represents the index of an instruction in a regex program. |
8bb4bdeb XL |
13 | pub type InstPtr = usize; |
14 | ||
15 | /// Program is a sequence of instructions and various facts about thos | |
16 | /// instructions. | |
17 | #[derive(Clone)] | |
18 | pub struct Program { | |
19 | /// A sequence of instructions that represents an NFA. | |
20 | pub insts: Vec<Inst>, | |
21 | /// Pointers to each Match instruction in the sequence. | |
22 | /// | |
23 | /// This is always length 1 unless this program represents a regex set. | |
24 | pub matches: Vec<InstPtr>, | |
25 | /// The ordered sequence of all capture groups extracted from the AST. | |
26 | /// Unnamed groups are `None`. | |
27 | pub captures: Vec<Option<String>>, | |
28 | /// Pointers to all named capture groups into `captures`. | |
29 | pub capture_name_idx: Arc<HashMap<String, usize>>, | |
30 | /// A pointer to the start instruction. This can vary depending on how | |
31 | /// the program was compiled. For example, programs for use with the DFA | |
32 | /// engine have a `.*?` inserted at the beginning of unanchored regular | |
33 | /// expressions. The actual starting point of the program is after the | |
34 | /// `.*?`. | |
35 | pub start: InstPtr, | |
36 | /// A set of equivalence classes for discriminating bytes in the compiled | |
37 | /// program. | |
38 | pub byte_classes: Vec<u8>, | |
39 | /// When true, this program can only match valid UTF-8. | |
40 | pub only_utf8: bool, | |
41 | /// When true, this program uses byte range instructions instead of Unicode | |
42 | /// range instructions. | |
43 | pub is_bytes: bool, | |
44 | /// When true, the program is compiled for DFA matching. For example, this | |
45 | /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored | |
46 | /// regexes. | |
47 | pub is_dfa: bool, | |
48 | /// When true, the program matches text in reverse (for use only in the | |
49 | /// DFA). | |
50 | pub is_reverse: bool, | |
51 | /// Whether the regex must match from the start of the input. | |
52 | pub is_anchored_start: bool, | |
53 | /// Whether the regex must match at the end of the input. | |
54 | pub is_anchored_end: bool, | |
55 | /// Whether this program contains a Unicode word boundary instruction. | |
56 | pub has_unicode_word_boundary: bool, | |
57 | /// A possibly empty machine for very quickly matching prefix literals. | |
58 | pub prefixes: LiteralSearcher, | |
59 | /// A limit on the size of the cache that the DFA is allowed to use while | |
60 | /// matching. | |
61 | /// | |
62 | /// The cache limit specifies approximately how much space we're willing to | |
63 | /// give to the state cache. Once the state cache exceeds the size, it is | |
64 | /// wiped and all states must be re-computed. | |
65 | /// | |
66 | /// Note that this value does not impact correctness. It can be set to 0 | |
67 | /// and the DFA will run just fine. (It will only ever store exactly one | |
68 | /// state in the cache, and will likely run very slowly, but it will work.) | |
69 | /// | |
70 | /// Also note that this limit is *per thread of execution*. That is, | |
71 | /// if the same regex is used to search text across multiple threads | |
72 | /// simultaneously, then the DFA cache is not shared. Instead, copies are | |
73 | /// made. | |
74 | pub dfa_size_limit: usize, | |
75 | } | |
76 | ||
77 | impl Program { | |
78 | /// Creates an empty instruction sequence. Fields are given default | |
79 | /// values. | |
80 | pub fn new() -> Self { | |
81 | Program { | |
82 | insts: vec![], | |
83 | matches: vec![], | |
84 | captures: vec![], | |
85 | capture_name_idx: Arc::new(HashMap::new()), | |
86 | start: 0, | |
87 | byte_classes: vec![0; 256], | |
88 | only_utf8: true, | |
89 | is_bytes: false, | |
90 | is_dfa: false, | |
91 | is_reverse: false, | |
92 | is_anchored_start: false, | |
93 | is_anchored_end: false, | |
94 | has_unicode_word_boundary: false, | |
95 | prefixes: LiteralSearcher::empty(), | |
f9f354fc | 96 | dfa_size_limit: 2 * (1 << 20), |
8bb4bdeb XL |
97 | } |
98 | } | |
99 | ||
100 | /// If pc is an index to a no-op instruction (like Save), then return the | |
101 | /// next pc that is not a no-op instruction. | |
102 | pub fn skip(&self, mut pc: usize) -> usize { | |
103 | loop { | |
104 | match self[pc] { | |
105 | Inst::Save(ref i) => pc = i.goto, | |
106 | _ => return pc, | |
107 | } | |
108 | } | |
109 | } | |
110 | ||
111 | /// Return true if and only if an execution engine at instruction `pc` will | |
112 | /// always lead to a match. | |
113 | pub fn leads_to_match(&self, pc: usize) -> bool { | |
114 | if self.matches.len() > 1 { | |
115 | // If we have a regex set, then we have more than one ending | |
116 | // state, so leading to one of those states is generally | |
117 | // meaningless. | |
118 | return false; | |
119 | } | |
120 | match self[self.skip(pc)] { | |
121 | Inst::Match(_) => true, | |
122 | _ => false, | |
123 | } | |
124 | } | |
125 | ||
126 | /// Returns true if the current configuration demands that an implicit | |
127 | /// `.*?` be prepended to the instruction sequence. | |
128 | pub fn needs_dotstar(&self) -> bool { | |
129 | self.is_dfa && !self.is_reverse && !self.is_anchored_start | |
130 | } | |
131 | ||
132 | /// Returns true if this program uses Byte instructions instead of | |
133 | /// Char/Range instructions. | |
134 | pub fn uses_bytes(&self) -> bool { | |
135 | self.is_bytes || self.is_dfa | |
136 | } | |
137 | ||
138 | /// Returns true if this program exclusively matches valid UTF-8 bytes. | |
139 | /// | |
140 | /// That is, if an invalid UTF-8 byte is seen, then no match is possible. | |
141 | pub fn only_utf8(&self) -> bool { | |
142 | self.only_utf8 | |
143 | } | |
144 | ||
145 | /// Return the approximate heap usage of this instruction sequence in | |
146 | /// bytes. | |
147 | pub fn approximate_size(&self) -> usize { | |
148 | // The only instruction that uses heap space is Ranges (for | |
149 | // Unicode codepoint programs) to store non-overlapping codepoint | |
150 | // ranges. To keep this operation constant time, we ignore them. | |
151 | (self.len() * mem::size_of::<Inst>()) | |
f9f354fc XL |
152 | + (self.matches.len() * mem::size_of::<InstPtr>()) |
153 | + (self.captures.len() * mem::size_of::<Option<String>>()) | |
154 | + (self.capture_name_idx.len() | |
155 | * (mem::size_of::<String>() + mem::size_of::<usize>())) | |
156 | + (self.byte_classes.len() * mem::size_of::<u8>()) | |
157 | + self.prefixes.approximate_size() | |
8bb4bdeb XL |
158 | } |
159 | } | |
160 | ||
161 | impl Deref for Program { | |
162 | type Target = [Inst]; | |
163 | ||
f9f354fc | 164 | #[cfg_attr(feature = "perf-inline", inline(always))] |
8bb4bdeb XL |
165 | fn deref(&self) -> &Self::Target { |
166 | &*self.insts | |
167 | } | |
168 | } | |
169 | ||
170 | impl fmt::Debug for Program { | |
171 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
172 | use self::Inst::*; | |
173 | ||
174 | fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { | |
175 | if goto == cur + 1 { | |
176 | fmtd | |
177 | } else { | |
178 | format!("{} (goto: {})", fmtd, goto) | |
179 | } | |
180 | } | |
181 | ||
182 | fn visible_byte(b: u8) -> String { | |
183 | use std::ascii::escape_default; | |
184 | let escaped = escape_default(b).collect::<Vec<u8>>(); | |
185 | String::from_utf8_lossy(&escaped).into_owned() | |
186 | } | |
187 | ||
188 | for (pc, inst) in self.iter().enumerate() { | |
189 | match *inst { | |
f9f354fc | 190 | Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, |
8bb4bdeb XL |
191 | Save(ref inst) => { |
192 | let s = format!("{:04} Save({})", pc, inst.slot); | |
94b46f34 | 193 | write!(f, "{}", with_goto(pc, inst.goto, s))?; |
8bb4bdeb XL |
194 | } |
195 | Split(ref inst) => { | |
94b46f34 | 196 | write!( |
f9f354fc XL |
197 | f, |
198 | "{:04} Split({}, {})", | |
199 | pc, inst.goto1, inst.goto2 | |
200 | )?; | |
8bb4bdeb XL |
201 | } |
202 | EmptyLook(ref inst) => { | |
203 | let s = format!("{:?}", inst.look); | |
94b46f34 | 204 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
8bb4bdeb XL |
205 | } |
206 | Char(ref inst) => { | |
207 | let s = format!("{:?}", inst.c); | |
94b46f34 | 208 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
8bb4bdeb XL |
209 | } |
210 | Ranges(ref inst) => { | |
f9f354fc XL |
211 | let ranges = inst |
212 | .ranges | |
8bb4bdeb XL |
213 | .iter() |
214 | .map(|r| format!("{:?}-{:?}", r.0, r.1)) | |
215 | .collect::<Vec<String>>() | |
216 | .join(", "); | |
94b46f34 | 217 | write!( |
f9f354fc XL |
218 | f, |
219 | "{:04} {}", | |
220 | pc, | |
221 | with_goto(pc, inst.goto, ranges) | |
222 | )?; | |
8bb4bdeb XL |
223 | } |
224 | Bytes(ref inst) => { | |
225 | let s = format!( | |
226 | "Bytes({}, {})", | |
227 | visible_byte(inst.start), | |
f9f354fc XL |
228 | visible_byte(inst.end) |
229 | ); | |
94b46f34 | 230 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
8bb4bdeb XL |
231 | } |
232 | } | |
233 | if pc == self.start { | |
94b46f34 | 234 | write!(f, " (start)")?; |
8bb4bdeb | 235 | } |
94b46f34 | 236 | write!(f, "\n")?; |
8bb4bdeb XL |
237 | } |
238 | Ok(()) | |
239 | } | |
240 | } | |
241 | ||
242 | impl<'a> IntoIterator for &'a Program { | |
243 | type Item = &'a Inst; | |
244 | type IntoIter = slice::Iter<'a, Inst>; | |
f9f354fc XL |
245 | fn into_iter(self) -> Self::IntoIter { |
246 | self.iter() | |
247 | } | |
8bb4bdeb XL |
248 | } |
249 | ||
250 | /// Inst is an instruction code in a Regex program. | |
251 | /// | |
252 | /// Regrettably, a regex program either contains Unicode codepoint | |
253 | /// instructions (Char and Ranges) or it contains byte instructions (Bytes). | |
254 | /// A regex program can never contain both. | |
255 | /// | |
256 | /// It would be worth investigating splitting this into two distinct types and | |
257 | /// then figuring out how to make the matching engines polymorphic over those | |
258 | /// types without sacrificing performance. | |
259 | /// | |
260 | /// Other than the benefit of moving invariants into the type system, another | |
261 | /// benefit is the decreased size. If we remove the `Char` and `Ranges` | |
17df50a5 XL |
262 | /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to |
263 | /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` | |
8bb4bdeb XL |
264 | /// variant.) Given that byte based machines are typically much bigger than |
265 | /// their Unicode analogues (because they can decode UTF-8 directly), this ends | |
266 | /// up being a pretty significant savings. | |
267 | #[derive(Clone, Debug)] | |
268 | pub enum Inst { | |
269 | /// Match indicates that the program has reached a match state. | |
270 | /// | |
271 | /// The number in the match corresponds to the Nth logical regular | |
272 | /// expression in this program. This index is always 0 for normal regex | |
273 | /// programs. Values greater than 0 appear when compiling regex sets, and | |
274 | /// each match instruction gets its own unique value. The value corresponds | |
275 | /// to the Nth regex in the set. | |
276 | Match(usize), | |
277 | /// Save causes the program to save the current location of the input in | |
278 | /// the slot indicated by InstSave. | |
279 | Save(InstSave), | |
280 | /// Split causes the program to diverge to one of two paths in the | |
281 | /// program, preferring goto1 in InstSplit. | |
282 | Split(InstSplit), | |
283 | /// EmptyLook represents a zero-width assertion in a regex program. A | |
284 | /// zero-width assertion does not consume any of the input text. | |
285 | EmptyLook(InstEmptyLook), | |
286 | /// Char requires the regex program to match the character in InstChar at | |
287 | /// the current position in the input. | |
288 | Char(InstChar), | |
289 | /// Ranges requires the regex program to match the character at the current | |
290 | /// position in the input with one of the ranges specified in InstRanges. | |
291 | Ranges(InstRanges), | |
292 | /// Bytes is like Ranges, except it expresses a single byte range. It is | |
293 | /// used in conjunction with Split instructions to implement multi-byte | |
294 | /// character classes. | |
295 | Bytes(InstBytes), | |
296 | } | |
297 | ||
298 | impl Inst { | |
299 | /// Returns true if and only if this is a match instruction. | |
300 | pub fn is_match(&self) -> bool { | |
301 | match *self { | |
302 | Inst::Match(_) => true, | |
303 | _ => false, | |
304 | } | |
305 | } | |
306 | } | |
307 | ||
308 | /// Representation of the Save instruction. | |
309 | #[derive(Clone, Debug)] | |
310 | pub struct InstSave { | |
311 | /// The next location to execute in the program. | |
312 | pub goto: InstPtr, | |
313 | /// The capture slot (there are two slots for every capture in a regex, | |
314 | /// including the zeroth capture for the entire match). | |
315 | pub slot: usize, | |
316 | } | |
317 | ||
318 | /// Representation of the Split instruction. | |
319 | #[derive(Clone, Debug)] | |
320 | pub struct InstSplit { | |
321 | /// The first instruction to try. A match resulting from following goto1 | |
322 | /// has precedence over a match resulting from following goto2. | |
323 | pub goto1: InstPtr, | |
324 | /// The second instruction to try. A match resulting from following goto1 | |
325 | /// has precedence over a match resulting from following goto2. | |
326 | pub goto2: InstPtr, | |
327 | } | |
328 | ||
ff7c6d11 | 329 | /// Representation of the `EmptyLook` instruction. |
8bb4bdeb XL |
330 | #[derive(Clone, Debug)] |
331 | pub struct InstEmptyLook { | |
332 | /// The next location to execute in the program if this instruction | |
333 | /// succeeds. | |
334 | pub goto: InstPtr, | |
335 | /// The type of zero-width assertion to check. | |
336 | pub look: EmptyLook, | |
337 | } | |
338 | ||
339 | /// The set of zero-width match instructions. | |
340 | #[derive(Clone, Copy, Debug, PartialEq, Eq)] | |
341 | pub enum EmptyLook { | |
342 | /// Start of line or input. | |
343 | StartLine, | |
344 | /// End of line or input. | |
345 | EndLine, | |
346 | /// Start of input. | |
347 | StartText, | |
348 | /// End of input. | |
349 | EndText, | |
350 | /// Word character on one side and non-word character on other. | |
351 | WordBoundary, | |
352 | /// Word character on both sides or non-word character on both sides. | |
353 | NotWordBoundary, | |
354 | /// ASCII word boundary. | |
355 | WordBoundaryAscii, | |
356 | /// Not ASCII word boundary. | |
357 | NotWordBoundaryAscii, | |
358 | } | |
359 | ||
360 | /// Representation of the Char instruction. | |
361 | #[derive(Clone, Debug)] | |
362 | pub struct InstChar { | |
363 | /// The next location to execute in the program if this instruction | |
364 | /// succeeds. | |
365 | pub goto: InstPtr, | |
366 | /// The character to test. | |
367 | pub c: char, | |
368 | } | |
369 | ||
370 | /// Representation of the Ranges instruction. | |
371 | #[derive(Clone, Debug)] | |
372 | pub struct InstRanges { | |
373 | /// The next location to execute in the program if this instruction | |
374 | /// succeeds. | |
375 | pub goto: InstPtr, | |
376 | /// The set of Unicode scalar value ranges to test. | |
17df50a5 | 377 | pub ranges: Vec<(char, char)>, |
8bb4bdeb XL |
378 | } |
379 | ||
380 | impl InstRanges { | |
381 | /// Tests whether the given input character matches this instruction. | |
382 | pub fn matches(&self, c: Char) -> bool { | |
383 | // This speeds up the `match_class_unicode` benchmark by checking | |
384 | // some common cases quickly without binary search. e.g., Matching | |
385 | // a Unicode class on predominantly ASCII text. | |
386 | for r in self.ranges.iter().take(4) { | |
387 | if c < r.0 { | |
388 | return false; | |
389 | } | |
390 | if c <= r.1 { | |
391 | return true; | |
392 | } | |
393 | } | |
f9f354fc XL |
394 | self.ranges |
395 | .binary_search_by(|r| { | |
396 | if r.1 < c { | |
397 | Ordering::Less | |
398 | } else if r.0 > c { | |
399 | Ordering::Greater | |
400 | } else { | |
401 | Ordering::Equal | |
402 | } | |
403 | }) | |
404 | .is_ok() | |
8bb4bdeb XL |
405 | } |
406 | ||
407 | /// Return the number of distinct characters represented by all of the | |
408 | /// ranges. | |
409 | pub fn num_chars(&self) -> usize { | |
f9f354fc XL |
410 | self.ranges |
411 | .iter() | |
8bb4bdeb | 412 | .map(|&(s, e)| 1 + (e as u32) - (s as u32)) |
f9f354fc | 413 | .sum::<u32>() as usize |
8bb4bdeb XL |
414 | } |
415 | } | |
416 | ||
417 | /// Representation of the Bytes instruction. | |
418 | #[derive(Clone, Debug)] | |
419 | pub struct InstBytes { | |
420 | /// The next location to execute in the program if this instruction | |
421 | /// succeeds. | |
422 | pub goto: InstPtr, | |
423 | /// The start (inclusive) of this byte range. | |
424 | pub start: u8, | |
425 | /// The end (inclusive) of this byte range. | |
426 | pub end: u8, | |
427 | } | |
428 | ||
429 | impl InstBytes { | |
430 | /// Returns true if and only if the given byte is in this range. | |
431 | pub fn matches(&self, byte: u8) -> bool { | |
432 | self.start <= byte && byte <= self.end | |
433 | } | |
434 | } |