]> git.proxmox.com Git - rustc.git/blame - vendor/regex/src/prog.rs
New upstream version 1.53.0+dfsg1
[rustc.git] / vendor / regex / src / prog.rs
CommitLineData
8bb4bdeb 1use std::cmp::Ordering;
f9f354fc 2use std::collections::HashMap;
8bb4bdeb 3use std::fmt;
8bb4bdeb 4use std::mem;
f9f354fc 5use std::ops::Deref;
8bb4bdeb
XL
6use std::slice;
7use std::sync::Arc;
8
9use input::Char;
0531ce1d 10use literal::LiteralSearcher;
8bb4bdeb 11
ff7c6d11 12/// `InstPtr` represents the index of an instruction in a regex program.
8bb4bdeb
XL
13pub type InstPtr = usize;
14
15/// Program is a sequence of instructions and various facts about thos
16/// instructions.
17#[derive(Clone)]
18pub struct Program {
19 /// A sequence of instructions that represents an NFA.
20 pub insts: Vec<Inst>,
21 /// Pointers to each Match instruction in the sequence.
22 ///
23 /// This is always length 1 unless this program represents a regex set.
24 pub matches: Vec<InstPtr>,
25 /// The ordered sequence of all capture groups extracted from the AST.
26 /// Unnamed groups are `None`.
27 pub captures: Vec<Option<String>>,
28 /// Pointers to all named capture groups into `captures`.
29 pub capture_name_idx: Arc<HashMap<String, usize>>,
30 /// A pointer to the start instruction. This can vary depending on how
31 /// the program was compiled. For example, programs for use with the DFA
32 /// engine have a `.*?` inserted at the beginning of unanchored regular
33 /// expressions. The actual starting point of the program is after the
34 /// `.*?`.
35 pub start: InstPtr,
36 /// A set of equivalence classes for discriminating bytes in the compiled
37 /// program.
38 pub byte_classes: Vec<u8>,
39 /// When true, this program can only match valid UTF-8.
40 pub only_utf8: bool,
41 /// When true, this program uses byte range instructions instead of Unicode
42 /// range instructions.
43 pub is_bytes: bool,
44 /// When true, the program is compiled for DFA matching. For example, this
45 /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
46 /// regexes.
47 pub is_dfa: bool,
48 /// When true, the program matches text in reverse (for use only in the
49 /// DFA).
50 pub is_reverse: bool,
51 /// Whether the regex must match from the start of the input.
52 pub is_anchored_start: bool,
53 /// Whether the regex must match at the end of the input.
54 pub is_anchored_end: bool,
55 /// Whether this program contains a Unicode word boundary instruction.
56 pub has_unicode_word_boundary: bool,
57 /// A possibly empty machine for very quickly matching prefix literals.
58 pub prefixes: LiteralSearcher,
59 /// A limit on the size of the cache that the DFA is allowed to use while
60 /// matching.
61 ///
62 /// The cache limit specifies approximately how much space we're willing to
63 /// give to the state cache. Once the state cache exceeds the size, it is
64 /// wiped and all states must be re-computed.
65 ///
66 /// Note that this value does not impact correctness. It can be set to 0
67 /// and the DFA will run just fine. (It will only ever store exactly one
68 /// state in the cache, and will likely run very slowly, but it will work.)
69 ///
70 /// Also note that this limit is *per thread of execution*. That is,
71 /// if the same regex is used to search text across multiple threads
72 /// simultaneously, then the DFA cache is not shared. Instead, copies are
73 /// made.
74 pub dfa_size_limit: usize,
75}
76
77impl Program {
78 /// Creates an empty instruction sequence. Fields are given default
79 /// values.
80 pub fn new() -> Self {
81 Program {
82 insts: vec![],
83 matches: vec![],
84 captures: vec![],
85 capture_name_idx: Arc::new(HashMap::new()),
86 start: 0,
87 byte_classes: vec![0; 256],
88 only_utf8: true,
89 is_bytes: false,
90 is_dfa: false,
91 is_reverse: false,
92 is_anchored_start: false,
93 is_anchored_end: false,
94 has_unicode_word_boundary: false,
95 prefixes: LiteralSearcher::empty(),
f9f354fc 96 dfa_size_limit: 2 * (1 << 20),
8bb4bdeb
XL
97 }
98 }
99
100 /// If pc is an index to a no-op instruction (like Save), then return the
101 /// next pc that is not a no-op instruction.
102 pub fn skip(&self, mut pc: usize) -> usize {
103 loop {
104 match self[pc] {
105 Inst::Save(ref i) => pc = i.goto,
106 _ => return pc,
107 }
108 }
109 }
110
111 /// Return true if and only if an execution engine at instruction `pc` will
112 /// always lead to a match.
113 pub fn leads_to_match(&self, pc: usize) -> bool {
114 if self.matches.len() > 1 {
115 // If we have a regex set, then we have more than one ending
116 // state, so leading to one of those states is generally
117 // meaningless.
118 return false;
119 }
120 match self[self.skip(pc)] {
121 Inst::Match(_) => true,
122 _ => false,
123 }
124 }
125
126 /// Returns true if the current configuration demands that an implicit
127 /// `.*?` be prepended to the instruction sequence.
128 pub fn needs_dotstar(&self) -> bool {
129 self.is_dfa && !self.is_reverse && !self.is_anchored_start
130 }
131
132 /// Returns true if this program uses Byte instructions instead of
133 /// Char/Range instructions.
134 pub fn uses_bytes(&self) -> bool {
135 self.is_bytes || self.is_dfa
136 }
137
138 /// Returns true if this program exclusively matches valid UTF-8 bytes.
139 ///
140 /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141 pub fn only_utf8(&self) -> bool {
142 self.only_utf8
143 }
144
145 /// Return the approximate heap usage of this instruction sequence in
146 /// bytes.
147 pub fn approximate_size(&self) -> usize {
148 // The only instruction that uses heap space is Ranges (for
149 // Unicode codepoint programs) to store non-overlapping codepoint
150 // ranges. To keep this operation constant time, we ignore them.
151 (self.len() * mem::size_of::<Inst>())
f9f354fc
XL
152 + (self.matches.len() * mem::size_of::<InstPtr>())
153 + (self.captures.len() * mem::size_of::<Option<String>>())
154 + (self.capture_name_idx.len()
155 * (mem::size_of::<String>() + mem::size_of::<usize>()))
156 + (self.byte_classes.len() * mem::size_of::<u8>())
157 + self.prefixes.approximate_size()
8bb4bdeb
XL
158 }
159}
160
161impl Deref for Program {
162 type Target = [Inst];
163
f9f354fc 164 #[cfg_attr(feature = "perf-inline", inline(always))]
8bb4bdeb
XL
165 fn deref(&self) -> &Self::Target {
166 &*self.insts
167 }
168}
169
170impl fmt::Debug for Program {
171 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
172 use self::Inst::*;
173
174 fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
175 if goto == cur + 1 {
176 fmtd
177 } else {
178 format!("{} (goto: {})", fmtd, goto)
179 }
180 }
181
182 fn visible_byte(b: u8) -> String {
183 use std::ascii::escape_default;
184 let escaped = escape_default(b).collect::<Vec<u8>>();
185 String::from_utf8_lossy(&escaped).into_owned()
186 }
187
188 for (pc, inst) in self.iter().enumerate() {
189 match *inst {
f9f354fc 190 Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
8bb4bdeb
XL
191 Save(ref inst) => {
192 let s = format!("{:04} Save({})", pc, inst.slot);
94b46f34 193 write!(f, "{}", with_goto(pc, inst.goto, s))?;
8bb4bdeb
XL
194 }
195 Split(ref inst) => {
94b46f34 196 write!(
f9f354fc
XL
197 f,
198 "{:04} Split({}, {})",
199 pc, inst.goto1, inst.goto2
200 )?;
8bb4bdeb
XL
201 }
202 EmptyLook(ref inst) => {
203 let s = format!("{:?}", inst.look);
94b46f34 204 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb
XL
205 }
206 Char(ref inst) => {
207 let s = format!("{:?}", inst.c);
94b46f34 208 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb
XL
209 }
210 Ranges(ref inst) => {
f9f354fc
XL
211 let ranges = inst
212 .ranges
8bb4bdeb
XL
213 .iter()
214 .map(|r| format!("{:?}-{:?}", r.0, r.1))
215 .collect::<Vec<String>>()
216 .join(", ");
94b46f34 217 write!(
f9f354fc
XL
218 f,
219 "{:04} {}",
220 pc,
221 with_goto(pc, inst.goto, ranges)
222 )?;
8bb4bdeb
XL
223 }
224 Bytes(ref inst) => {
225 let s = format!(
226 "Bytes({}, {})",
227 visible_byte(inst.start),
f9f354fc
XL
228 visible_byte(inst.end)
229 );
94b46f34 230 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
8bb4bdeb
XL
231 }
232 }
233 if pc == self.start {
94b46f34 234 write!(f, " (start)")?;
8bb4bdeb 235 }
94b46f34 236 write!(f, "\n")?;
8bb4bdeb
XL
237 }
238 Ok(())
239 }
240}
241
242impl<'a> IntoIterator for &'a Program {
243 type Item = &'a Inst;
244 type IntoIter = slice::Iter<'a, Inst>;
f9f354fc
XL
245 fn into_iter(self) -> Self::IntoIter {
246 self.iter()
247 }
8bb4bdeb
XL
248}
249
250/// Inst is an instruction code in a Regex program.
251///
252/// Regrettably, a regex program either contains Unicode codepoint
253/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
254/// A regex program can never contain both.
255///
256/// It would be worth investigating splitting this into two distinct types and
257/// then figuring out how to make the matching engines polymorphic over those
258/// types without sacrificing performance.
259///
260/// Other than the benefit of moving invariants into the type system, another
261/// benefit is the decreased size. If we remove the `Char` and `Ranges`
cdc7bbd5
XL
262/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
263/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
8bb4bdeb
XL
264/// variant.) Given that byte based machines are typically much bigger than
265/// their Unicode analogues (because they can decode UTF-8 directly), this ends
266/// up being a pretty significant savings.
267#[derive(Clone, Debug)]
268pub enum Inst {
269 /// Match indicates that the program has reached a match state.
270 ///
271 /// The number in the match corresponds to the Nth logical regular
272 /// expression in this program. This index is always 0 for normal regex
273 /// programs. Values greater than 0 appear when compiling regex sets, and
274 /// each match instruction gets its own unique value. The value corresponds
275 /// to the Nth regex in the set.
276 Match(usize),
277 /// Save causes the program to save the current location of the input in
278 /// the slot indicated by InstSave.
279 Save(InstSave),
280 /// Split causes the program to diverge to one of two paths in the
281 /// program, preferring goto1 in InstSplit.
282 Split(InstSplit),
283 /// EmptyLook represents a zero-width assertion in a regex program. A
284 /// zero-width assertion does not consume any of the input text.
285 EmptyLook(InstEmptyLook),
286 /// Char requires the regex program to match the character in InstChar at
287 /// the current position in the input.
288 Char(InstChar),
289 /// Ranges requires the regex program to match the character at the current
290 /// position in the input with one of the ranges specified in InstRanges.
291 Ranges(InstRanges),
292 /// Bytes is like Ranges, except it expresses a single byte range. It is
293 /// used in conjunction with Split instructions to implement multi-byte
294 /// character classes.
295 Bytes(InstBytes),
296}
297
298impl Inst {
299 /// Returns true if and only if this is a match instruction.
300 pub fn is_match(&self) -> bool {
301 match *self {
302 Inst::Match(_) => true,
303 _ => false,
304 }
305 }
306}
307
308/// Representation of the Save instruction.
309#[derive(Clone, Debug)]
310pub struct InstSave {
311 /// The next location to execute in the program.
312 pub goto: InstPtr,
313 /// The capture slot (there are two slots for every capture in a regex,
314 /// including the zeroth capture for the entire match).
315 pub slot: usize,
316}
317
318/// Representation of the Split instruction.
319#[derive(Clone, Debug)]
320pub struct InstSplit {
321 /// The first instruction to try. A match resulting from following goto1
322 /// has precedence over a match resulting from following goto2.
323 pub goto1: InstPtr,
324 /// The second instruction to try. A match resulting from following goto1
325 /// has precedence over a match resulting from following goto2.
326 pub goto2: InstPtr,
327}
328
ff7c6d11 329/// Representation of the `EmptyLook` instruction.
8bb4bdeb
XL
330#[derive(Clone, Debug)]
331pub struct InstEmptyLook {
332 /// The next location to execute in the program if this instruction
333 /// succeeds.
334 pub goto: InstPtr,
335 /// The type of zero-width assertion to check.
336 pub look: EmptyLook,
337}
338
339/// The set of zero-width match instructions.
340#[derive(Clone, Copy, Debug, PartialEq, Eq)]
341pub enum EmptyLook {
342 /// Start of line or input.
343 StartLine,
344 /// End of line or input.
345 EndLine,
346 /// Start of input.
347 StartText,
348 /// End of input.
349 EndText,
350 /// Word character on one side and non-word character on other.
351 WordBoundary,
352 /// Word character on both sides or non-word character on both sides.
353 NotWordBoundary,
354 /// ASCII word boundary.
355 WordBoundaryAscii,
356 /// Not ASCII word boundary.
357 NotWordBoundaryAscii,
358}
359
360/// Representation of the Char instruction.
361#[derive(Clone, Debug)]
362pub struct InstChar {
363 /// The next location to execute in the program if this instruction
364 /// succeeds.
365 pub goto: InstPtr,
366 /// The character to test.
367 pub c: char,
368}
369
370/// Representation of the Ranges instruction.
371#[derive(Clone, Debug)]
372pub struct InstRanges {
373 /// The next location to execute in the program if this instruction
374 /// succeeds.
375 pub goto: InstPtr,
376 /// The set of Unicode scalar value ranges to test.
cdc7bbd5 377 pub ranges: Box<[(char, char)]>,
8bb4bdeb
XL
378}
379
380impl InstRanges {
381 /// Tests whether the given input character matches this instruction.
382 pub fn matches(&self, c: Char) -> bool {
383 // This speeds up the `match_class_unicode` benchmark by checking
384 // some common cases quickly without binary search. e.g., Matching
385 // a Unicode class on predominantly ASCII text.
386 for r in self.ranges.iter().take(4) {
387 if c < r.0 {
388 return false;
389 }
390 if c <= r.1 {
391 return true;
392 }
393 }
f9f354fc
XL
394 self.ranges
395 .binary_search_by(|r| {
396 if r.1 < c {
397 Ordering::Less
398 } else if r.0 > c {
399 Ordering::Greater
400 } else {
401 Ordering::Equal
402 }
403 })
404 .is_ok()
8bb4bdeb
XL
405 }
406
407 /// Return the number of distinct characters represented by all of the
408 /// ranges.
409 pub fn num_chars(&self) -> usize {
f9f354fc
XL
410 self.ranges
411 .iter()
8bb4bdeb 412 .map(|&(s, e)| 1 + (e as u32) - (s as u32))
f9f354fc 413 .sum::<u32>() as usize
8bb4bdeb
XL
414 }
415}
416
417/// Representation of the Bytes instruction.
418#[derive(Clone, Debug)]
419pub struct InstBytes {
420 /// The next location to execute in the program if this instruction
421 /// succeeds.
422 pub goto: InstPtr,
423 /// The start (inclusive) of this byte range.
424 pub start: u8,
425 /// The end (inclusive) of this byte range.
426 pub end: u8,
427}
428
429impl InstBytes {
430 /// Returns true if and only if the given byte is in this range.
431 pub fn matches(&self, byte: u8) -> bool {
432 self.start <= byte && byte <= self.end
433 }
434}
cdc7bbd5
XL
435
436#[cfg(test)]
437mod test {
438 #[test]
439 #[cfg(target_pointer_width = "64")]
440 fn test_size_of_inst() {
441 use std::mem::size_of;
442
443 use super::Inst;
444
445 assert_eq!(32, size_of::<Inst>());
446 }
447}