1 use std
::cmp
::Ordering
;
2 use std
::collections
::HashMap
;
10 use literal
::LiteralSearcher
;
12 /// `InstPtr` represents the index of an instruction in a regex program.
13 pub type InstPtr
= usize;
15 /// Program is a sequence of instructions and various facts about thos
19 /// A sequence of instructions that represents an NFA.
21 /// Pointers to each Match instruction in the sequence.
23 /// This is always length 1 unless this program represents a regex set.
24 pub matches
: Vec
<InstPtr
>,
25 /// The ordered sequence of all capture groups extracted from the AST.
26 /// Unnamed groups are `None`.
27 pub captures
: Vec
<Option
<String
>>,
28 /// Pointers to all named capture groups into `captures`.
29 pub capture_name_idx
: Arc
<HashMap
<String
, usize>>,
30 /// A pointer to the start instruction. This can vary depending on how
31 /// the program was compiled. For example, programs for use with the DFA
32 /// engine have a `.*?` inserted at the beginning of unanchored regular
33 /// expressions. The actual starting point of the program is after the
36 /// A set of equivalence classes for discriminating bytes in the compiled
38 pub byte_classes
: Vec
<u8>,
39 /// When true, this program can only match valid UTF-8.
41 /// When true, this program uses byte range instructions instead of Unicode
42 /// range instructions.
44 /// When true, the program is compiled for DFA matching. For example, this
45 /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
48 /// When true, the program matches text in reverse (for use only in the
51 /// Whether the regex must match from the start of the input.
52 pub is_anchored_start
: bool
,
53 /// Whether the regex must match at the end of the input.
54 pub is_anchored_end
: bool
,
55 /// Whether this program contains a Unicode word boundary instruction.
56 pub has_unicode_word_boundary
: bool
,
57 /// A possibly empty machine for very quickly matching prefix literals.
58 pub prefixes
: LiteralSearcher
,
59 /// A limit on the size of the cache that the DFA is allowed to use while
62 /// The cache limit specifies approximately how much space we're willing to
63 /// give to the state cache. Once the state cache exceeds the size, it is
64 /// wiped and all states must be re-computed.
66 /// Note that this value does not impact correctness. It can be set to 0
67 /// and the DFA will run just fine. (It will only ever store exactly one
68 /// state in the cache, and will likely run very slowly, but it will work.)
70 /// Also note that this limit is *per thread of execution*. That is,
71 /// if the same regex is used to search text across multiple threads
72 /// simultaneously, then the DFA cache is not shared. Instead, copies are
74 pub dfa_size_limit
: usize,
78 /// Creates an empty instruction sequence. Fields are given default
80 pub fn new() -> Self {
85 capture_name_idx
: Arc
::new(HashMap
::new()),
87 byte_classes
: vec
![0; 256],
92 is_anchored_start
: false,
93 is_anchored_end
: false,
94 has_unicode_word_boundary
: false,
95 prefixes
: LiteralSearcher
::empty(),
96 dfa_size_limit
: 2 * (1 << 20),
100 /// If pc is an index to a no-op instruction (like Save), then return the
101 /// next pc that is not a no-op instruction.
102 pub fn skip(&self, mut pc
: usize) -> usize {
105 Inst
::Save(ref i
) => pc
= i
.goto
,
111 /// Return true if and only if an execution engine at instruction `pc` will
112 /// always lead to a match.
113 pub fn leads_to_match(&self, pc
: usize) -> bool
{
114 if self.matches
.len() > 1 {
115 // If we have a regex set, then we have more than one ending
116 // state, so leading to one of those states is generally
120 match self[self.skip(pc
)] {
121 Inst
::Match(_
) => true,
126 /// Returns true if the current configuration demands that an implicit
127 /// `.*?` be prepended to the instruction sequence.
128 pub fn needs_dotstar(&self) -> bool
{
129 self.is_dfa
&& !self.is_reverse
&& !self.is_anchored_start
132 /// Returns true if this program uses Byte instructions instead of
133 /// Char/Range instructions.
134 pub fn uses_bytes(&self) -> bool
{
135 self.is_bytes
|| self.is_dfa
138 /// Returns true if this program exclusively matches valid UTF-8 bytes.
140 /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141 pub fn only_utf8(&self) -> bool
{
145 /// Return the approximate heap usage of this instruction sequence in
147 pub fn approximate_size(&self) -> usize {
148 // The only instruction that uses heap space is Ranges (for
149 // Unicode codepoint programs) to store non-overlapping codepoint
150 // ranges. To keep this operation constant time, we ignore them.
151 (self.len() * mem
::size_of
::<Inst
>())
152 + (self.matches
.len() * mem
::size_of
::<InstPtr
>())
153 + (self.captures
.len() * mem
::size_of
::<Option
<String
>>())
154 + (self.capture_name_idx
.len()
155 * (mem
::size_of
::<String
>() + mem
::size_of
::<usize>()))
156 + (self.byte_classes
.len() * mem
::size_of
::<u8>())
157 + self.prefixes
.approximate_size()
161 impl Deref
for Program
{
162 type Target
= [Inst
];
164 #[cfg_attr(feature = "perf-inline", inline(always))]
165 fn deref(&self) -> &Self::Target
{
170 impl fmt
::Debug
for Program
{
171 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
174 fn with_goto(cur
: usize, goto
: usize, fmtd
: String
) -> String
{
178 format
!("{} (goto: {})", fmtd
, goto
)
182 fn visible_byte(b
: u8) -> String
{
183 use std
::ascii
::escape_default
;
184 let escaped
= escape_default(b
).collect
::<Vec
<u8>>();
185 String
::from_utf8_lossy(&escaped
).into_owned()
188 for (pc
, inst
) in self.iter().enumerate() {
190 Match(slot
) => write
!(f
, "{:04} Match({:?})", pc
, slot
)?
,
192 let s
= format
!("{:04} Save({})", pc
, inst
.slot
);
193 write
!(f
, "{}", with_goto(pc
, inst
.goto
, s
))?
;
198 "{:04} Split({}, {})",
199 pc
, inst
.goto1
, inst
.goto2
202 EmptyLook(ref inst
) => {
203 let s
= format
!("{:?}", inst
.look
);
204 write
!(f
, "{:04} {}", pc
, with_goto(pc
, inst
.goto
, s
))?
;
207 let s
= format
!("{:?}", inst
.c
);
208 write
!(f
, "{:04} {}", pc
, with_goto(pc
, inst
.goto
, s
))?
;
210 Ranges(ref inst
) => {
214 .map(|r
| format
!("{:?}-{:?}", r
.0, r
.1))
215 .collect
::<Vec
<String
>>()
221 with_goto(pc
, inst
.goto
, ranges
)
227 visible_byte(inst
.start
),
228 visible_byte(inst
.end
)
230 write
!(f
, "{:04} {}", pc
, with_goto(pc
, inst
.goto
, s
))?
;
233 if pc
== self.start
{
234 write
!(f
, " (start)")?
;
242 impl<'a
> IntoIterator
for &'a Program
{
243 type Item
= &'a Inst
;
244 type IntoIter
= slice
::Iter
<'a
, Inst
>;
245 fn into_iter(self) -> Self::IntoIter
{
250 /// Inst is an instruction code in a Regex program.
252 /// Regrettably, a regex program either contains Unicode codepoint
253 /// instructions (Char and Ranges) or it contains byte instructions (Bytes).
254 /// A regex program can never contain both.
256 /// It would be worth investigating splitting this into two distinct types and
257 /// then figuring out how to make the matching engines polymorphic over those
258 /// types without sacrificing performance.
260 /// Other than the benefit of moving invariants into the type system, another
261 /// benefit is the decreased size. If we remove the `Char` and `Ranges`
262 /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
263 /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
264 /// variant.) Given that byte based machines are typically much bigger than
265 /// their Unicode analogues (because they can decode UTF-8 directly), this ends
266 /// up being a pretty significant savings.
267 #[derive(Clone, Debug)]
269 /// Match indicates that the program has reached a match state.
271 /// The number in the match corresponds to the Nth logical regular
272 /// expression in this program. This index is always 0 for normal regex
273 /// programs. Values greater than 0 appear when compiling regex sets, and
274 /// each match instruction gets its own unique value. The value corresponds
275 /// to the Nth regex in the set.
277 /// Save causes the program to save the current location of the input in
278 /// the slot indicated by InstSave.
280 /// Split causes the program to diverge to one of two paths in the
281 /// program, preferring goto1 in InstSplit.
283 /// EmptyLook represents a zero-width assertion in a regex program. A
284 /// zero-width assertion does not consume any of the input text.
285 EmptyLook(InstEmptyLook
),
286 /// Char requires the regex program to match the character in InstChar at
287 /// the current position in the input.
289 /// Ranges requires the regex program to match the character at the current
290 /// position in the input with one of the ranges specified in InstRanges.
292 /// Bytes is like Ranges, except it expresses a single byte range. It is
293 /// used in conjunction with Split instructions to implement multi-byte
294 /// character classes.
299 /// Returns true if and only if this is a match instruction.
300 pub fn is_match(&self) -> bool
{
302 Inst
::Match(_
) => true,
308 /// Representation of the Save instruction.
309 #[derive(Clone, Debug)]
310 pub struct InstSave
{
311 /// The next location to execute in the program.
313 /// The capture slot (there are two slots for every capture in a regex,
314 /// including the zeroth capture for the entire match).
318 /// Representation of the Split instruction.
319 #[derive(Clone, Debug)]
320 pub struct InstSplit
{
321 /// The first instruction to try. A match resulting from following goto1
322 /// has precedence over a match resulting from following goto2.
324 /// The second instruction to try. A match resulting from following goto1
325 /// has precedence over a match resulting from following goto2.
329 /// Representation of the `EmptyLook` instruction.
330 #[derive(Clone, Debug)]
331 pub struct InstEmptyLook
{
332 /// The next location to execute in the program if this instruction
335 /// The type of zero-width assertion to check.
339 /// The set of zero-width match instructions.
340 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
342 /// Start of line or input.
344 /// End of line or input.
350 /// Word character on one side and non-word character on other.
352 /// Word character on both sides or non-word character on both sides.
354 /// ASCII word boundary.
356 /// Not ASCII word boundary.
357 NotWordBoundaryAscii
,
360 /// Representation of the Char instruction.
361 #[derive(Clone, Debug)]
362 pub struct InstChar
{
363 /// The next location to execute in the program if this instruction
366 /// The character to test.
370 /// Representation of the Ranges instruction.
371 #[derive(Clone, Debug)]
372 pub struct InstRanges
{
373 /// The next location to execute in the program if this instruction
376 /// The set of Unicode scalar value ranges to test.
377 pub ranges
: Vec
<(char, char)>,
381 /// Tests whether the given input character matches this instruction.
382 pub fn matches(&self, c
: Char
) -> bool
{
383 // This speeds up the `match_class_unicode` benchmark by checking
384 // some common cases quickly without binary search. e.g., Matching
385 // a Unicode class on predominantly ASCII text.
386 for r
in self.ranges
.iter().take(4) {
395 .binary_search_by(|r
| {
407 /// Return the number of distinct characters represented by all of the
409 pub fn num_chars(&self) -> usize {
412 .map(|&(s
, e
)| 1 + (e
as u32) - (s
as u32))
413 .sum
::<u32>() as usize
417 /// Representation of the Bytes instruction.
418 #[derive(Clone, Debug)]
419 pub struct InstBytes
{
420 /// The next location to execute in the program if this instruction
423 /// The start (inclusive) of this byte range.
425 /// The end (inclusive) of this byte range.
430 /// Returns true if and only if the given byte is in this range.
431 pub fn matches(&self, byte
: u8) -> bool
{
432 self.start
<= byte
&& byte
<= self.end