]> git.proxmox.com Git - rustc.git/blob - vendor/regex-automata/src/nfa/thompson/pikevm.rs
New upstream version 1.68.2+dfsg1
[rustc.git] / vendor / regex-automata / src / nfa / thompson / pikevm.rs
1 use alloc::{sync::Arc, vec, vec::Vec};
2
3 use crate::{
4 nfa::thompson::{self, Error, State, NFA},
5 util::{
6 id::{PatternID, StateID},
7 matchtypes::MultiMatch,
8 sparse_set::SparseSet,
9 },
10 };
11
12 #[derive(Clone, Copy, Debug, Default)]
13 pub struct Config {
14 anchored: Option<bool>,
15 utf8: Option<bool>,
16 }
17
18 impl Config {
19 /// Return a new default PikeVM configuration.
20 pub fn new() -> Config {
21 Config::default()
22 }
23
24 pub fn anchored(mut self, yes: bool) -> Config {
25 self.anchored = Some(yes);
26 self
27 }
28
29 pub fn utf8(mut self, yes: bool) -> Config {
30 self.utf8 = Some(yes);
31 self
32 }
33
34 pub fn get_anchored(&self) -> bool {
35 self.anchored.unwrap_or(false)
36 }
37
38 pub fn get_utf8(&self) -> bool {
39 self.utf8.unwrap_or(true)
40 }
41
42 pub(crate) fn overwrite(self, o: Config) -> Config {
43 Config {
44 anchored: o.anchored.or(self.anchored),
45 utf8: o.utf8.or(self.utf8),
46 }
47 }
48 }
49
50 /// A builder for a PikeVM.
51 #[derive(Clone, Debug)]
52 pub struct Builder {
53 config: Config,
54 thompson: thompson::Builder,
55 }
56
57 impl Builder {
58 /// Create a new PikeVM builder with its default configuration.
59 pub fn new() -> Builder {
60 Builder {
61 config: Config::default(),
62 thompson: thompson::Builder::new(),
63 }
64 }
65
66 pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
67 self.build_many(&[pattern])
68 }
69
70 pub fn build_many<P: AsRef<str>>(
71 &self,
72 patterns: &[P],
73 ) -> Result<PikeVM, Error> {
74 let nfa = self.thompson.build_many(patterns)?;
75 self.build_from_nfa(Arc::new(nfa))
76 }
77
78 pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
79 // TODO: Check that this is correct.
80 // if !cfg!(all(
81 // feature = "dfa",
82 // feature = "syntax",
83 // feature = "unicode-perl"
84 // )) {
85 if !cfg!(feature = "syntax") {
86 if nfa.has_word_boundary_unicode() {
87 return Err(Error::unicode_word_unavailable());
88 }
89 }
90 Ok(PikeVM { config: self.config, nfa })
91 }
92
93 pub fn configure(&mut self, config: Config) -> &mut Builder {
94 self.config = self.config.overwrite(config);
95 self
96 }
97
98 /// Set the syntax configuration for this builder using
99 /// [`SyntaxConfig`](crate::SyntaxConfig).
100 ///
101 /// This permits setting things like case insensitivity, Unicode and multi
102 /// line mode.
103 ///
104 /// These settings only apply when constructing a PikeVM directly from a
105 /// pattern.
106 pub fn syntax(
107 &mut self,
108 config: crate::util::syntax::SyntaxConfig,
109 ) -> &mut Builder {
110 self.thompson.syntax(config);
111 self
112 }
113
114 /// Set the Thompson NFA configuration for this builder using
115 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
116 ///
117 /// This permits setting things like if additional time should be spent
118 /// shrinking the size of the NFA.
119 ///
120 /// These settings only apply when constructing a PikeVM directly from a
121 /// pattern.
122 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
123 self.thompson.configure(config);
124 self
125 }
126 }
127
128 #[derive(Clone, Debug)]
129 pub struct PikeVM {
130 config: Config,
131 nfa: Arc<NFA>,
132 }
133
134 impl PikeVM {
135 pub fn new(pattern: &str) -> Result<PikeVM, Error> {
136 PikeVM::builder().build(pattern)
137 }
138
139 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
140 PikeVM::builder().build_many(patterns)
141 }
142
143 pub fn config() -> Config {
144 Config::new()
145 }
146
147 pub fn builder() -> Builder {
148 Builder::new()
149 }
150
151 pub fn create_cache(&self) -> Cache {
152 Cache::new(self.nfa())
153 }
154
155 pub fn create_captures(&self) -> Captures {
156 Captures::new(self.nfa())
157 }
158
159 pub fn nfa(&self) -> &Arc<NFA> {
160 &self.nfa
161 }
162
163 pub fn find_leftmost_iter<'r, 'c, 't>(
164 &'r self,
165 cache: &'c mut Cache,
166 haystack: &'t [u8],
167 ) -> FindLeftmostMatches<'r, 'c, 't> {
168 FindLeftmostMatches::new(self, cache, haystack)
169 }
170
171 // BREADCRUMBS:
172 //
173 // 1) Don't forget about prefilters.
174 //
175 // 2) Consider the case of using a PikeVM with an NFA that has Capture
176 // states, but where we don't want to track capturing groups (other than
177 // group 0). This potentially saves a lot of copying around and what not. I
178 // believe the current regex crate does this, for example. The interesting
179 // bit here is how to handle the case of multiple patterns...
180 //
181 // 3) Permit the caller to specify a pattern ID to run an anchored-only
182 // search on.
183 //
184 // 4) How to do overlapping? The way multi-regex support works in the regex
185 // crate currently is to run the PikeVM until either we reach the end of
186 // the haystack or when we know all regexes have matched. The latter case
187 // is probably quite rare, so the common case is likely that we're always
188 // searching the entire input. The question is: can we emulate that with
189 // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
190 // all we need to do is provide an overlapping API on the PikeVM that
191 // roughly matches the ones we provide on DFAs. For those APIs, the only
192 // thing they need over non-overlapping APIs is "caller state." For DFAs,
193 // the caller state is simple: it contains the last state visited and the
194 // last match reported. For the PikeVM (and NFAs in general), the "last
195 // state" is actually a *set* of NFA states. So I think what happens here
196 // is that we can just force the `Cache` to subsume this role. We'll still
197 // need some additional state to track the last match reported though.
198 // Because when two or more patterns match at the same location, we need a
199 // way to know to iterate over them. Although maybe it's not match index we
200 // need, but the state index of the last NFA state processed in the cache.
201 // Then we just pick up where we left off. There might be another match
202 // state, in which case, we report it.
203
204 pub fn find_leftmost_at(
205 &self,
206 cache: &mut Cache,
207 haystack: &[u8],
208 start: usize,
209 end: usize,
210 caps: &mut Captures,
211 ) -> Option<MultiMatch> {
212 let anchored =
213 self.config.get_anchored() || self.nfa.is_always_start_anchored();
214 let mut at = start;
215 let mut matched_pid = None;
216 cache.clear();
217 'LOOP: loop {
218 if cache.clist.set.is_empty() {
219 if matched_pid.is_some() || (anchored && at > start) {
220 break 'LOOP;
221 }
222 // TODO: prefilter
223 }
224 if (!anchored && matched_pid.is_none())
225 || cache.clist.set.is_empty()
226 {
227 self.epsilon_closure(
228 &mut cache.clist,
229 &mut caps.slots,
230 &mut cache.stack,
231 self.nfa.start_anchored(),
232 haystack,
233 at,
234 );
235 }
236 for i in 0..cache.clist.set.len() {
237 let sid = cache.clist.set.get(i);
238 let pid = match self.step(
239 &mut cache.nlist,
240 &mut caps.slots,
241 cache.clist.caps(sid),
242 &mut cache.stack,
243 sid,
244 haystack,
245 at,
246 ) {
247 None => continue,
248 Some(pid) => pid,
249 };
250 matched_pid = Some(pid);
251 break;
252 }
253 if at >= end {
254 break;
255 }
256 at += 1;
257 cache.swap();
258 cache.nlist.set.clear();
259 }
260 matched_pid.map(|pid| {
261 let slots = self.nfa.pattern_slots(pid);
262 let (start, end) = (slots.start, slots.start + 1);
263 MultiMatch::new(
264 pid,
265 caps.slots[start].unwrap(),
266 caps.slots[end].unwrap(),
267 )
268 })
269 }
270
271 #[inline(always)]
272 fn step(
273 &self,
274 nlist: &mut Threads,
275 slots: &mut [Slot],
276 thread_caps: &mut [Slot],
277 stack: &mut Vec<FollowEpsilon>,
278 sid: StateID,
279 haystack: &[u8],
280 at: usize,
281 ) -> Option<PatternID> {
282 match *self.nfa.state(sid) {
283 State::Fail
284 | State::Look { .. }
285 | State::Union { .. }
286 | State::Capture { .. } => None,
287 State::Range { ref range } => {
288 if range.matches(haystack, at) {
289 self.epsilon_closure(
290 nlist,
291 thread_caps,
292 stack,
293 range.next,
294 haystack,
295 at + 1,
296 );
297 }
298 None
299 }
300 State::Sparse(ref sparse) => {
301 if let Some(next) = sparse.matches(haystack, at) {
302 self.epsilon_closure(
303 nlist,
304 thread_caps,
305 stack,
306 next,
307 haystack,
308 at + 1,
309 );
310 }
311 None
312 }
313 State::Match { id } => {
314 slots.copy_from_slice(thread_caps);
315 Some(id)
316 }
317 }
318 }
319
320 #[inline(always)]
321 fn epsilon_closure(
322 &self,
323 nlist: &mut Threads,
324 thread_caps: &mut [Slot],
325 stack: &mut Vec<FollowEpsilon>,
326 sid: StateID,
327 haystack: &[u8],
328 at: usize,
329 ) {
330 stack.push(FollowEpsilon::StateID(sid));
331 while let Some(frame) = stack.pop() {
332 match frame {
333 FollowEpsilon::StateID(sid) => {
334 self.epsilon_closure_step(
335 nlist,
336 thread_caps,
337 stack,
338 sid,
339 haystack,
340 at,
341 );
342 }
343 FollowEpsilon::Capture { slot, pos } => {
344 thread_caps[slot] = pos;
345 }
346 }
347 }
348 }
349
350 #[inline(always)]
351 fn epsilon_closure_step(
352 &self,
353 nlist: &mut Threads,
354 thread_caps: &mut [Slot],
355 stack: &mut Vec<FollowEpsilon>,
356 mut sid: StateID,
357 haystack: &[u8],
358 at: usize,
359 ) {
360 loop {
361 if !nlist.set.insert(sid) {
362 return;
363 }
364 match *self.nfa.state(sid) {
365 State::Fail
366 | State::Range { .. }
367 | State::Sparse { .. }
368 | State::Match { .. } => {
369 let t = &mut nlist.caps(sid);
370 t.copy_from_slice(thread_caps);
371 return;
372 }
373 State::Look { look, next } => {
374 if !look.matches(haystack, at) {
375 return;
376 }
377 sid = next;
378 }
379 State::Union { ref alternates } => {
380 sid = match alternates.get(0) {
381 None => return,
382 Some(&sid) => sid,
383 };
384 stack.extend(
385 alternates[1..]
386 .iter()
387 .copied()
388 .rev()
389 .map(FollowEpsilon::StateID),
390 );
391 }
392 State::Capture { next, slot } => {
393 if slot < thread_caps.len() {
394 stack.push(FollowEpsilon::Capture {
395 slot,
396 pos: thread_caps[slot],
397 });
398 thread_caps[slot] = Some(at);
399 }
400 sid = next;
401 }
402 }
403 }
404 }
405 }
406
407 /// An iterator over all non-overlapping leftmost matches for a particular
408 /// infallible search.
409 ///
410 /// The iterator yields a [`MultiMatch`] value until no more matches could be
411 /// found. If the underlying search returns an error, then this panics.
412 ///
413 /// The lifetime variables are as follows:
414 ///
415 /// * `'r` is the lifetime of the regular expression itself.
416 /// * `'c` is the lifetime of the mutable cache used during search.
417 /// * `'t` is the lifetime of the text being searched.
418 #[derive(Debug)]
419 pub struct FindLeftmostMatches<'r, 'c, 't> {
420 vm: &'r PikeVM,
421 cache: &'c mut Cache,
422 // scanner: Option<prefilter::Scanner<'r>>,
423 text: &'t [u8],
424 last_end: usize,
425 last_match: Option<usize>,
426 }
427
428 impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
429 fn new(
430 vm: &'r PikeVM,
431 cache: &'c mut Cache,
432 text: &'t [u8],
433 ) -> FindLeftmostMatches<'r, 'c, 't> {
434 FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
435 }
436 }
437
438 impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
439 // type Item = Captures;
440 type Item = MultiMatch;
441
442 // fn next(&mut self) -> Option<Captures> {
443 fn next(&mut self) -> Option<MultiMatch> {
444 if self.last_end > self.text.len() {
445 return None;
446 }
447 let mut caps = self.vm.create_captures();
448 let m = self.vm.find_leftmost_at(
449 self.cache,
450 self.text,
451 self.last_end,
452 self.text.len(),
453 &mut caps,
454 )?;
455 if m.is_empty() {
456 // This is an empty match. To ensure we make progress, start
457 // the next search at the smallest possible starting position
458 // of the next match following this one.
459 self.last_end = if self.vm.config.get_utf8() {
460 crate::util::next_utf8(self.text, m.end())
461 } else {
462 m.end() + 1
463 };
464 // Don't accept empty matches immediately following a match.
465 // Just move on to the next match.
466 if Some(m.end()) == self.last_match {
467 return self.next();
468 }
469 } else {
470 self.last_end = m.end();
471 }
472 self.last_match = Some(m.end());
473 Some(m)
474 }
475 }
476
477 #[derive(Clone, Debug)]
478 pub struct Captures {
479 slots: Vec<Slot>,
480 }
481
482 impl Captures {
483 pub fn new(nfa: &NFA) -> Captures {
484 Captures { slots: vec![None; nfa.capture_slot_len()] }
485 }
486 }
487
488 #[derive(Clone, Debug)]
489 pub struct Cache {
490 stack: Vec<FollowEpsilon>,
491 clist: Threads,
492 nlist: Threads,
493 }
494
495 type Slot = Option<usize>;
496
497 #[derive(Clone, Debug)]
498 struct Threads {
499 set: SparseSet,
500 caps: Vec<Slot>,
501 slots_per_thread: usize,
502 }
503
504 #[derive(Clone, Debug)]
505 enum FollowEpsilon {
506 StateID(StateID),
507 Capture { slot: usize, pos: Slot },
508 }
509
510 impl Cache {
511 pub fn new(nfa: &NFA) -> Cache {
512 Cache {
513 stack: vec![],
514 clist: Threads::new(nfa),
515 nlist: Threads::new(nfa),
516 }
517 }
518
519 fn clear(&mut self) {
520 self.stack.clear();
521 self.clist.set.clear();
522 self.nlist.set.clear();
523 }
524
525 fn swap(&mut self) {
526 core::mem::swap(&mut self.clist, &mut self.nlist);
527 }
528 }
529
530 impl Threads {
531 fn new(nfa: &NFA) -> Threads {
532 let mut threads = Threads {
533 set: SparseSet::new(0),
534 caps: vec![],
535 slots_per_thread: 0,
536 };
537 threads.resize(nfa);
538 threads
539 }
540
541 fn resize(&mut self, nfa: &NFA) {
542 if nfa.states().len() == self.set.capacity() {
543 return;
544 }
545 self.slots_per_thread = nfa.capture_slot_len();
546 self.set.resize(nfa.states().len());
547 self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
548 }
549
550 fn caps(&mut self, sid: StateID) -> &mut [Slot] {
551 let i = sid.as_usize() * self.slots_per_thread;
552 &mut self.caps[i..i + self.slots_per_thread]
553 }
554 }