1 use alloc
::{sync::Arc, vec, vec::Vec}
;
4 nfa
::thompson
::{self, Error, State, NFA}
,
6 id
::{PatternID, StateID}
,
7 matchtypes
::MultiMatch
,
12 #[derive(Clone, Copy, Debug, Default)]
14 anchored
: Option
<bool
>,
19 /// Return a new default PikeVM configuration.
20 pub fn new() -> Config
{
24 pub fn anchored(mut self, yes
: bool
) -> Config
{
25 self.anchored
= Some(yes
);
29 pub fn utf8(mut self, yes
: bool
) -> Config
{
30 self.utf8
= Some(yes
);
34 pub fn get_anchored(&self) -> bool
{
35 self.anchored
.unwrap_or(false)
38 pub fn get_utf8(&self) -> bool
{
39 self.utf8
.unwrap_or(true)
42 pub(crate) fn overwrite(self, o
: Config
) -> Config
{
44 anchored
: o
.anchored
.or(self.anchored
),
45 utf8
: o
.utf8
.or(self.utf8
),
50 /// A builder for a PikeVM.
51 #[derive(Clone, Debug)]
54 thompson
: thompson
::Builder
,
58 /// Create a new PikeVM builder with its default configuration.
59 pub fn new() -> Builder
{
61 config
: Config
::default(),
62 thompson
: thompson
::Builder
::new(),
66 pub fn build(&self, pattern
: &str) -> Result
<PikeVM
, Error
> {
67 self.build_many(&[pattern
])
70 pub fn build_many
<P
: AsRef
<str>>(
73 ) -> Result
<PikeVM
, Error
> {
74 let nfa
= self.thompson
.build_many(patterns
)?
;
75 self.build_from_nfa(Arc
::new(nfa
))
78 pub fn build_from_nfa(&self, nfa
: Arc
<NFA
>) -> Result
<PikeVM
, Error
> {
79 // TODO: Check that this is correct.
82 // feature = "syntax",
83 // feature = "unicode-perl"
85 if !cfg
!(feature
= "syntax") {
86 if nfa
.has_word_boundary_unicode() {
87 return Err(Error
::unicode_word_unavailable());
90 Ok(PikeVM { config: self.config, nfa }
)
93 pub fn configure(&mut self, config
: Config
) -> &mut Builder
{
94 self.config
= self.config
.overwrite(config
);
98 /// Set the syntax configuration for this builder using
99 /// [`SyntaxConfig`](crate::SyntaxConfig).
101 /// This permits setting things like case insensitivity, Unicode and multi
104 /// These settings only apply when constructing a PikeVM directly from a
108 config
: crate::util
::syntax
::SyntaxConfig
,
110 self.thompson
.syntax(config
);
114 /// Set the Thompson NFA configuration for this builder using
115 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
117 /// This permits setting things like if additional time should be spent
118 /// shrinking the size of the NFA.
120 /// These settings only apply when constructing a PikeVM directly from a
122 pub fn thompson(&mut self, config
: thompson
::Config
) -> &mut Builder
{
123 self.thompson
.configure(config
);
128 #[derive(Clone, Debug)]
135 pub fn new(pattern
: &str) -> Result
<PikeVM
, Error
> {
136 PikeVM
::builder().build(pattern
)
139 pub fn new_many
<P
: AsRef
<str>>(patterns
: &[P
]) -> Result
<PikeVM
, Error
> {
140 PikeVM
::builder().build_many(patterns
)
143 pub fn config() -> Config
{
147 pub fn builder() -> Builder
{
151 pub fn create_cache(&self) -> Cache
{
152 Cache
::new(self.nfa())
155 pub fn create_captures(&self) -> Captures
{
156 Captures
::new(self.nfa())
159 pub fn nfa(&self) -> &Arc
<NFA
> {
163 pub fn find_leftmost_iter
<'r
, 'c
, 't
>(
165 cache
: &'c
mut Cache
,
167 ) -> FindLeftmostMatches
<'r
, 'c
, 't
> {
168 FindLeftmostMatches
::new(self, cache
, haystack
)
173 // 1) Don't forget about prefilters.
175 // 2) Consider the case of using a PikeVM with an NFA that has Capture
176 // states, but where we don't want to track capturing groups (other than
177 // group 0). This potentially saves a lot of copying around and what not. I
178 // believe the current regex crate does this, for example. The interesting
179 // bit here is how to handle the case of multiple patterns...
181 // 3) Permit the caller to specify a pattern ID to run an anchored-only
184 // 4) How to do overlapping? The way multi-regex support works in the regex
185 // crate currently is to run the PikeVM until either we reach the end of
186 // the haystack or when we know all regexes have matched. The latter case
187 // is probably quite rare, so the common case is likely that we're always
188 // searching the entire input. The question is: can we emulate that with
189 // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
190 // all we need to do is provide an overlapping API on the PikeVM that
191 // roughly matches the ones we provide on DFAs. For those APIs, the only
192 // thing they need over non-overlapping APIs is "caller state." For DFAs,
193 // the caller state is simple: it contains the last state visited and the
194 // last match reported. For the PikeVM (and NFAs in general), the "last
195 // state" is actually a *set* of NFA states. So I think what happens here
196 // is that we can just force the `Cache` to subsume this role. We'll still
197 // need some additional state to track the last match reported though.
198 // Because when two or more patterns match at the same location, we need a
199 // way to know to iterate over them. Although maybe it's not match index we
200 // need, but the state index of the last NFA state processed in the cache.
201 // Then we just pick up where we left off. There might be another match
202 // state, in which case, we report it.
204 pub fn find_leftmost_at(
211 ) -> Option
<MultiMatch
> {
213 self.config
.get_anchored() || self.nfa
.is_always_start_anchored();
215 let mut matched_pid
= None
;
218 if cache
.clist
.set
.is_empty() {
219 if matched_pid
.is_some() || (anchored
&& at
> start
) {
224 if (!anchored
&& matched_pid
.is_none())
225 || cache
.clist
.set
.is_empty()
227 self.epsilon_closure(
231 self.nfa
.start_anchored(),
236 for i
in 0..cache
.clist
.set
.len() {
237 let sid
= cache
.clist
.set
.get(i
);
238 let pid
= match self.step(
241 cache
.clist
.caps(sid
),
250 matched_pid
= Some(pid
);
258 cache
.nlist
.set
.clear();
260 matched_pid
.map(|pid
| {
261 let slots
= self.nfa
.pattern_slots(pid
);
262 let (start
, end
) = (slots
.start
, slots
.start
+ 1);
265 caps
.slots
[start
].unwrap(),
266 caps
.slots
[end
].unwrap(),
276 thread_caps
: &mut [Slot
],
277 stack
: &mut Vec
<FollowEpsilon
>,
281 ) -> Option
<PatternID
> {
282 match *self.nfa
.state(sid
) {
285 | State
::Union { .. }
286 | State
::Capture { .. }
=> None
,
287 State
::Range { ref range }
=> {
288 if range
.matches(haystack
, at
) {
289 self.epsilon_closure(
300 State
::Sparse(ref sparse
) => {
301 if let Some(next
) = sparse
.matches(haystack
, at
) {
302 self.epsilon_closure(
313 State
::Match { id }
=> {
314 slots
.copy_from_slice(thread_caps
);
324 thread_caps
: &mut [Slot
],
325 stack
: &mut Vec
<FollowEpsilon
>,
330 stack
.push(FollowEpsilon
::StateID(sid
));
331 while let Some(frame
) = stack
.pop() {
333 FollowEpsilon
::StateID(sid
) => {
334 self.epsilon_closure_step(
343 FollowEpsilon
::Capture { slot, pos }
=> {
344 thread_caps
[slot
] = pos
;
351 fn epsilon_closure_step(
354 thread_caps
: &mut [Slot
],
355 stack
: &mut Vec
<FollowEpsilon
>,
361 if !nlist
.set
.insert(sid
) {
364 match *self.nfa
.state(sid
) {
366 | State
::Range { .. }
367 | State
::Sparse { .. }
368 | State
::Match { .. }
=> {
369 let t
= &mut nlist
.caps(sid
);
370 t
.copy_from_slice(thread_caps
);
373 State
::Look { look, next }
=> {
374 if !look
.matches(haystack
, at
) {
379 State
::Union { ref alternates }
=> {
380 sid
= match alternates
.get(0) {
389 .map(FollowEpsilon
::StateID
),
392 State
::Capture { next, slot }
=> {
393 if slot
< thread_caps
.len() {
394 stack
.push(FollowEpsilon
::Capture
{
396 pos
: thread_caps
[slot
],
398 thread_caps
[slot
] = Some(at
);
407 /// An iterator over all non-overlapping leftmost matches for a particular
408 /// infallible search.
410 /// The iterator yields a [`MultiMatch`] value until no more matches could be
411 /// found. If the underlying search returns an error, then this panics.
413 /// The lifetime variables are as follows:
415 /// * `'r` is the lifetime of the regular expression itself.
416 /// * `'c` is the lifetime of the mutable cache used during search.
417 /// * `'t` is the lifetime of the text being searched.
419 pub struct FindLeftmostMatches
<'r
, 'c
, 't
> {
421 cache
: &'c
mut Cache
,
422 // scanner: Option<prefilter::Scanner<'r>>,
425 last_match
: Option
<usize>,
428 impl<'r
, 'c
, 't
> FindLeftmostMatches
<'r
, 'c
, 't
> {
431 cache
: &'c
mut Cache
,
433 ) -> FindLeftmostMatches
<'r
, 'c
, 't
> {
434 FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
438 impl<'r
, 'c
, 't
> Iterator
for FindLeftmostMatches
<'r
, 'c
, 't
> {
439 // type Item = Captures;
440 type Item
= MultiMatch
;
442 // fn next(&mut self) -> Option<Captures> {
443 fn next(&mut self) -> Option
<MultiMatch
> {
444 if self.last_end
> self.text
.len() {
447 let mut caps
= self.vm
.create_captures();
448 let m
= self.vm
.find_leftmost_at(
456 // This is an empty match. To ensure we make progress, start
457 // the next search at the smallest possible starting position
458 // of the next match following this one.
459 self.last_end
= if self.vm
.config
.get_utf8() {
460 crate::util
::next_utf8(self.text
, m
.end())
464 // Don't accept empty matches immediately following a match.
465 // Just move on to the next match.
466 if Some(m
.end()) == self.last_match
{
470 self.last_end
= m
.end();
472 self.last_match
= Some(m
.end());
477 #[derive(Clone, Debug)]
478 pub struct Captures
{
483 pub fn new(nfa
: &NFA
) -> Captures
{
484 Captures { slots: vec![None; nfa.capture_slot_len()] }
488 #[derive(Clone, Debug)]
490 stack
: Vec
<FollowEpsilon
>,
495 type Slot
= Option
<usize>;
497 #[derive(Clone, Debug)]
501 slots_per_thread
: usize,
504 #[derive(Clone, Debug)]
507 Capture { slot: usize, pos: Slot }
,
511 pub fn new(nfa
: &NFA
) -> Cache
{
514 clist
: Threads
::new(nfa
),
515 nlist
: Threads
::new(nfa
),
519 fn clear(&mut self) {
521 self.clist
.set
.clear();
522 self.nlist
.set
.clear();
526 core
::mem
::swap(&mut self.clist
, &mut self.nlist
);
531 fn new(nfa
: &NFA
) -> Threads
{
532 let mut threads
= Threads
{
533 set
: SparseSet
::new(0),
541 fn resize(&mut self, nfa
: &NFA
) {
542 if nfa
.states().len() == self.set
.capacity() {
545 self.slots_per_thread
= nfa
.capture_slot_len();
546 self.set
.resize(nfa
.states().len());
547 self.caps
.resize(self.slots_per_thread
* nfa
.states().len(), None
);
550 fn caps(&mut self, sid
: StateID
) -> &mut [Slot
] {
551 let i
= sid
.as_usize() * self.slots_per_thread
;
552 &mut self.caps
[i
..i
+ self.slots_per_thread
]