vendor/regex-automata/src/nfa/thompson/pikevm.rs

   1 use alloc::{sync::Arc, vec, vec::Vec};
   2
   3 use crate::{
   4     nfa::thompson::{self, Error, State, NFA},
   5     util::{
   6         id::{PatternID, StateID},
   7         matchtypes::MultiMatch,
   8         sparse_set::SparseSet,
   9     },
  10 };
  11
  12 #[derive(Clone, Copy, Debug, Default)]
  13 pub struct Config {
  14     anchored: Option<bool>,
  15     utf8: Option<bool>,
  16 }
  17
  18 impl Config {
  19     /// Return a new default PikeVM configuration.
  20     pub fn new() -> Config {
  21         Config::default()
  22     }
  23
  24     pub fn anchored(mut self, yes: bool) -> Config {
  25         self.anchored = Some(yes);
  26         self
  27     }
  28
  29     pub fn utf8(mut self, yes: bool) -> Config {
  30         self.utf8 = Some(yes);
  31         self
  32     }
  33
  34     pub fn get_anchored(&self) -> bool {
  35         self.anchored.unwrap_or(false)
  36     }
  37
  38     pub fn get_utf8(&self) -> bool {
  39         self.utf8.unwrap_or(true)
  40     }
  41
  42     pub(crate) fn overwrite(self, o: Config) -> Config {
  43         Config {
  44             anchored: o.anchored.or(self.anchored),
  45             utf8: o.utf8.or(self.utf8),
  46         }
  47     }
  48 }
  49
  50 /// A builder for a PikeVM.
  51 #[derive(Clone, Debug)]
  52 pub struct Builder {
  53     config: Config,
  54     thompson: thompson::Builder,
  55 }
  56
  57 impl Builder {
  58     /// Create a new PikeVM builder with its default configuration.
  59     pub fn new() -> Builder {
  60         Builder {
  61             config: Config::default(),
  62             thompson: thompson::Builder::new(),
  63         }
  64     }
  65
  66     pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
  67         self.build_many(&[pattern])
  68     }
  69
  70     pub fn build_many<P: AsRef<str>>(
  71         &self,
  72         patterns: &[P],
  73     ) -> Result<PikeVM, Error> {
  74         let nfa = self.thompson.build_many(patterns)?;
  75         self.build_from_nfa(Arc::new(nfa))
  76     }
  77
  78     pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
  79         // TODO: Check that this is correct.
  80         // if !cfg!(all(
  81         // feature = "dfa",
  82         // feature = "syntax",
  83         // feature = "unicode-perl"
  84         // )) {
  85         if !cfg!(feature = "syntax") {
  86             if nfa.has_word_boundary_unicode() {
  87                 return Err(Error::unicode_word_unavailable());
  88             }
  89         }
  90         Ok(PikeVM { config: self.config, nfa })
  91     }
  92
  93     pub fn configure(&mut self, config: Config) -> &mut Builder {
  94         self.config = self.config.overwrite(config);
  95         self
  96     }
  97
  98     /// Set the syntax configuration for this builder using
  99     /// [`SyntaxConfig`](crate::SyntaxConfig).
 100     ///
 101     /// This permits setting things like case insensitivity, Unicode and multi
 102     /// line mode.
 103     ///
 104     /// These settings only apply when constructing a PikeVM directly from a
 105     /// pattern.
 106     pub fn syntax(
 107         &mut self,
 108         config: crate::util::syntax::SyntaxConfig,
 109     ) -> &mut Builder {
 110         self.thompson.syntax(config);
 111         self
 112     }
 113
 114     /// Set the Thompson NFA configuration for this builder using
 115     /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
 116     ///
 117     /// This permits setting things like if additional time should be spent
 118     /// shrinking the size of the NFA.
 119     ///
 120     /// These settings only apply when constructing a PikeVM directly from a
 121     /// pattern.
 122     pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
 123         self.thompson.configure(config);
 124         self
 125     }
 126 }
 127
 128 #[derive(Clone, Debug)]
 129 pub struct PikeVM {
 130     config: Config,
 131     nfa: Arc<NFA>,
 132 }
 133
 134 impl PikeVM {
 135     pub fn new(pattern: &str) -> Result<PikeVM, Error> {
 136         PikeVM::builder().build(pattern)
 137     }
 138
 139     pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
 140         PikeVM::builder().build_many(patterns)
 141     }
 142
 143     pub fn config() -> Config {
 144         Config::new()
 145     }
 146
 147     pub fn builder() -> Builder {
 148         Builder::new()
 149     }
 150
 151     pub fn create_cache(&self) -> Cache {
 152         Cache::new(self.nfa())
 153     }
 154
 155     pub fn create_captures(&self) -> Captures {
 156         Captures::new(self.nfa())
 157     }
 158
 159     pub fn nfa(&self) -> &Arc<NFA> {
 160         &self.nfa
 161     }
 162
 163     pub fn find_leftmost_iter<'r, 'c, 't>(
 164         &'r self,
 165         cache: &'c mut Cache,
 166         haystack: &'t [u8],
 167     ) -> FindLeftmostMatches<'r, 'c, 't> {
 168         FindLeftmostMatches::new(self, cache, haystack)
 169     }
 170
 171     // BREADCRUMBS:
 172     //
 173     // 1) Don't forget about prefilters.
 174     //
 175     // 2) Consider the case of using a PikeVM with an NFA that has Capture
 176     // states, but where we don't want to track capturing groups (other than
 177     // group 0). This potentially saves a lot of copying around and what not. I
 178     // believe the current regex crate does this, for example. The interesting
 179     // bit here is how to handle the case of multiple patterns...
 180     //
 181     // 3) Permit the caller to specify a pattern ID to run an anchored-only
 182     // search on.
 183     //
 184     // 4) How to do overlapping? The way multi-regex support works in the regex
 185     // crate currently is to run the PikeVM until either we reach the end of
 186     // the haystack or when we know all regexes have matched. The latter case
 187     // is probably quite rare, so the common case is likely that we're always
 188     // searching the entire input. The question is: can we emulate that with
 189     // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
 190     // all we need to do is provide an overlapping API on the PikeVM that
 191     // roughly matches the ones we provide on DFAs. For those APIs, the only
 192     // thing they need over non-overlapping APIs is "caller state." For DFAs,
 193     // the caller state is simple: it contains the last state visited and the
 194     // last match reported. For the PikeVM (and NFAs in general), the "last
 195     // state" is actually a *set* of NFA states. So I think what happens here
 196     // is that we can just force the `Cache` to subsume this role. We'll still
 197     // need some additional state to track the last match reported though.
 198     // Because when two or more patterns match at the same location, we need a
 199     // way to know to iterate over them. Although maybe it's not match index we
 200     // need, but the state index of the last NFA state processed in the cache.
 201     // Then we just pick up where we left off. There might be another match
 202     // state, in which case, we report it.
 203
 204     pub fn find_leftmost_at(
 205         &self,
 206         cache: &mut Cache,
 207         haystack: &[u8],
 208         start: usize,
 209         end: usize,
 210         caps: &mut Captures,
 211     ) -> Option<MultiMatch> {
 212         let anchored =
 213             self.config.get_anchored() || self.nfa.is_always_start_anchored();
 214         let mut at = start;
 215         let mut matched_pid = None;
 216         cache.clear();
 217         'LOOP: loop {
 218             if cache.clist.set.is_empty() {
 219                 if matched_pid.is_some() || (anchored && at > start) {
 220                     break 'LOOP;
 221                 }
 222                 // TODO: prefilter
 223             }
 224             if (!anchored && matched_pid.is_none())
 225                 || cache.clist.set.is_empty()
 226             {
 227                 self.epsilon_closure(
 228                     &mut cache.clist,
 229                     &mut caps.slots,
 230                     &mut cache.stack,
 231                     self.nfa.start_anchored(),
 232                     haystack,
 233                     at,
 234                 );
 235             }
 236             for i in 0..cache.clist.set.len() {
 237                 let sid = cache.clist.set.get(i);
 238                 let pid = match self.step(
 239                     &mut cache.nlist,
 240                     &mut caps.slots,
 241                     cache.clist.caps(sid),
 242                     &mut cache.stack,
 243                     sid,
 244                     haystack,
 245                     at,
 246                 ) {
 247                     None => continue,
 248                     Some(pid) => pid,
 249                 };
 250                 matched_pid = Some(pid);
 251                 break;
 252             }
 253             if at >= end {
 254                 break;
 255             }
 256             at += 1;
 257             cache.swap();
 258             cache.nlist.set.clear();
 259         }
 260         matched_pid.map(|pid| {
 261             let slots = self.nfa.pattern_slots(pid);
 262             let (start, end) = (slots.start, slots.start + 1);
 263             MultiMatch::new(
 264                 pid,
 265                 caps.slots[start].unwrap(),
 266                 caps.slots[end].unwrap(),
 267             )
 268         })
 269     }
 270
 271     #[inline(always)]
 272     fn step(
 273         &self,
 274         nlist: &mut Threads,
 275         slots: &mut [Slot],
 276         thread_caps: &mut [Slot],
 277         stack: &mut Vec<FollowEpsilon>,
 278         sid: StateID,
 279         haystack: &[u8],
 280         at: usize,
 281     ) -> Option<PatternID> {
 282         match *self.nfa.state(sid) {
 283             State::Fail
 284             | State::Look { .. }
 285             | State::Union { .. }
 286             | State::Capture { .. } => None,
 287             State::Range { ref range } => {
 288                 if range.matches(haystack, at) {
 289                     self.epsilon_closure(
 290                         nlist,
 291                         thread_caps,
 292                         stack,
 293                         range.next,
 294                         haystack,
 295                         at + 1,
 296                     );
 297                 }
 298                 None
 299             }
 300             State::Sparse(ref sparse) => {
 301                 if let Some(next) = sparse.matches(haystack, at) {
 302                     self.epsilon_closure(
 303                         nlist,
 304                         thread_caps,
 305                         stack,
 306                         next,
 307                         haystack,
 308                         at + 1,
 309                     );
 310                 }
 311                 None
 312             }
 313             State::Match { id } => {
 314                 slots.copy_from_slice(thread_caps);
 315                 Some(id)
 316             }
 317         }
 318     }
 319
 320     #[inline(always)]
 321     fn epsilon_closure(
 322         &self,
 323         nlist: &mut Threads,
 324         thread_caps: &mut [Slot],
 325         stack: &mut Vec<FollowEpsilon>,
 326         sid: StateID,
 327         haystack: &[u8],
 328         at: usize,
 329     ) {
 330         stack.push(FollowEpsilon::StateID(sid));
 331         while let Some(frame) = stack.pop() {
 332             match frame {
 333                 FollowEpsilon::StateID(sid) => {
 334                     self.epsilon_closure_step(
 335                         nlist,
 336                         thread_caps,
 337                         stack,
 338                         sid,
 339                         haystack,
 340                         at,
 341                     );
 342                 }
 343                 FollowEpsilon::Capture { slot, pos } => {
 344                     thread_caps[slot] = pos;
 345                 }
 346             }
 347         }
 348     }
 349
 350     #[inline(always)]
 351     fn epsilon_closure_step(
 352         &self,
 353         nlist: &mut Threads,
 354         thread_caps: &mut [Slot],
 355         stack: &mut Vec<FollowEpsilon>,
 356         mut sid: StateID,
 357         haystack: &[u8],
 358         at: usize,
 359     ) {
 360         loop {
 361             if !nlist.set.insert(sid) {
 362                 return;
 363             }
 364             match *self.nfa.state(sid) {
 365                 State::Fail
 366                 | State::Range { .. }
 367                 | State::Sparse { .. }
 368                 | State::Match { .. } => {
 369                     let t = &mut nlist.caps(sid);
 370                     t.copy_from_slice(thread_caps);
 371                     return;
 372                 }
 373                 State::Look { look, next } => {
 374                     if !look.matches(haystack, at) {
 375                         return;
 376                     }
 377                     sid = next;
 378                 }
 379                 State::Union { ref alternates } => {
 380                     sid = match alternates.get(0) {
 381                         None => return,
 382                         Some(&sid) => sid,
 383                     };
 384                     stack.extend(
 385                         alternates[1..]
 386                             .iter()
 387                             .copied()
 388                             .rev()
 389                             .map(FollowEpsilon::StateID),
 390                     );
 391                 }
 392                 State::Capture { next, slot } => {
 393                     if slot < thread_caps.len() {
 394                         stack.push(FollowEpsilon::Capture {
 395                             slot,
 396                             pos: thread_caps[slot],
 397                         });
 398                         thread_caps[slot] = Some(at);
 399                     }
 400                     sid = next;
 401                 }
 402             }
 403         }
 404     }
 405 }
 406
 407 /// An iterator over all non-overlapping leftmost matches for a particular
 408 /// infallible search.
 409 ///
 410 /// The iterator yields a [`MultiMatch`] value until no more matches could be
 411 /// found. If the underlying search returns an error, then this panics.
 412 ///
 413 /// The lifetime variables are as follows:
 414 ///
 415 /// * `'r` is the lifetime of the regular expression itself.
 416 /// * `'c` is the lifetime of the mutable cache used during search.
 417 /// * `'t` is the lifetime of the text being searched.
 418 #[derive(Debug)]
 419 pub struct FindLeftmostMatches<'r, 'c, 't> {
 420     vm: &'r PikeVM,
 421     cache: &'c mut Cache,
 422     // scanner: Option<prefilter::Scanner<'r>>,
 423     text: &'t [u8],
 424     last_end: usize,
 425     last_match: Option<usize>,
 426 }
 427
 428 impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
 429     fn new(
 430         vm: &'r PikeVM,
 431         cache: &'c mut Cache,
 432         text: &'t [u8],
 433     ) -> FindLeftmostMatches<'r, 'c, 't> {
 434         FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
 435     }
 436 }
 437
 438 impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
 439     // type Item = Captures;
 440     type Item = MultiMatch;
 441
 442     // fn next(&mut self) -> Option<Captures> {
 443     fn next(&mut self) -> Option<MultiMatch> {
 444         if self.last_end > self.text.len() {
 445             return None;
 446         }
 447         let mut caps = self.vm.create_captures();
 448         let m = self.vm.find_leftmost_at(
 449             self.cache,
 450             self.text,
 451             self.last_end,
 452             self.text.len(),
 453             &mut caps,
 454         )?;
 455         if m.is_empty() {
 456             // This is an empty match. To ensure we make progress, start
 457             // the next search at the smallest possible starting position
 458             // of the next match following this one.
 459             self.last_end = if self.vm.config.get_utf8() {
 460                 crate::util::next_utf8(self.text, m.end())
 461             } else {
 462                 m.end() + 1
 463             };
 464             // Don't accept empty matches immediately following a match.
 465             // Just move on to the next match.
 466             if Some(m.end()) == self.last_match {
 467                 return self.next();
 468             }
 469         } else {
 470             self.last_end = m.end();
 471         }
 472         self.last_match = Some(m.end());
 473         Some(m)
 474     }
 475 }
 476
 477 #[derive(Clone, Debug)]
 478 pub struct Captures {
 479     slots: Vec<Slot>,
 480 }
 481
 482 impl Captures {
 483     pub fn new(nfa: &NFA) -> Captures {
 484         Captures { slots: vec![None; nfa.capture_slot_len()] }
 485     }
 486 }
 487
 488 #[derive(Clone, Debug)]
 489 pub struct Cache {
 490     stack: Vec<FollowEpsilon>,
 491     clist: Threads,
 492     nlist: Threads,
 493 }
 494
 495 type Slot = Option<usize>;
 496
 497 #[derive(Clone, Debug)]
 498 struct Threads {
 499     set: SparseSet,
 500     caps: Vec<Slot>,
 501     slots_per_thread: usize,
 502 }
 503
 504 #[derive(Clone, Debug)]
 505 enum FollowEpsilon {
 506     StateID(StateID),
 507     Capture { slot: usize, pos: Slot },
 508 }
 509
 510 impl Cache {
 511     pub fn new(nfa: &NFA) -> Cache {
 512         Cache {
 513             stack: vec![],
 514             clist: Threads::new(nfa),
 515             nlist: Threads::new(nfa),
 516         }
 517     }
 518
 519     fn clear(&mut self) {
 520         self.stack.clear();
 521         self.clist.set.clear();
 522         self.nlist.set.clear();
 523     }
 524
 525     fn swap(&mut self) {
 526         core::mem::swap(&mut self.clist, &mut self.nlist);
 527     }
 528 }
 529
 530 impl Threads {
 531     fn new(nfa: &NFA) -> Threads {
 532         let mut threads = Threads {
 533             set: SparseSet::new(0),
 534             caps: vec![],
 535             slots_per_thread: 0,
 536         };
 537         threads.resize(nfa);
 538         threads
 539     }
 540
 541     fn resize(&mut self, nfa: &NFA) {
 542         if nfa.states().len() == self.set.capacity() {
 543             return;
 544         }
 545         self.slots_per_thread = nfa.capture_slot_len();
 546         self.set.resize(nfa.states().len());
 547         self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
 548     }
 549
 550     fn caps(&mut self, sid: StateID) -> &mut [Slot] {
 551         let i = sid.as_usize() * self.slots_per_thread;
 552         &mut self.caps[i..i + self.slots_per_thread]
 553     }
 554 }