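/*!
Search routines for the lazy (hybrid NFA/DFA) regex engine. This module
implements the forward, reverse and overlapping searches on top of the lazy
DFA in `hybrid::dfa`, including prefilter handling and the unrolled inner
loops used to speed up the common case of a "hot" untagged state.
*/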
use crate::{
    hybrid::{
        dfa::{Cache, DFA},
        id::{LazyStateID, OverlappingState, StateMatch},
    },
    nfa::thompson,
    util::{
        id::PatternID,
        matchtypes::{HalfMatch, MatchError},
        prefilter, MATCH_OFFSET,
    },
};

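/// Executes a forward search and returns the end offset of the first match
/// found. That is, the search stops as soon as it knows a match occurs,
/// which may be before the end of the leftmost match.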
#[inline(never)]
pub(crate) fn find_earliest_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should never use
    // a prefilter.
    if pre.is_some() && pattern_id.is_none() {
        find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
    } else {
        find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
    }
}

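/// Executes a forward search and returns the end offset of the leftmost
/// match. The search continues until a dead state is entered or the end of
/// the haystack is reached, recording the most recent match seen.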
#[inline(never)]
pub(crate) fn find_leftmost_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should never use
    // a prefilter.
    if pre.is_some() && pattern_id.is_none() {
        find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
    } else {
        find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
    }
}

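/// The common implementation of both "earliest" and "leftmost" forward
/// searches. When 'earliest' is true, the search returns as soon as a match
/// state is seen; otherwise it records the most recent match and keeps going
/// until a dead state or the end of the haystack.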
#[inline(always)]
fn find_fwd(
    mut pre: Option<&mut prefilter::Scanner>,
    earliest: bool,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    haystack: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= haystack.len());
    assert!(end <= haystack.len());

    // Why do this? This lets 'bytes[at]' work without bounds checks below.
    // It seems the assert on 'end <= haystack.len()' above is otherwise
    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
    // 'eoi_fwd' (below) might actually want to try to access the byte at
    // 'end' for resolving look-ahead.
    let bytes = &haystack[..end];

    let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
    let mut last_match = None;
    let mut at = start;
    if let Some(ref mut pre) = pre {
        // If a prefilter doesn't report false positives, then we don't need
        // to touch the DFA at all. However, since all matches include the
        // pattern ID, and the prefilter infrastructure doesn't report pattern
        // IDs, we limit this optimization to cases where there is exactly one
        // pattern. In that case, any match must be the 0th pattern.
        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
            return Ok(pre.next_candidate(bytes, at).into_option().map(
                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
            ));
        } else if pre.is_effective(at) {
            match pre.next_candidate(bytes, at).into_option() {
                None => return Ok(None),
                Some(i) => {
                    at = i;
                }
            }
        }
    }
    while at < end {
        if sid.is_tagged() {
            sid = dfa
                .next_state(cache, sid, bytes[at])
                .map_err(|_| gave_up(at))?;
            at += 1;
        } else {
            // SAFETY: There are two safety invariants we need to uphold
            // here in the loop below: that 'sid' is a valid state ID for
            // this DFA, and that 'at' is a valid index into 'bytes'. For
            // the former, we rely on the invariant that next_state* and
            // start_state_forward always return a valid state ID (given a
            // valid state ID in the former case), and that we are only at
            // this place in the code if 'sid' is untagged. Moreover, every
            // call to next_state_untagged_unchecked below is guarded by a
            // check that sid is untagged. For the latter safety invariant,
            // we always guard unchecked access with a check that 'at' is
            // less than 'end', where 'end == bytes.len()'.
            //
            // PERF: For justification, eliding these bounds checks gives us
            // a ~10% reduction in search time. This was used as a benchmark:
            //
            //   regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
            //
            // With bounds checks: ~881.4ms. Without: ~775ms. For input, I
            // used OpenSubtitles2018.raw.sample.medium.en.
            let mut prev_sid = sid;
            while at < end {
                prev_sid = sid;
                sid = unsafe {
                    dfa.next_state_untagged_unchecked(
                        cache,
                        sid,
                        *bytes.get_unchecked(at),
                    )
                };
                at += 1;
                if sid.is_tagged() {
                    break;
                }
                // SAFETY: we make four unguarded accesses to 'bytes[at]'
                // below, and each is safe because we know that 'at + 4' is
                // in bounds. Moreover, while we don't check whether 'sid' is
                // untagged directly, we know it is because of the check
                // above. And the unrolled loop below quits when the next
                // state is not equal to the previous state.
                //
                // PERF: For justification for eliminating bounds checks,
                // see above. For justification for the unrolling, we use
                // two tests. The one above with regex '(?m)^.+$', and also
                // '(?m)^.{40}$'. The former is kinda the best case for
                // unrolling, and gives a ~1.67x boost primarily because the
                // DFA spends most of its time munching through the input in
                // the same state. But the latter pattern rarely spends time
                // in the same state through subsequent transitions, so
                // unrolling is pretty much always ineffective in that it
                // craps out on the first 'sid != next' check below. However,
                // without unrolling, search is only 1.03 times faster than
                // with unrolling on the latter pattern, which we deem to be
                // an acceptable loss in favor of optimizing the more common
                // case of having a "hot" state somewhere in the DFA.
                while at + 4 < end {
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                }
            }
            if sid.is_unknown() {
                sid = dfa
                    .next_state(cache, prev_sid, bytes[at - 1])
                    .map_err(|_| gave_up(at - 1))?;
            }
        }
        if sid.is_tagged() {
            if sid.is_start() {
                if let Some(ref mut pre) = pre {
                    if pre.is_effective(at) {
                        match pre.next_candidate(bytes, at).into_option() {
                            None => return Ok(None),
                            Some(i) => {
                                at = i;
                            }
                        }
                    }
                }
            } else if sid.is_match() {
                last_match = Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset: at - MATCH_OFFSET,
                });
                if earliest {
                    return Ok(last_match);
                }
            } else if sid.is_dead() {
                return Ok(last_match);
            } else if sid.is_quit() {
                if last_match.is_some() {
                    return Ok(last_match);
                }
                let offset = at - 1;
                return Err(MatchError::Quit { byte: bytes[offset], offset });
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
    }
    // We are careful to use 'haystack' here, which contains the full context
    // that we might want to inspect.
    Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
}

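/// Executes a reverse search, scanning backwards from 'end' toward 'start',
/// and returns the start offset of the first match found. That is, the
/// search stops as soon as any match is seen.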
#[inline(never)]
pub(crate) fn find_earliest_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    find_rev(true, dfa, cache, pattern_id, bytes, start, end)
}

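/// Executes a reverse search, scanning backwards from 'end' toward 'start',
/// and returns the start offset of the leftmost match: the scan continues
/// past the first match seen, recording the smallest starting offset found.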
#[inline(never)]
pub(crate) fn find_leftmost_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    find_rev(false, dfa, cache, pattern_id, bytes, start, end)
}

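/// The common implementation of both "earliest" and "leftmost" reverse
/// searches. This works like 'find_fwd', except it steps backwards through
/// the haystack and reports the start of a match instead of its end.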
#[inline(always)]
fn find_rev(
    earliest: bool,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    haystack: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= haystack.len());
    assert!(end <= haystack.len());

    // Why do this? This lets 'bytes[at]' work without bounds checks below.
    // It seems the asserts above are otherwise not enough. Why not just make
    // 'bytes' scoped this way anyway? Well, 'eoi_rev' (below) might actually
    // want to try to access the byte before 'start' for resolving
    // look-behind.
    let bytes = &haystack[start..];

    let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
    let mut last_match = None;
    let mut at = end - start;
    while at > 0 {
        if sid.is_tagged() {
            at -= 1;
            sid = dfa
                .next_state(cache, sid, bytes[at])
                .map_err(|_| gave_up(at))?;
        } else {
            // SAFETY: See comments in 'find_fwd' for both a safety argument
            // and a justification from a performance perspective as to 1) why
            // we elide bounds checks and 2) why we do a specialized version
            // of unrolling below.
            let mut prev_sid = sid;
            while at > 0 && !sid.is_tagged() {
                prev_sid = sid;
                at -= 1;
                while at > 3 {
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                }
                sid = unsafe {
                    dfa.next_state_untagged_unchecked(
                        cache,
                        sid,
                        *bytes.get_unchecked(at),
                    )
                };
            }
            if sid.is_unknown() {
                sid = dfa
                    .next_state(cache, prev_sid, bytes[at])
                    .map_err(|_| gave_up(at))?;
            }
        }
        if sid.is_tagged() {
            if sid.is_start() {
                continue;
            } else if sid.is_match() {
                last_match = Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset: start + at + MATCH_OFFSET,
                });
                if earliest {
                    return Ok(last_match);
                }
            } else if sid.is_dead() {
                return Ok(last_match);
            } else {
                debug_assert!(sid.is_quit());
                if last_match.is_some() {
                    return Ok(last_match);
                }
                // 'at' indexes into 'bytes', which begins at 'start' within
                // 'haystack', so report the offset relative to 'haystack',
                // just as the match offsets above do.
                return Err(MatchError::Quit {
                    byte: bytes[at],
                    offset: start + at,
                });
            }
        }
    }
    Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
}

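/// Executes an overlapping forward search. Unlike the leftmost searches
/// above, this can report more than one match, including matches from
/// different patterns that end at the same position. The caller threads an
/// 'OverlappingState' through repeated calls to resume where the previous
/// search left off.
///
/// A rough sketch of how a caller might drive this routine (illustrative
/// only; it assumes an 'OverlappingState::start()' constructor for the
/// initial, empty caller state):
///
/// ```ignore
/// // Hypothetical driver loop over every overlapping match.
/// let mut state = OverlappingState::start();
/// while let Some(m) = find_overlapping_fwd(
///     None, &dfa, &mut cache, None, haystack, 0, haystack.len(), &mut state,
/// )? {
///     println!("pattern {:?} ends at {}", m.pattern(), m.offset());
/// }
/// ```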
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
    caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should only ever
    // use a prefilter when no pattern ID is given.
    if pre.is_some() && pattern_id.is_none() {
        find_overlapping_fwd_imp(
            pre,
            dfa,
            cache,
            pattern_id,
            bytes,
            start,
            end,
            caller_state,
        )
    } else {
        find_overlapping_fwd_imp(
            None,
            dfa,
            cache,
            pattern_id,
            bytes,
            start,
            end,
            caller_state,
        )
    }
}

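/// The implementation of overlapping forward search. This first drains any
/// matches still pending in 'caller_state' from a previous call before
/// advancing the DFA, and records enough state to resume on the next call.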
#[inline(always)]
fn find_overlapping_fwd_imp(
    mut pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    mut start: usize,
    end: usize,
    caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= bytes.len());
    assert!(end <= bytes.len());

    let mut sid = match caller_state.id() {
        None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
        Some(sid) => {
            if let Some(last) = caller_state.last_match() {
                let match_count = dfa.match_count(cache, sid);
                if last.match_index < match_count {
                    let m = HalfMatch {
                        pattern: dfa.match_pattern(
                            cache,
                            sid,
                            last.match_index,
                        ),
                        offset: last.offset,
                    };
                    last.match_index += 1;
                    return Ok(Some(m));
                }
            }

            // This is a subtle but critical detail. If the caller provides a
            // non-None state ID, then it must be the case that the state ID
            // corresponds to one set by this function. The state ID therefore
            // corresponds to a match state, a dead state or some other state.
            // However, "some other" state _only_ occurs when the input has
            // been exhausted, because the only way to stop before then is to
            // see a match or a dead/quit state.
            //
            // If the input is exhausted or if it's a dead state, then
            // incrementing the starting position has no bearing on
            // correctness, since the loop below will either not execute
            // at all or will immediately stop due to being in a dead state.
            // (Once in a dead state it is impossible to leave it.)
            //
            // Therefore, the only case we need to consider is when
            // caller_state is a match state. In this case, since our machines
            // support the ability to delay a match by a certain number of
            // bytes (to support look-around), it follows that we actually
            // consumed that many additional bytes on our previous search.
            // When the caller resumes their search to find subsequent
            // matches, they will use the ending location from the previous
            // match as the next starting point, which is 'MATCH_OFFSET'
            // bytes PRIOR to where we scanned to on the previous search.
            // Therefore, we need to compensate by bumping 'start' up by
            // 'MATCH_OFFSET' bytes.
            //
            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
            // dealing with empty matches convenient. Namely, callers needn't
            // special case them when implementing an iterator. Instead, this
            // ensures that forward progress is always made.
            start += MATCH_OFFSET;
            sid
        }
    };

    let mut at = start;
    while at < end {
        let byte = bytes[at];
        sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
        at += 1;
        if sid.is_tagged() {
            caller_state.set_id(sid);
            if sid.is_start() {
                if let Some(ref mut pre) = pre {
                    if pre.is_effective(at) {
                        match pre.next_candidate(bytes, at).into_option() {
                            None => return Ok(None),
                            Some(i) => {
                                at = i;
                            }
                        }
                    }
                }
            } else if sid.is_match() {
                let offset = at - MATCH_OFFSET;
                caller_state
                    .set_last_match(StateMatch { match_index: 1, offset });
                return Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset,
                }));
            } else if sid.is_dead() {
                return Ok(None);
            } else {
                debug_assert!(sid.is_quit());
                return Err(MatchError::Quit { byte, offset: at - 1 });
            }
        }
    }

    let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
    caller_state.set_id(sid);
    if let Ok(Some(ref last_match)) = result {
        caller_state.set_last_match(StateMatch {
            // '1' is always correct here since if we get to this point, this
            // always corresponds to the first (index '0') match discovered at
            // this position. So the next match to report at this position (if
            // it exists) is at index '1'.
            match_index: 1,
            offset: last_match.offset(),
        });
    }
    result
}

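/// Computes the starting state of 'dfa' for a forward search of
/// 'bytes[start..end]', translating a cache error into a "gave up" error at
/// 'start'.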
#[inline(always)]
fn init_fwd(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<LazyStateID, MatchError> {
    let sid = dfa
        .start_state_forward(cache, pattern_id, bytes, start, end)
        .map_err(|_| gave_up(start))?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    assert!(!sid.is_match());
    Ok(sid)
}

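/// Computes the starting state of 'dfa' for a reverse search of
/// 'bytes[start..end]', translating a cache error into a "gave up" error at
/// 'end'.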
#[inline(always)]
fn init_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<LazyStateID, MatchError> {
    let sid = dfa
        .start_state_reverse(cache, pattern_id, bytes, start, end)
        .map_err(|_| gave_up(end))?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    assert!(!sid.is_match());
    Ok(sid)
}

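/// Completes a forward search at 'end': transitions on the byte at 'end' if
/// one exists (to resolve look-ahead), or on the special end-of-input
/// sentinel otherwise, and reports a match if that final transition lands in
/// a match state.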
#[inline(always)]
fn eoi_fwd(
    dfa: &DFA,
    cache: &mut Cache,
    bytes: &[u8],
    end: usize,
    sid: &mut LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
    match bytes.get(end) {
        Some(&b) => {
            *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
            if sid.is_match() {
                Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, *sid, 0),
                    offset: end,
                }))
            } else {
                Ok(None)
            }
        }
        None => {
            *sid = dfa
                .next_eoi_state(cache, *sid)
                .map_err(|_| gave_up(bytes.len()))?;
            if sid.is_match() {
                Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, *sid, 0),
                    offset: bytes.len(),
                }))
            } else {
                Ok(None)
            }
        }
    }
}

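/// The reverse analog of 'eoi_fwd': transitions on the byte before 'start'
/// if one exists (to resolve look-behind), or on the end-of-input sentinel
/// when 'start' is 0, and reports a match at 'start' if that final
/// transition lands in a match state.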
#[inline(always)]
fn eoi_rev(
    dfa: &DFA,
    cache: &mut Cache,
    bytes: &[u8],
    start: usize,
    state: LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
    if start > 0 {
        let sid = dfa
            .next_state(cache, state, bytes[start - 1])
            .map_err(|_| gave_up(start))?;
        if sid.is_match() {
            Ok(Some(HalfMatch {
                pattern: dfa.match_pattern(cache, sid, 0),
                offset: start,
            }))
        } else {
            Ok(None)
        }
    } else {
        let sid =
            dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
        if sid.is_match() {
            Ok(Some(HalfMatch {
                pattern: dfa.match_pattern(cache, sid, 0),
                offset: 0,
            }))
        } else {
            Ok(None)
        }
    }
}

/// A convenience routine for constructing a "gave up" match error.
#[inline(always)]
fn gave_up(offset: usize) -> MatchError {
    MatchError::GaveUp { offset }
}