4 id
::{LazyStateID, OverlappingState, StateMatch}
,
9 matchtypes
::{HalfMatch, MatchError}
,
10 prefilter
, MATCH_OFFSET
,
15 pub(crate) fn find_earliest_fwd(
16 pre
: Option
<&mut prefilter
::Scanner
>,
19 pattern_id
: Option
<PatternID
>,
23 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
24 // Searching with a pattern ID is always anchored, so we should never use
26 if pre
.is_some() && pattern_id
.is_none() {
27 find_fwd(pre
, true, dfa
, cache
, pattern_id
, bytes
, start
, end
)
29 find_fwd(None
, true, dfa
, cache
, pattern_id
, bytes
, start
, end
)
34 pub(crate) fn find_leftmost_fwd(
35 pre
: Option
<&mut prefilter
::Scanner
>,
38 pattern_id
: Option
<PatternID
>,
42 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
43 // Searching with a pattern ID is always anchored, so we should never use
45 if pre
.is_some() && pattern_id
.is_none() {
46 find_fwd(pre
, false, dfa
, cache
, pattern_id
, bytes
, start
, end
)
48 find_fwd(None
, false, dfa
, cache
, pattern_id
, bytes
, start
, end
)
54 mut pre
: Option
<&mut prefilter
::Scanner
>,
58 pattern_id
: Option
<PatternID
>,
62 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
63 assert
!(start
<= end
);
64 assert
!(start
<= haystack
.len());
65 assert
!(end
<= haystack
.len());
67 // Why do this? This lets 'bytes[at]' work without bounds checks below.
68 // It seems the assert on 'end <= haystack.len()' above is otherwise
69 // not enough. Why not just make 'bytes' scoped this way anyway? Well,
70 // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
71 // for resolving look-ahead.
72 let bytes
= &haystack
[..end
];
74 let mut sid
= init_fwd(dfa
, cache
, pattern_id
, haystack
, start
, end
)?
;
75 let mut last_match
= None
;
77 if let Some(ref mut pre
) = pre
{
78 // If a prefilter doesn't report false positives, then we don't need to
79 // touch the DFA at all. However, since all matches include the pattern
80 // ID, and the prefilter infrastructure doesn't report pattern IDs, we
81 // limit this optimization to cases where there is exactly one pattern.
82 // In that case, any match must be the 0th pattern.
83 if dfa
.pattern_count() == 1 && !pre
.reports_false_positives() {
84 return Ok(pre
.next_candidate(bytes
, at
).into_option().map(
85 |offset
| HalfMatch { pattern: PatternID::ZERO, offset }
,
87 } else if pre
.is_effective(at
) {
88 match pre
.next_candidate(bytes
, at
).into_option() {
89 None
=> return Ok(None
),
99 .next_state(cache
, sid
, bytes
[at
])
100 .map_err(|_
| gave_up(at
))?
;
103 // SAFETY: There are two safety invariants we need to uphold
104 // here in the loop below: that 'sid' is a valid state ID for
105 // this DFA, and that 'at' is a valid index into 'bytes'. For
106 // the former, we rely on the invariant that next_state* and
107 // start_state_forward always returns a valid state ID (given a
108 // valid state ID in the former case), and that we are only at this
109 // place in the code if 'sid' is untagged. Moreover, every call to
110 // next_state_untagged_unchecked below is guarded by a check that
111 // sid is untagged. For the latter safety invariant, we always
112 // guard unchecked access with a check that 'at' is less than
113 // 'end', where 'end == bytes.len()'.
115 // For justification, this gives us a ~10% bump in search time.
116 // This was used for a benchmark:
118 // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
120 // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
121 // used OpenSubtitles2018.raw.sample.medium.en.
122 let mut prev_sid
= sid
;
126 dfa
.next_state_untagged_unchecked(
129 *bytes
.get_unchecked(at
),
136 // SAFETY: we make four unguarded accesses to 'bytes[at]'
137 // below, and each are safe because we know that 'at + 4' is
138 // in bounds. Moreover, while we don't check whether 'sid' is
139 // untagged directly, we know it is because of the check above.
140 // And the unrolled loop below quits when the next state is not
141 // equal to the previous state.
143 // PERF: For justification for eliminating bounds checks,
144 // see above. For justification for the unrolling, we use
145 // two tests. The one above with regex '(?m)^.+$', and also
146 // '(?m)^.{40}$'. The former is kinda the best case for
147 // unrolling, and gives a 1.67 boost primarily because the DFA
148 // spends most of its time munching through the input in the
149 // same state. But the latter pattern rarely spends time in the
150 // same state through subsequent transitions, so unrolling is
151 // pretty much always ineffective in that it craps out on the
152 // first 'sid != next' check below. However, without unrolling,
153 // search is only 1.03 times faster than with unrolling on the
154 // latter pattern, which we deem to be an acceptable loss in
155 // favor of optimizing the more common case of having a "hot"
156 // state somewhere in the DFA.
159 dfa
.next_state_untagged_unchecked(
162 *bytes
.get_unchecked(at
),
170 dfa
.next_state_untagged_unchecked(
173 *bytes
.get_unchecked(at
),
181 dfa
.next_state_untagged_unchecked(
184 *bytes
.get_unchecked(at
),
192 dfa
.next_state_untagged_unchecked(
195 *bytes
.get_unchecked(at
),
204 if sid
.is_unknown() {
206 .next_state(cache
, prev_sid
, bytes
[at
- 1])
207 .map_err(|_
| gave_up(at
- 1))?
;
212 if let Some(ref mut pre
) = pre
{
213 if pre
.is_effective(at
) {
214 match pre
.next_candidate(bytes
, at
).into_option() {
215 None
=> return Ok(None
),
222 } else if sid
.is_match() {
223 last_match
= Some(HalfMatch
{
224 pattern
: dfa
.match_pattern(cache
, sid
, 0),
225 offset
: at
- MATCH_OFFSET
,
228 return Ok(last_match
);
230 } else if sid
.is_dead() {
231 return Ok(last_match
);
232 } else if sid
.is_quit() {
233 if last_match
.is_some() {
234 return Ok(last_match
);
237 return Err(MatchError
::Quit { byte: bytes[offset], offset }
);
239 debug_assert
!(sid
.is_unknown());
240 unreachable
!("sid being unknown is a bug");
244 // We are careful to use 'haystack' here, which contains the full context
245 // that we might want to inspect.
246 Ok(eoi_fwd(dfa
, cache
, haystack
, end
, &mut sid
)?
.or(last_match
))
250 pub(crate) fn find_earliest_rev(
253 pattern_id
: Option
<PatternID
>,
257 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
258 find_rev(true, dfa
, cache
, pattern_id
, bytes
, start
, end
)
262 pub(crate) fn find_leftmost_rev(
265 pattern_id
: Option
<PatternID
>,
269 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
270 find_rev(false, dfa
, cache
, pattern_id
, bytes
, start
, end
)
278 pattern_id
: Option
<PatternID
>,
282 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
283 assert
!(start
<= end
);
284 assert
!(start
<= haystack
.len());
285 assert
!(end
<= haystack
.len());
// Why do this? This lets 'bytes[at]' work without bounds checks below.
// It seems the assert on 'end <= haystack.len()' above is otherwise
// not enough. Why not just make 'bytes' scoped this way anyway? Well,
// 'eoi_rev' (below) might actually want to try to access the byte at
// 'start - 1' for resolving look-behind.
292 let bytes
= &haystack
[start
..];
294 let mut sid
= init_rev(dfa
, cache
, pattern_id
, haystack
, start
, end
)?
;
295 let mut last_match
= None
;
296 let mut at
= end
- start
;
301 .next_state(cache
, sid
, bytes
[at
])
302 .map_err(|_
| gave_up(at
))?
;
304 // SAFETY: See comments in 'find_fwd' for both a safety argument
305 // and a justification from a performance perspective as to 1) why
306 // we elide bounds checks and 2) why we do a specialized version of
308 let mut prev_sid
= sid
;
309 while at
> 0 && !sid
.is_tagged() {
314 dfa
.next_state_untagged_unchecked(
317 *bytes
.get_unchecked(at
),
325 dfa
.next_state_untagged_unchecked(
328 *bytes
.get_unchecked(at
),
336 dfa
.next_state_untagged_unchecked(
339 *bytes
.get_unchecked(at
),
347 dfa
.next_state_untagged_unchecked(
350 *bytes
.get_unchecked(at
),
359 dfa
.next_state_untagged_unchecked(
362 *bytes
.get_unchecked(at
),
366 if sid
.is_unknown() {
368 .next_state(cache
, prev_sid
, bytes
[at
])
369 .map_err(|_
| gave_up(at
))?
;
375 } else if sid
.is_match() {
376 last_match
= Some(HalfMatch
{
377 pattern
: dfa
.match_pattern(cache
, sid
, 0),
378 offset
: start
+ at
+ MATCH_OFFSET
,
381 return Ok(last_match
);
383 } else if sid
.is_dead() {
384 return Ok(last_match
);
386 debug_assert
!(sid
.is_quit());
387 if last_match
.is_some() {
388 return Ok(last_match
);
390 return Err(MatchError
::Quit { byte: bytes[at], offset: at }
);
394 Ok(eoi_rev(dfa
, cache
, haystack
, start
, sid
)?
.or(last_match
))
398 pub(crate) fn find_overlapping_fwd(
399 pre
: Option
<&mut prefilter
::Scanner
>,
402 pattern_id
: Option
<PatternID
>,
406 caller_state
: &mut OverlappingState
,
407 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
408 // Searching with a pattern ID is always anchored, so we should only ever
409 // use a prefilter when no pattern ID is given.
410 if pre
.is_some() && pattern_id
.is_none() {
411 find_overlapping_fwd_imp(
422 find_overlapping_fwd_imp(
436 fn find_overlapping_fwd_imp(
437 mut pre
: Option
<&mut prefilter
::Scanner
>,
440 pattern_id
: Option
<PatternID
>,
444 caller_state
: &mut OverlappingState
,
445 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
446 assert
!(start
<= end
);
447 assert
!(start
<= bytes
.len());
448 assert
!(end
<= bytes
.len());
450 let mut sid
= match caller_state
.id() {
451 None
=> init_fwd(dfa
, cache
, pattern_id
, bytes
, start
, end
)?
,
453 if let Some(last
) = caller_state
.last_match() {
454 let match_count
= dfa
.match_count(cache
, sid
);
455 if last
.match_index
< match_count
{
457 pattern
: dfa
.match_pattern(
464 last
.match_index
+= 1;
469 // This is a subtle but critical detail. If the caller provides a
470 // non-None state ID, then it must be the case that the state ID
471 // corresponds to one set by this function. The state ID therefore
472 // corresponds to a match state, a dead state or some other state.
473 // However, "some other" state _only_ occurs when the input has
474 // been exhausted because the only way to stop before then is to
475 // see a match or a dead/quit state.
477 // If the input is exhausted or if it's a dead state, then
478 // incrementing the starting position has no relevance on
479 // correctness, since the loop below will either not execute
480 // at all or will immediately stop due to being in a dead state.
481 // (Once in a dead state it is impossible to leave it.)
483 // Therefore, the only case we need to consider is when
484 // caller_state is a match state. In this case, since our machines
485 // support the ability to delay a match by a certain number of
486 // bytes (to support look-around), it follows that we actually
487 // consumed that many additional bytes on our previous search. When
488 // the caller resumes their search to find subsequent matches, they
489 // will use the ending location from the previous match as the next
490 // starting point, which is `match_offset` bytes PRIOR to where
491 // we scanned to on the previous search. Therefore, we need to
492 // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
494 // Incidentally, since MATCH_OFFSET is non-zero, this also makes
495 // dealing with empty matches convenient. Namely, callers needn't
496 // special case them when implementing an iterator. Instead, this
497 // ensures that forward progress is always made.
498 start
+= MATCH_OFFSET
;
505 let byte
= bytes
[at
];
506 sid
= dfa
.next_state(cache
, sid
, byte
).map_err(|_
| gave_up(at
))?
;
509 caller_state
.set_id(sid
);
511 if let Some(ref mut pre
) = pre
{
512 if pre
.is_effective(at
) {
513 match pre
.next_candidate(bytes
, at
).into_option() {
514 None
=> return Ok(None
),
521 } else if sid
.is_match() {
522 let offset
= at
- MATCH_OFFSET
;
524 .set_last_match(StateMatch { match_index: 1, offset }
);
525 return Ok(Some(HalfMatch
{
526 pattern
: dfa
.match_pattern(cache
, sid
, 0),
529 } else if sid
.is_dead() {
532 debug_assert
!(sid
.is_quit());
533 return Err(MatchError
::Quit { byte, offset: at - 1 }
);
538 let result
= eoi_fwd(dfa
, cache
, bytes
, end
, &mut sid
);
539 caller_state
.set_id(sid
);
540 if let Ok(Some(ref last_match
)) = result
{
541 caller_state
.set_last_match(StateMatch
{
542 // '1' is always correct here since if we get to this point, this
543 // always corresponds to the first (index '0') match discovered at
544 // this position. So the next match to report at this position (if
545 // it exists) is at index '1'.
547 offset
: last_match
.offset(),
557 pattern_id
: Option
<PatternID
>,
561 ) -> Result
<LazyStateID
, MatchError
> {
563 .start_state_forward(cache
, pattern_id
, bytes
, start
, end
)
564 .map_err(|_
| gave_up(start
))?
;
565 // Start states can never be match states, since all matches are delayed
567 assert
!(!sid
.is_match());
575 pattern_id
: Option
<PatternID
>,
579 ) -> Result
<LazyStateID
, MatchError
> {
581 .start_state_reverse(cache
, pattern_id
, bytes
, start
, end
)
582 .map_err(|_
| gave_up(end
))?
;
583 // Start states can never be match states, since all matches are delayed
585 assert
!(!sid
.is_match());
595 sid
: &mut LazyStateID
,
596 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
597 match bytes
.get(end
) {
599 *sid
= dfa
.next_state(cache
, *sid
, b
).map_err(|_
| gave_up(end
))?
;
602 pattern
: dfa
.match_pattern(cache
, *sid
, 0),
611 .next_eoi_state(cache
, *sid
)
612 .map_err(|_
| gave_up(bytes
.len()))?
;
615 pattern
: dfa
.match_pattern(cache
, *sid
, 0),
632 ) -> Result
<Option
<HalfMatch
>, MatchError
> {
635 .next_state(cache
, state
, bytes
[start
- 1])
636 .map_err(|_
| gave_up(start
))?
;
639 pattern
: dfa
.match_pattern(cache
, sid
, 0),
647 dfa
.next_eoi_state(cache
, state
).map_err(|_
| gave_up(start
))?
;
650 pattern
: dfa
.match_pattern(cache
, sid
, 0),
659 /// A convenience routine for constructing a "gave up" match error.
661 fn gave_up(offset
: usize) -> MatchError
{
662 MatchError
::GaveUp { offset }