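/*!
Search routines for the lazy (hybrid NFA/DFA) regex engine. This module
implements the forward, reverse and overlapping searches on top of the lazy
DFA in `hybrid::dfa`, including prefilter handling and the unrolled inner
loops used to speed up the common case of a "hot" untagged state.
*/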
use crate::{
    hybrid::{
        dfa::{Cache, DFA},
        id::{LazyStateID, OverlappingState, StateMatch},
    },
    nfa::thompson,
    util::{
        id::PatternID,
        matchtypes::{HalfMatch, MatchError},
        prefilter, MATCH_OFFSET,
    },
};

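/// Executes a forward search and returns the end offset of the first match
/// found. That is, the search stops as soon as it knows a match occurs,
/// which may be before the end of the leftmost match.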
#[inline(never)]
pub(crate) fn find_earliest_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should never use
    // a prefilter.
    if pre.is_some() && pattern_id.is_none() {
        find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
    } else {
        find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
    }
}

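/// Executes a forward search and returns the end offset of the leftmost
/// match. The search continues until a dead state is entered or the end of
/// the haystack is reached, recording the most recent match seen.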
#[inline(never)]
pub(crate) fn find_leftmost_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should never use
    // a prefilter.
    if pre.is_some() && pattern_id.is_none() {
        find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
    } else {
        find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
    }
}

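/// The common implementation of both "earliest" and "leftmost" forward
/// searches. When 'earliest' is true, the search returns as soon as a match
/// state is seen; otherwise it records the most recent match and keeps going
/// until a dead state or the end of the haystack.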
#[inline(always)]
fn find_fwd(
    mut pre: Option<&mut prefilter::Scanner>,
    earliest: bool,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    haystack: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= haystack.len());
    assert!(end <= haystack.len());

    // Why do this? This lets 'bytes[at]' work without bounds checks below.
    // It seems the assert on 'end <= haystack.len()' above is otherwise
    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
    // 'eoi_fwd' (below) might actually want to try to access the byte at
    // 'end' for resolving look-ahead.
    let bytes = &haystack[..end];

    let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
    let mut last_match = None;
    let mut at = start;
    if let Some(ref mut pre) = pre {
        // If a prefilter doesn't report false positives, then we don't need
        // to touch the DFA at all. However, since all matches include the
        // pattern ID, and the prefilter infrastructure doesn't report pattern
        // IDs, we limit this optimization to cases where there is exactly one
        // pattern. In that case, any match must be the 0th pattern.
        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
            return Ok(pre.next_candidate(bytes, at).into_option().map(
                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
            ));
        } else if pre.is_effective(at) {
            match pre.next_candidate(bytes, at).into_option() {
                None => return Ok(None),
                Some(i) => {
                    at = i;
                }
            }
        }
    }
    while at < end {
        if sid.is_tagged() {
            sid = dfa
                .next_state(cache, sid, bytes[at])
                .map_err(|_| gave_up(at))?;
            at += 1;
        } else {
            // SAFETY: There are two safety invariants we need to uphold
            // here in the loop below: that 'sid' is a valid state ID for
            // this DFA, and that 'at' is a valid index into 'bytes'. For
            // the former, we rely on the invariant that next_state* and
            // start_state_forward always return a valid state ID (given a
            // valid state ID in the former case), and that we are only at
            // this place in the code if 'sid' is untagged. Moreover, every
            // call to next_state_untagged_unchecked below is guarded by a
            // check that sid is untagged. For the latter safety invariant,
            // we always guard unchecked access with a check that 'at' is
            // less than 'end', where 'end == bytes.len()'.
            //
            // PERF: For justification, eliding these bounds checks gives us
            // a ~10% reduction in search time. This was used as a benchmark:
            //
            //   regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
            //
            // With bounds checks: ~881.4ms. Without: ~775ms. For input, I
            // used OpenSubtitles2018.raw.sample.medium.en.
            let mut prev_sid = sid;
            while at < end {
                prev_sid = sid;
                sid = unsafe {
                    dfa.next_state_untagged_unchecked(
                        cache,
                        sid,
                        *bytes.get_unchecked(at),
                    )
                };
                at += 1;
                if sid.is_tagged() {
                    break;
                }
                // SAFETY: we make four unguarded accesses to 'bytes[at]'
                // below, and each is safe because we know that 'at + 4' is
                // in bounds. Moreover, while we don't check whether 'sid' is
                // untagged directly, we know it is because of the check
                // above. And the unrolled loop below quits when the next
                // state is not equal to the previous state.
                //
                // PERF: For justification for eliminating bounds checks,
                // see above. For justification for the unrolling, we use
                // two tests. The one above with regex '(?m)^.+$', and also
                // '(?m)^.{40}$'. The former is kinda the best case for
                // unrolling, and gives a ~1.67x boost primarily because the
                // DFA spends most of its time munching through the input in
                // the same state. But the latter pattern rarely spends time
                // in the same state through subsequent transitions, so
                // unrolling is pretty much always ineffective in that it
                // craps out on the first 'sid != next' check below. However,
                // without unrolling, search is only 1.03 times faster than
                // with unrolling on the latter pattern, which we deem to be
                // an acceptable loss in favor of optimizing the more common
                // case of having a "hot" state somewhere in the DFA.
                while at + 4 < end {
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at += 1;
                }
            }
            if sid.is_unknown() {
                sid = dfa
                    .next_state(cache, prev_sid, bytes[at - 1])
                    .map_err(|_| gave_up(at - 1))?;
            }
        }
        if sid.is_tagged() {
            if sid.is_start() {
                if let Some(ref mut pre) = pre {
                    if pre.is_effective(at) {
                        match pre.next_candidate(bytes, at).into_option() {
                            None => return Ok(None),
                            Some(i) => {
                                at = i;
                            }
                        }
                    }
                }
            } else if sid.is_match() {
                last_match = Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset: at - MATCH_OFFSET,
                });
                if earliest {
                    return Ok(last_match);
                }
            } else if sid.is_dead() {
                return Ok(last_match);
            } else if sid.is_quit() {
                if last_match.is_some() {
                    return Ok(last_match);
                }
                let offset = at - 1;
                return Err(MatchError::Quit { byte: bytes[offset], offset });
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
    }
    // We are careful to use 'haystack' here, which contains the full context
    // that we might want to inspect.
    Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
}

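/// Executes a reverse search, scanning backwards from 'end' toward 'start',
/// and returns the start offset of the first match found. That is, the
/// search stops as soon as any match is seen.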
#[inline(never)]
pub(crate) fn find_earliest_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    find_rev(true, dfa, cache, pattern_id, bytes, start, end)
}

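/// Executes a reverse search, scanning backwards from 'end' toward 'start',
/// and returns the start offset of the leftmost match: the scan continues
/// past the first match seen, recording the smallest starting offset found.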
#[inline(never)]
pub(crate) fn find_leftmost_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    find_rev(false, dfa, cache, pattern_id, bytes, start, end)
}

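/// The common implementation of both "earliest" and "leftmost" reverse
/// searches. This works like 'find_fwd', except it steps backwards through
/// the haystack and reports the start of a match instead of its end.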
#[inline(always)]
fn find_rev(
    earliest: bool,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    haystack: &[u8],
    start: usize,
    end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= haystack.len());
    assert!(end <= haystack.len());

    // Why do this? This lets 'bytes[at]' work without bounds checks below.
    // It seems the asserts above are otherwise not enough. Why not just make
    // 'bytes' scoped this way anyway? Well, 'eoi_rev' (below) might actually
    // want to try to access the byte before 'start' for resolving
    // look-behind.
    let bytes = &haystack[start..];

    let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
    let mut last_match = None;
    let mut at = end - start;
    while at > 0 {
        if sid.is_tagged() {
            at -= 1;
            sid = dfa
                .next_state(cache, sid, bytes[at])
                .map_err(|_| gave_up(at))?;
        } else {
            // SAFETY: See comments in 'find_fwd' for both a safety argument
            // and a justification from a performance perspective as to 1) why
            // we elide bounds checks and 2) why we do a specialized version
            // of unrolling below.
            let mut prev_sid = sid;
            while at > 0 && !sid.is_tagged() {
                prev_sid = sid;
                at -= 1;
                while at > 3 {
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                    let next = unsafe {
                        dfa.next_state_untagged_unchecked(
                            cache,
                            sid,
                            *bytes.get_unchecked(at),
                        )
                    };
                    if sid != next {
                        break;
                    }
                    at -= 1;
                }
                sid = unsafe {
                    dfa.next_state_untagged_unchecked(
                        cache,
                        sid,
                        *bytes.get_unchecked(at),
                    )
                };
            }
            if sid.is_unknown() {
                sid = dfa
                    .next_state(cache, prev_sid, bytes[at])
                    .map_err(|_| gave_up(at))?;
            }
        }
        if sid.is_tagged() {
            if sid.is_start() {
                continue;
            } else if sid.is_match() {
                last_match = Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset: start + at + MATCH_OFFSET,
                });
                if earliest {
                    return Ok(last_match);
                }
            } else if sid.is_dead() {
                return Ok(last_match);
            } else {
                debug_assert!(sid.is_quit());
                if last_match.is_some() {
                    return Ok(last_match);
                }
                // 'at' indexes into 'bytes', which begins at 'start' within
                // 'haystack', so report the offset relative to 'haystack',
                // just as the match offsets above do.
                return Err(MatchError::Quit {
                    byte: bytes[at],
                    offset: start + at,
                });
            }
        }
    }
    Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
}

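/// Executes an overlapping forward search. Unlike the leftmost searches
/// above, this can report more than one match, including matches from
/// different patterns that end at the same position. The caller threads an
/// 'OverlappingState' through repeated calls to resume where the previous
/// search left off.
///
/// A rough sketch of how a caller might drive this routine (illustrative
/// only; it assumes an 'OverlappingState::start()' constructor for the
/// initial, empty caller state):
///
/// ```ignore
/// // Hypothetical driver loop over every overlapping match.
/// let mut state = OverlappingState::start();
/// while let Some(m) = find_overlapping_fwd(
///     None, &dfa, &mut cache, None, haystack, 0, haystack.len(), &mut state,
/// )? {
///     println!("pattern {:?} ends at {}", m.pattern(), m.offset());
/// }
/// ```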
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
    pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
    caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
    // Searching with a pattern ID is always anchored, so we should only ever
    // use a prefilter when no pattern ID is given.
    if pre.is_some() && pattern_id.is_none() {
        find_overlapping_fwd_imp(
            pre,
            dfa,
            cache,
            pattern_id,
            bytes,
            start,
            end,
            caller_state,
        )
    } else {
        find_overlapping_fwd_imp(
            None,
            dfa,
            cache,
            pattern_id,
            bytes,
            start,
            end,
            caller_state,
        )
    }
}

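/// The implementation of overlapping forward search. This first drains any
/// matches still pending in 'caller_state' from a previous call before
/// advancing the DFA, and records enough state to resume on the next call.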
#[inline(always)]
fn find_overlapping_fwd_imp(
    mut pre: Option<&mut prefilter::Scanner>,
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    mut start: usize,
    end: usize,
    caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
    assert!(start <= end);
    assert!(start <= bytes.len());
    assert!(end <= bytes.len());

    let mut sid = match caller_state.id() {
        None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
        Some(sid) => {
            if let Some(last) = caller_state.last_match() {
                let match_count = dfa.match_count(cache, sid);
                if last.match_index < match_count {
                    let m = HalfMatch {
                        pattern: dfa.match_pattern(
                            cache,
                            sid,
                            last.match_index,
                        ),
                        offset: last.offset,
                    };
                    last.match_index += 1;
                    return Ok(Some(m));
                }
            }

            // This is a subtle but critical detail. If the caller provides a
            // non-None state ID, then it must be the case that the state ID
            // corresponds to one set by this function. The state ID therefore
            // corresponds to a match state, a dead state or some other state.
            // However, "some other" state _only_ occurs when the input has
            // been exhausted, because the only way to stop before then is to
            // see a match or a dead/quit state.
            //
            // If the input is exhausted or if it's a dead state, then
            // incrementing the starting position has no bearing on
            // correctness, since the loop below will either not execute
            // at all or will immediately stop due to being in a dead state.
            // (Once in a dead state it is impossible to leave it.)
            //
            // Therefore, the only case we need to consider is when
            // caller_state is a match state. In this case, since our machines
            // support the ability to delay a match by a certain number of
            // bytes (to support look-around), it follows that we actually
            // consumed that many additional bytes on our previous search.
            // When the caller resumes their search to find subsequent
            // matches, they will use the ending location from the previous
            // match as the next starting point, which is 'MATCH_OFFSET'
            // bytes PRIOR to where we scanned to on the previous search.
            // Therefore, we need to compensate by bumping 'start' up by
            // 'MATCH_OFFSET' bytes.
            //
            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
            // dealing with empty matches convenient. Namely, callers needn't
            // special case them when implementing an iterator. Instead, this
            // ensures that forward progress is always made.
            start += MATCH_OFFSET;
            sid
        }
    };

    let mut at = start;
    while at < end {
        let byte = bytes[at];
        sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
        at += 1;
        if sid.is_tagged() {
            caller_state.set_id(sid);
            if sid.is_start() {
                if let Some(ref mut pre) = pre {
                    if pre.is_effective(at) {
                        match pre.next_candidate(bytes, at).into_option() {
                            None => return Ok(None),
                            Some(i) => {
                                at = i;
                            }
                        }
                    }
                }
            } else if sid.is_match() {
                let offset = at - MATCH_OFFSET;
                caller_state
                    .set_last_match(StateMatch { match_index: 1, offset });
                return Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, sid, 0),
                    offset,
                }));
            } else if sid.is_dead() {
                return Ok(None);
            } else {
                debug_assert!(sid.is_quit());
                return Err(MatchError::Quit { byte, offset: at - 1 });
            }
        }
    }

    let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
    caller_state.set_id(sid);
    if let Ok(Some(ref last_match)) = result {
        caller_state.set_last_match(StateMatch {
            // '1' is always correct here since if we get to this point, this
            // always corresponds to the first (index '0') match discovered at
            // this position. So the next match to report at this position (if
            // it exists) is at index '1'.
            match_index: 1,
            offset: last_match.offset(),
        });
    }
    result
}

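/// Computes the starting state of 'dfa' for a forward search of
/// 'bytes[start..end]', translating a cache error into a "gave up" error at
/// 'start'.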
#[inline(always)]
fn init_fwd(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<LazyStateID, MatchError> {
    let sid = dfa
        .start_state_forward(cache, pattern_id, bytes, start, end)
        .map_err(|_| gave_up(start))?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    assert!(!sid.is_match());
    Ok(sid)
}

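/// Computes the starting state of 'dfa' for a reverse search of
/// 'bytes[start..end]', translating a cache error into a "gave up" error at
/// 'end'.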
#[inline(always)]
fn init_rev(
    dfa: &DFA,
    cache: &mut Cache,
    pattern_id: Option<PatternID>,
    bytes: &[u8],
    start: usize,
    end: usize,
) -> Result<LazyStateID, MatchError> {
    let sid = dfa
        .start_state_reverse(cache, pattern_id, bytes, start, end)
        .map_err(|_| gave_up(end))?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    assert!(!sid.is_match());
    Ok(sid)
}

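/// Completes a forward search at 'end': transitions on the byte at 'end' if
/// one exists (to resolve look-ahead), or on the special end-of-input
/// sentinel otherwise, and reports a match if that final transition lands in
/// a match state.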
#[inline(always)]
fn eoi_fwd(
    dfa: &DFA,
    cache: &mut Cache,
    bytes: &[u8],
    end: usize,
    sid: &mut LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
    match bytes.get(end) {
        Some(&b) => {
            *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
            if sid.is_match() {
                Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, *sid, 0),
                    offset: end,
                }))
            } else {
                Ok(None)
            }
        }
        None => {
            *sid = dfa
                .next_eoi_state(cache, *sid)
                .map_err(|_| gave_up(bytes.len()))?;
            if sid.is_match() {
                Ok(Some(HalfMatch {
                    pattern: dfa.match_pattern(cache, *sid, 0),
                    offset: bytes.len(),
                }))
            } else {
                Ok(None)
            }
        }
    }
}

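/// The reverse analog of 'eoi_fwd': transitions on the byte before 'start'
/// if one exists (to resolve look-behind), or on the end-of-input sentinel
/// when 'start' is 0, and reports a match at 'start' if that final
/// transition lands in a match state.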
#[inline(always)]
fn eoi_rev(
    dfa: &DFA,
    cache: &mut Cache,
    bytes: &[u8],
    start: usize,
    state: LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
    if start > 0 {
        let sid = dfa
            .next_state(cache, state, bytes[start - 1])
            .map_err(|_| gave_up(start))?;
        if sid.is_match() {
            Ok(Some(HalfMatch {
                pattern: dfa.match_pattern(cache, sid, 0),
                offset: start,
            }))
        } else {
            Ok(None)
        }
    } else {
        let sid =
            dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
        if sid.is_match() {
            Ok(Some(HalfMatch {
                pattern: dfa.match_pattern(cache, sid, 0),
                offset: 0,
            }))
        } else {
            Ok(None)
        }
    }
}

/// A convenience routine for constructing a "gave up" match error.
#[inline(always)]
fn gave_up(offset: usize) -> MatchError {
    MatchError::GaveUp { offset }
}