]> git.proxmox.com Git - rustc.git/blame - src/libcore/str/pattern.rs
New upstream version 1.41.1+dfsg1
[rustc.git] / src / libcore / str / pattern.rs
CommitLineData
9346a6ac
AL
1//! The string Pattern API.
2//!
9fa01778
XL
3//! For more details, see the traits [`Pattern`], [`Searcher`],
4//! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
9346a6ac 5
60c5eb7d
XL
6// ignore-tidy-undocumented-unsafe
7
62682a34 8#![unstable(feature = "pattern",
e9174d1e
SL
9 reason = "API not fully fleshed out and ready to be stabilized",
10 issue = "27721")]
11
48663c56
XL
12use crate::cmp;
13use crate::fmt;
14use crate::slice::memchr;
15use crate::usize;
c34b1796
AL
16
17// Pattern
18
19/// A string pattern.
20///
21/// A `Pattern<'a>` expresses that the implementing type
22/// can be used as a string pattern for searching in a `&'a str`.
23///
24/// For example, both `'a'` and `"aa"` are patterns that
25/// would match at index `1` in the string `"baaaab"`.
26///
27/// The trait itself acts as a builder for an associated
28/// `Searcher` type, which does the actual work of finding
29/// occurrences of the pattern in a string.
30pub trait Pattern<'a>: Sized {
31 /// Associated searcher for this pattern
32 type Searcher: Searcher<'a>;
33
9346a6ac 34 /// Constructs the associated searcher from
c34b1796
AL
35 /// `self` and the `haystack` to search in.
36 fn into_searcher(self, haystack: &'a str) -> Self::Searcher;
37
9346a6ac 38 /// Checks whether the pattern matches anywhere in the haystack
c34b1796
AL
39 #[inline]
40 fn is_contained_in(self, haystack: &'a str) -> bool {
41 self.into_searcher(haystack).next_match().is_some()
42 }
43
9346a6ac 44 /// Checks whether the pattern matches at the front of the haystack
c34b1796
AL
45 #[inline]
46 fn is_prefix_of(self, haystack: &'a str) -> bool {
47 match self.into_searcher(haystack).next() {
48 SearchStep::Match(0, _) => true,
49 _ => false,
50 }
51 }
52
9346a6ac 53 /// Checks whether the pattern matches at the back of the haystack
c34b1796
AL
54 #[inline]
55 fn is_suffix_of(self, haystack: &'a str) -> bool
56 where Self::Searcher: ReverseSearcher<'a>
57 {
58 match self.into_searcher(haystack).next_back() {
59 SearchStep::Match(_, j) if haystack.len() == j => true,
60 _ => false,
61 }
62 }
63}
64
65// Searcher
66
67/// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`.
68#[derive(Copy, Clone, Eq, PartialEq, Debug)]
69pub enum SearchStep {
70 /// Expresses that a match of the pattern has been found at
71 /// `haystack[a..b]`.
72 Match(usize, usize),
73 /// Expresses that `haystack[a..b]` has been rejected as a possible match
74 /// of the pattern.
75 ///
76 /// Note that there might be more than one `Reject` between two `Match`es,
77 /// there is no requirement for them to be combined into one.
78 Reject(usize, usize),
3b2f2976 79 /// Expresses that every byte of the haystack has been visited, ending
c34b1796
AL
80 /// the iteration.
81 Done
82}
83
84/// A searcher for a string pattern.
85///
86/// This trait provides methods for searching for non-overlapping
87/// matches of a pattern starting from the front (left) of a string.
88///
89/// It will be implemented by associated `Searcher`
90/// types of the `Pattern` trait.
91///
92/// The trait is marked unsafe because the indices returned by the
93/// `next()` methods are required to lie on valid utf8 boundaries in
94/// the haystack. This enables consumers of this trait to
95/// slice the haystack without additional runtime checks.
96pub unsafe trait Searcher<'a> {
3b2f2976 97 /// Getter for the underlying string to be searched in
c34b1796
AL
98 ///
99 /// Will always return the same `&str`
100 fn haystack(&self) -> &'a str;
101
102 /// Performs the next search step starting from the front.
103 ///
104 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
105 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
106 /// pattern, even partially.
107 /// - Returns `Done` if every byte of the haystack has been visited
108 ///
109 /// The stream of `Match` and `Reject` values up to a `Done`
110 /// will contain index ranges that are adjacent, non-overlapping,
111 /// covering the whole haystack, and laying on utf8 boundaries.
112 ///
113 /// A `Match` result needs to contain the whole matched pattern,
114 /// however `Reject` results may be split up into arbitrary
115 /// many adjacent fragments. Both ranges may have zero length.
116 ///
117 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
118 /// might produce the stream
119 /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]`
120 fn next(&mut self) -> SearchStep;
121
9fa01778 122 /// Finds the next `Match` result. See `next()`
ff7c6d11
XL
123 ///
124 /// Unlike next(), there is no guarantee that the returned ranges
125 /// of this and next_reject will overlap. This will return (start_match, end_match),
126 /// where start_match is the index of where the match begins, and end_match is
127 /// the index after the end of the match.
c34b1796
AL
128 #[inline]
129 fn next_match(&mut self) -> Option<(usize, usize)> {
130 loop {
131 match self.next() {
132 SearchStep::Match(a, b) => return Some((a, b)),
133 SearchStep::Done => return None,
134 _ => continue,
135 }
136 }
137 }
138
9fa01778 139 /// Finds the next `Reject` result. See `next()` and `next_match()`
ff7c6d11
XL
140 ///
141 /// Unlike next(), there is no guarantee that the returned ranges
142 /// of this and next_match will overlap.
c34b1796
AL
143 #[inline]
144 fn next_reject(&mut self) -> Option<(usize, usize)> {
145 loop {
146 match self.next() {
147 SearchStep::Reject(a, b) => return Some((a, b)),
148 SearchStep::Done => return None,
149 _ => continue,
150 }
151 }
152 }
153}
154
155/// A reverse searcher for a string pattern.
156///
157/// This trait provides methods for searching for non-overlapping
158/// matches of a pattern starting from the back (right) of a string.
159///
160/// It will be implemented by associated `Searcher`
161/// types of the `Pattern` trait if the pattern supports searching
162/// for it from the back.
163///
164/// The index ranges returned by this trait are not required
165/// to exactly match those of the forward search in reverse.
166///
167/// For the reason why this trait is marked unsafe, see them
168/// parent trait `Searcher`.
169pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
170 /// Performs the next search step starting from the back.
171 ///
172 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
173 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
174 /// pattern, even partially.
175 /// - Returns `Done` if every byte of the haystack has been visited
176 ///
177 /// The stream of `Match` and `Reject` values up to a `Done`
178 /// will contain index ranges that are adjacent, non-overlapping,
179 /// covering the whole haystack, and laying on utf8 boundaries.
180 ///
181 /// A `Match` result needs to contain the whole matched pattern,
182 /// however `Reject` results may be split up into arbitrary
183 /// many adjacent fragments. Both ranges may have zero length.
184 ///
185 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
186 /// might produce the stream
187 /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`
188 fn next_back(&mut self) -> SearchStep;
189
9fa01778 190 /// Finds the next `Match` result. See `next_back()`
c34b1796
AL
191 #[inline]
192 fn next_match_back(&mut self) -> Option<(usize, usize)>{
193 loop {
194 match self.next_back() {
195 SearchStep::Match(a, b) => return Some((a, b)),
196 SearchStep::Done => return None,
197 _ => continue,
198 }
199 }
200 }
201
9fa01778 202 /// Finds the next `Reject` result. See `next_back()`
c34b1796
AL
203 #[inline]
204 fn next_reject_back(&mut self) -> Option<(usize, usize)>{
205 loop {
206 match self.next_back() {
207 SearchStep::Reject(a, b) => return Some((a, b)),
208 SearchStep::Done => return None,
209 _ => continue,
210 }
211 }
212 }
213}
214
215/// A marker trait to express that a `ReverseSearcher`
216/// can be used for a `DoubleEndedIterator` implementation.
217///
218/// For this, the impl of `Searcher` and `ReverseSearcher` need
219/// to follow these conditions:
220///
221/// - All results of `next()` need to be identical
222/// to the results of `next_back()` in reverse order.
223/// - `next()` and `next_back()` need to behave as
224/// the two ends of a range of values, that is they
225/// can not "walk past each other".
226///
227/// # Examples
228///
229/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a
230/// `char` only requires looking at one at a time, which behaves the same
231/// from both ends.
232///
233/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because
234/// the pattern `"aa"` in the haystack `"aaa"` matches as either
235/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
236pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
237
ff7c6d11 238
9346a6ac 239/////////////////////////////////////////////////////////////////////////////
ff7c6d11 240// Impl for char
9346a6ac 241/////////////////////////////////////////////////////////////////////////////
c34b1796 242
ff7c6d11
XL
243/// Associated type for `<char as Pattern<'a>>::Searcher`.
244#[derive(Clone, Debug)]
245pub struct CharSearcher<'a> {
246 haystack: &'a str,
247 // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
248 // This invariant can be broken *within* next_match and next_match_back, however
249 // they must exit with fingers on valid code point boundaries.
250
251 /// `finger` is the current byte index of the forward search.
252 /// Imagine that it exists before the byte at its index, i.e.
83c7162d 253 /// `haystack[finger]` is the first byte of the slice we must inspect during
ff7c6d11
XL
254 /// forward searching
255 finger: usize,
256 /// `finger_back` is the current byte index of the reverse search.
257 /// Imagine that it exists after the byte at its index, i.e.
258 /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
259 /// forward searching (and thus the first byte to be inspected when calling next_back())
260 finger_back: usize,
261 /// The character being searched for
262 needle: char,
263
264 // safety invariant: `utf8_size` must be less than 5
265 /// The number of bytes `needle` takes up when encoded in utf8
266 utf8_size: usize,
267 /// A utf8 encoded copy of the `needle`
268 utf8_encoded: [u8; 4],
c34b1796
AL
269}
270
ff7c6d11
XL
271unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
272 #[inline]
273 fn haystack(&self) -> &'a str {
274 self.haystack
275 }
276 #[inline]
277 fn next(&mut self) -> SearchStep {
278 let old_finger = self.finger;
279 let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
280 let mut iter = slice.chars();
281 let old_len = iter.iter.len();
282 if let Some(ch) = iter.next() {
283 // add byte offset of current character
284 // without re-encoding as utf-8
285 self.finger += old_len - iter.iter.len();
286 if ch == self.needle {
287 SearchStep::Match(old_finger, self.finger)
288 } else {
289 SearchStep::Reject(old_finger, self.finger)
290 }
291 } else {
292 SearchStep::Done
293 }
294 }
c34b1796 295 #[inline]
ff7c6d11
XL
296 fn next_match(&mut self) -> Option<(usize, usize)> {
297 loop {
298 // get the haystack after the last character found
60c5eb7d 299 let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
ff7c6d11
XL
300 // the last byte of the utf8 encoded needle
301 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
302 if let Some(index) = memchr::memchr(last_byte, bytes) {
303 // The new finger is the index of the byte we found,
304 // plus one, since we memchr'd for the last byte of the character.
305 //
306 // Note that this doesn't always give us a finger on a UTF8 boundary.
307 // If we *didn't* find our character
308 // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
309 // We can't just skip to the next valid starting byte because a character like
310 // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
311 // the second byte when searching for the third.
312 //
313 // However, this is totally okay. While we have the invariant that
0531ce1d 314 // self.finger is on a UTF8 boundary, this invariant is not relied upon
ff7c6d11
XL
315 // within this method (it is relied upon in CharSearcher::next()).
316 //
317 // We only exit this method when we reach the end of the string, or if we
318 // find something. When we find something the `finger` will be set
319 // to a UTF8 boundary.
320 self.finger += index + 1;
321 if self.finger >= self.utf8_size {
322 let found_char = self.finger - self.utf8_size;
323 if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
324 if slice == &self.utf8_encoded[0..self.utf8_size] {
325 return Some((found_char, self.finger));
326 }
327 }
328 }
329 } else {
330 // found nothing, exit
331 self.finger = self.finger_back;
332 return None;
333 }
334 }
335 }
336
337 // let next_reject use the default implementation from the Searcher trait
338}
c34b1796 339
ff7c6d11 340unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
c34b1796 341 #[inline]
ff7c6d11
XL
342 fn next_back(&mut self) -> SearchStep {
343 let old_finger = self.finger_back;
8faf50e0 344 let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
ff7c6d11
XL
345 let mut iter = slice.chars();
346 let old_len = iter.iter.len();
347 if let Some(ch) = iter.next_back() {
348 // subtract byte offset of current character
349 // without re-encoding as utf-8
350 self.finger_back -= old_len - iter.iter.len();
351 if ch == self.needle {
352 SearchStep::Match(self.finger_back, old_finger)
353 } else {
354 SearchStep::Reject(self.finger_back, old_finger)
355 }
356 } else {
357 SearchStep::Done
358 }
359 }
360 #[inline]
361 fn next_match_back(&mut self) -> Option<(usize, usize)> {
362 let haystack = self.haystack.as_bytes();
363 loop {
364 // get the haystack up to but not including the last character searched
365 let bytes = if let Some(slice) = haystack.get(self.finger..self.finger_back) {
366 slice
367 } else {
368 return None;
369 };
370 // the last byte of the utf8 encoded needle
371 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
372 if let Some(index) = memchr::memrchr(last_byte, bytes) {
373 // we searched a slice that was offset by self.finger,
374 // add self.finger to recoup the original index
375 let index = self.finger + index;
376 // memrchr will return the index of the byte we wish to
377 // find. In case of an ASCII character, this is indeed
378 // were we wish our new finger to be ("after" the found
379 // char in the paradigm of reverse iteration). For
380 // multibyte chars we need to skip down by the number of more
381 // bytes they have than ASCII
382 let shift = self.utf8_size - 1;
383 if index >= shift {
384 let found_char = index - shift;
385 if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
386 if slice == &self.utf8_encoded[0..self.utf8_size] {
0731742a 387 // move finger to before the character found (i.e., at its start index)
ff7c6d11
XL
388 self.finger_back = found_char;
389 return Some((self.finger_back, self.finger_back + self.utf8_size));
390 }
391 }
392 }
393 // We can't use finger_back = index - size + 1 here. If we found the last char
394 // of a different-sized character (or the middle byte of a different character)
395 // we need to bump the finger_back down to `index`. This similarly makes
396 // `finger_back` have the potential to no longer be on a boundary,
397 // but this is OK since we only exit this function on a boundary
398 // or when the haystack has been searched completely.
399 //
400 // Unlike next_match this does not
401 // have the problem of repeated bytes in utf-8 because
402 // we're searching for the last byte, and we can only have
403 // found the last byte when searching in reverse.
404 self.finger_back = index;
405 } else {
406 self.finger_back = self.finger;
407 // found nothing, exit
408 return None;
409 }
410 }
411 }
412
413 // let next_reject_back use the default implementation from the Searcher trait
c34b1796
AL
414}
415
ff7c6d11
XL
416impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
417
418/// Searches for chars that are equal to a given char
419impl<'a> Pattern<'a> for char {
420 type Searcher = CharSearcher<'a>;
421
c34b1796 422 #[inline]
ff7c6d11
XL
423 fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
424 let mut utf8_encoded = [0; 4];
0731742a 425 let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
ff7c6d11
XL
426 CharSearcher {
427 haystack,
428 finger: 0,
429 finger_back: haystack.len(),
430 needle: self,
431 utf8_size,
432 utf8_encoded
433 }
434 }
c34b1796
AL
435
436 #[inline]
ff7c6d11
XL
437 fn is_contained_in(self, haystack: &'a str) -> bool {
438 if (self as u32) < 128 {
439 haystack.as_bytes().contains(&(self as u8))
440 } else {
441 let mut buffer = [0u8; 4];
442 self.encode_utf8(&mut buffer).is_contained_in(haystack)
443 }
444 }
c34b1796 445
c34b1796 446 #[inline]
ff7c6d11 447 fn is_prefix_of(self, haystack: &'a str) -> bool {
60c5eb7d 448 self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack)
ff7c6d11
XL
449 }
450
451 #[inline]
452 fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
453 {
60c5eb7d 454 self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack)
c34b1796 455 }
ff7c6d11
XL
456}
457
458/////////////////////////////////////////////////////////////////////////////
459// Impl for a MultiCharEq wrapper
460/////////////////////////////////////////////////////////////////////////////
461
462#[doc(hidden)]
463trait MultiCharEq {
464 fn matches(&mut self, c: char) -> bool;
465}
c34b1796 466
ff7c6d11 467impl<F> MultiCharEq for F where F: FnMut(char) -> bool {
c34b1796 468 #[inline]
ff7c6d11
XL
469 fn matches(&mut self, c: char) -> bool { (*self)(c) }
470}
471
0bf4aa26 472impl MultiCharEq for &[char] {
ff7c6d11
XL
473 #[inline]
474 fn matches(&mut self, c: char) -> bool {
475 self.iter().any(|&m| { m == c })
c34b1796
AL
476 }
477}
478
ff7c6d11 479struct MultiCharEqPattern<C: MultiCharEq>(C);
c34b1796 480
54a0048b 481#[derive(Clone, Debug)]
ff7c6d11 482struct MultiCharEqSearcher<'a, C: MultiCharEq> {
c34b1796
AL
483 char_eq: C,
484 haystack: &'a str,
485 char_indices: super::CharIndices<'a>,
c34b1796
AL
486}
487
ff7c6d11
XL
488impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
489 type Searcher = MultiCharEqSearcher<'a, C>;
c34b1796
AL
490
491 #[inline]
ff7c6d11
XL
492 fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
493 MultiCharEqSearcher {
3b2f2976 494 haystack,
c34b1796
AL
495 char_eq: self.0,
496 char_indices: haystack.char_indices(),
497 }
498 }
499}
500
ff7c6d11 501unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
c34b1796
AL
502 #[inline]
503 fn haystack(&self) -> &'a str {
504 self.haystack
505 }
506
507 #[inline]
508 fn next(&mut self) -> SearchStep {
509 let s = &mut self.char_indices;
510 // Compare lengths of the internal byte slice iterator
511 // to find length of current char
3157f602 512 let pre_len = s.iter.iter.len();
c34b1796 513 if let Some((i, c)) = s.next() {
3157f602 514 let len = s.iter.iter.len();
c34b1796
AL
515 let char_len = pre_len - len;
516 if self.char_eq.matches(c) {
517 return SearchStep::Match(i, i + char_len);
518 } else {
519 return SearchStep::Reject(i, i + char_len);
520 }
521 }
522 SearchStep::Done
523 }
524}
525
ff7c6d11 526unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
c34b1796
AL
527 #[inline]
528 fn next_back(&mut self) -> SearchStep {
529 let s = &mut self.char_indices;
530 // Compare lengths of the internal byte slice iterator
531 // to find length of current char
3157f602 532 let pre_len = s.iter.iter.len();
c34b1796 533 if let Some((i, c)) = s.next_back() {
3157f602 534 let len = s.iter.iter.len();
c34b1796
AL
535 let char_len = pre_len - len;
536 if self.char_eq.matches(c) {
537 return SearchStep::Match(i, i + char_len);
538 } else {
539 return SearchStep::Reject(i, i + char_len);
540 }
541 }
542 SearchStep::Done
543 }
544}
545
ff7c6d11 546impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
c34b1796 547
9346a6ac
AL
548/////////////////////////////////////////////////////////////////////////////
549
550macro_rules! pattern_methods {
551 ($t:ty, $pmap:expr, $smap:expr) => {
552 type Searcher = $t;
553
554 #[inline]
555 fn into_searcher(self, haystack: &'a str) -> $t {
556 ($smap)(($pmap)(self).into_searcher(haystack))
c34b1796 557 }
9346a6ac 558
c34b1796
AL
559 #[inline]
560 fn is_contained_in(self, haystack: &'a str) -> bool {
9346a6ac 561 ($pmap)(self).is_contained_in(haystack)
c34b1796 562 }
9346a6ac 563
c34b1796
AL
564 #[inline]
565 fn is_prefix_of(self, haystack: &'a str) -> bool {
9346a6ac 566 ($pmap)(self).is_prefix_of(haystack)
c34b1796 567 }
9346a6ac 568
c34b1796
AL
569 #[inline]
570 fn is_suffix_of(self, haystack: &'a str) -> bool
9346a6ac 571 where $t: ReverseSearcher<'a>
c34b1796 572 {
9346a6ac 573 ($pmap)(self).is_suffix_of(haystack)
c34b1796
AL
574 }
575 }
576}
577
9346a6ac
AL
578macro_rules! searcher_methods {
579 (forward) => {
580 #[inline]
581 fn haystack(&self) -> &'a str {
582 self.0.haystack()
583 }
584 #[inline]
585 fn next(&mut self) -> SearchStep {
586 self.0.next()
587 }
588 #[inline]
589 fn next_match(&mut self) -> Option<(usize, usize)> {
590 self.0.next_match()
591 }
592 #[inline]
593 fn next_reject(&mut self) -> Option<(usize, usize)> {
594 self.0.next_reject()
595 }
596 };
597 (reverse) => {
598 #[inline]
599 fn next_back(&mut self) -> SearchStep {
600 self.0.next_back()
601 }
602 #[inline]
603 fn next_match_back(&mut self) -> Option<(usize, usize)> {
604 self.0.next_match_back()
605 }
606 #[inline]
607 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
608 self.0.next_reject_back()
609 }
610 }
c34b1796
AL
611}
612
9346a6ac
AL
613/////////////////////////////////////////////////////////////////////////////
614// Impl for &[char]
615/////////////////////////////////////////////////////////////////////////////
616
617// Todo: Change / Remove due to ambiguity in meaning.
618
619/// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
54a0048b 620#[derive(Clone, Debug)]
ff7c6d11 621pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
c34b1796
AL
622
623unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
9346a6ac 624 searcher_methods!(forward);
c34b1796 625}
9346a6ac 626
c34b1796 627unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> {
9346a6ac 628 searcher_methods!(reverse);
c34b1796 629}
c34b1796 630
9346a6ac 631impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
c34b1796 632
9346a6ac
AL
633/// Searches for chars that are equal to any of the chars in the array
634impl<'a, 'b> Pattern<'a> for &'b [char] {
ff7c6d11 635 pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
c34b1796
AL
636}
637
9346a6ac
AL
638/////////////////////////////////////////////////////////////////////////////
639// Impl for F: FnMut(char) -> bool
640/////////////////////////////////////////////////////////////////////////////
641
642/// Associated type for `<F as Pattern<'a>>::Searcher`.
643#[derive(Clone)]
ff7c6d11 644pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
9346a6ac 645 where F: FnMut(char) -> bool;
c34b1796 646
0bf4aa26 647impl<F> fmt::Debug for CharPredicateSearcher<'_, F>
54a0048b
SL
648 where F: FnMut(char) -> bool
649{
48663c56 650 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54a0048b
SL
651 f.debug_struct("CharPredicateSearcher")
652 .field("haystack", &self.0.haystack)
653 .field("char_indices", &self.0.char_indices)
54a0048b
SL
654 .finish()
655 }
656}
9346a6ac 657unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F>
c34b1796
AL
658 where F: FnMut(char) -> bool
659{
9346a6ac 660 searcher_methods!(forward);
c34b1796 661}
9346a6ac
AL
662
663unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F>
c34b1796
AL
664 where F: FnMut(char) -> bool
665{
9346a6ac 666 searcher_methods!(reverse);
c34b1796 667}
c34b1796 668
9346a6ac
AL
669impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F>
670 where F: FnMut(char) -> bool {}
c34b1796 671
9346a6ac
AL
672/// Searches for chars that match the given predicate
673impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool {
ff7c6d11 674 pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
9346a6ac
AL
675}
676
677/////////////////////////////////////////////////////////////////////////////
678// Impl for &&str
679/////////////////////////////////////////////////////////////////////////////
680
681/// Delegates to the `&str` impl.
54a0048b 682impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str {
9346a6ac 683 pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
c34b1796 684}
c1a9b12d
SL
685
686/////////////////////////////////////////////////////////////////////////////
687// Impl for &str
688/////////////////////////////////////////////////////////////////////////////
689
690/// Non-allocating substring search.
691///
692/// Will handle the pattern `""` as returning empty matches at each character
693/// boundary.
694impl<'a, 'b> Pattern<'a> for &'b str {
695 type Searcher = StrSearcher<'a, 'b>;
696
697 #[inline]
698 fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
699 StrSearcher::new(haystack, self)
700 }
701
702 /// Checks whether the pattern matches at the front of the haystack
703 #[inline]
704 fn is_prefix_of(self, haystack: &'a str) -> bool {
60c5eb7d 705 haystack.as_bytes().starts_with(self.as_bytes())
c1a9b12d
SL
706 }
707
708 /// Checks whether the pattern matches at the back of the haystack
709 #[inline]
710 fn is_suffix_of(self, haystack: &'a str) -> bool {
60c5eb7d 711 haystack.as_bytes().ends_with(self.as_bytes())
c1a9b12d
SL
712 }
713}
714
715
716/////////////////////////////////////////////////////////////////////////////
717// Two Way substring searcher
718/////////////////////////////////////////////////////////////////////////////
719
720#[derive(Clone, Debug)]
721/// Associated type for `<&str as Pattern<'a>>::Searcher`.
722pub struct StrSearcher<'a, 'b> {
723 haystack: &'a str,
724 needle: &'b str,
725
726 searcher: StrSearcherImpl,
727}
728
729#[derive(Clone, Debug)]
730enum StrSearcherImpl {
731 Empty(EmptyNeedle),
732 TwoWay(TwoWaySearcher),
733}
734
735#[derive(Clone, Debug)]
736struct EmptyNeedle {
737 position: usize,
738 end: usize,
739 is_match_fw: bool,
740 is_match_bw: bool,
741}
742
743impl<'a, 'b> StrSearcher<'a, 'b> {
744 fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
745 if needle.is_empty() {
746 StrSearcher {
3b2f2976
XL
747 haystack,
748 needle,
c1a9b12d
SL
749 searcher: StrSearcherImpl::Empty(EmptyNeedle {
750 position: 0,
751 end: haystack.len(),
752 is_match_fw: true,
753 is_match_bw: true,
754 }),
755 }
756 } else {
757 StrSearcher {
3b2f2976
XL
758 haystack,
759 needle,
c1a9b12d
SL
760 searcher: StrSearcherImpl::TwoWay(
761 TwoWaySearcher::new(needle.as_bytes(), haystack.len())
762 ),
763 }
764 }
765 }
766}
767
768unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
041b39d2
XL
769 #[inline]
770 fn haystack(&self) -> &'a str {
771 self.haystack
772 }
c1a9b12d
SL
773
774 #[inline]
775 fn next(&mut self) -> SearchStep {
776 match self.searcher {
777 StrSearcherImpl::Empty(ref mut searcher) => {
778 // empty needle rejects every char and matches every empty string between them
779 let is_match = searcher.is_match_fw;
780 searcher.is_match_fw = !searcher.is_match_fw;
781 let pos = searcher.position;
782 match self.haystack[pos..].chars().next() {
783 _ if is_match => SearchStep::Match(pos, pos),
784 None => SearchStep::Done,
785 Some(ch) => {
786 searcher.position += ch.len_utf8();
787 SearchStep::Reject(pos, searcher.position)
788 }
789 }
790 }
791 StrSearcherImpl::TwoWay(ref mut searcher) => {
792 // TwoWaySearcher produces valid *Match* indices that split at char boundaries
793 // as long as it does correct matching and that haystack and needle are
794 // valid UTF-8
795 // *Rejects* from the algorithm can fall on any indices, but we will walk them
796 // manually to the next character boundary, so that they are utf-8 safe.
797 if searcher.position == self.haystack.len() {
798 return SearchStep::Done;
799 }
800 let is_long = searcher.memory == usize::MAX;
801 match searcher.next::<RejectAndMatch>(self.haystack.as_bytes(),
802 self.needle.as_bytes(),
803 is_long)
804 {
805 SearchStep::Reject(a, mut b) => {
806 // skip to next char boundary
807 while !self.haystack.is_char_boundary(b) {
808 b += 1;
809 }
810 searcher.position = cmp::max(b, searcher.position);
811 SearchStep::Reject(a, b)
812 }
813 otherwise => otherwise,
814 }
815 }
816 }
817 }
818
3b2f2976 819 #[inline]
c1a9b12d
SL
820 fn next_match(&mut self) -> Option<(usize, usize)> {
821 match self.searcher {
822 StrSearcherImpl::Empty(..) => {
823 loop {
824 match self.next() {
825 SearchStep::Match(a, b) => return Some((a, b)),
826 SearchStep::Done => return None,
827 SearchStep::Reject(..) => { }
828 }
829 }
830 }
831 StrSearcherImpl::TwoWay(ref mut searcher) => {
832 let is_long = searcher.memory == usize::MAX;
e9174d1e
SL
833 // write out `true` and `false` cases to encourage the compiler
834 // to specialize the two cases separately.
c1a9b12d
SL
835 if is_long {
836 searcher.next::<MatchOnly>(self.haystack.as_bytes(),
837 self.needle.as_bytes(),
838 true)
839 } else {
840 searcher.next::<MatchOnly>(self.haystack.as_bytes(),
841 self.needle.as_bytes(),
842 false)
843 }
844 }
845 }
846 }
c1a9b12d 847}
e9174d1e 848
c1a9b12d
SL
849unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
850 #[inline]
851 fn next_back(&mut self) -> SearchStep {
852 match self.searcher {
853 StrSearcherImpl::Empty(ref mut searcher) => {
854 let is_match = searcher.is_match_bw;
855 searcher.is_match_bw = !searcher.is_match_bw;
856 let end = searcher.end;
857 match self.haystack[..end].chars().next_back() {
858 _ if is_match => SearchStep::Match(end, end),
859 None => SearchStep::Done,
860 Some(ch) => {
861 searcher.end -= ch.len_utf8();
862 SearchStep::Reject(searcher.end, end)
863 }
864 }
865 }
866 StrSearcherImpl::TwoWay(ref mut searcher) => {
867 if searcher.end == 0 {
868 return SearchStep::Done;
869 }
e9174d1e 870 let is_long = searcher.memory == usize::MAX;
c1a9b12d 871 match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(),
e9174d1e
SL
872 self.needle.as_bytes(),
873 is_long)
c1a9b12d
SL
874 {
875 SearchStep::Reject(mut a, b) => {
876 // skip to next char boundary
877 while !self.haystack.is_char_boundary(a) {
878 a -= 1;
879 }
880 searcher.end = cmp::min(a, searcher.end);
881 SearchStep::Reject(a, b)
882 }
883 otherwise => otherwise,
884 }
885 }
886 }
887 }
888
889 #[inline]
890 fn next_match_back(&mut self) -> Option<(usize, usize)> {
891 match self.searcher {
892 StrSearcherImpl::Empty(..) => {
893 loop {
894 match self.next_back() {
895 SearchStep::Match(a, b) => return Some((a, b)),
896 SearchStep::Done => return None,
897 SearchStep::Reject(..) => { }
898 }
899 }
900 }
901 StrSearcherImpl::TwoWay(ref mut searcher) => {
e9174d1e
SL
902 let is_long = searcher.memory == usize::MAX;
903 // write out `true` and `false`, like `next_match`
904 if is_long {
905 searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
906 self.needle.as_bytes(),
907 true)
908 } else {
909 searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
910 self.needle.as_bytes(),
911 false)
912 }
c1a9b12d
SL
913 }
914 }
915 }
916}
917
e9174d1e 918/// The internal state of the two-way substring search algorithm.
c1a9b12d
SL
919#[derive(Clone, Debug)]
920struct TwoWaySearcher {
921 // constants
e9174d1e 922 /// critical factorization index
c1a9b12d 923 crit_pos: usize,
e9174d1e
SL
924 /// critical factorization index for reversed needle
925 crit_pos_back: usize,
c1a9b12d 926 period: usize,
e9174d1e
SL
927 /// `byteset` is an extension (not part of the two way algorithm);
928 /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
929 /// to a (byte & 63) == j present in the needle.
c1a9b12d
SL
930 byteset: u64,
931
932 // variables
933 position: usize,
934 end: usize,
e9174d1e
SL
935 /// index into needle before which we have already matched
936 memory: usize,
937 /// index into needle after which we have already matched
938 memory_back: usize,
c1a9b12d
SL
939}
940
941/*
942 This is the Two-Way search algorithm, which was introduced in the paper:
943 Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
944
945 Here's some background information.
946
947 A *word* is a string of symbols. The *length* of a word should be a familiar
948 notion, and here we denote it for any word x by |x|.
949 (We also allow for the possibility of the *empty word*, a word of length zero).
950
951 If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
952 *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
953 For example, both 1 and 2 are periods for the string "aa". As another example,
954 the only period of the string "abcd" is 4.
955
956 We denote by period(x) the *smallest* period of x (provided that x is non-empty).
957 This is always well-defined since every non-empty word x has at least one period,
958 |x|. We sometimes call this *the period* of x.
959
960 If u, v and x are words such that x = uv, where uv is the concatenation of u and
961 v, then we say that (u, v) is a *factorization* of x.
962
963 Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
964 that both of the following hold
965
966 - either w is a suffix of u or u is a suffix of w
967 - either w is a prefix of v or v is a prefix of w
968
969 then w is said to be a *repetition* for the factorization (u, v).
970
971 Just to unpack this, there are four possibilities here. Let w = "abc". Then we
972 might have:
973
974 - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
975 - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
976 - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
977 - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
978
979 Note that the word vu is a repetition for any factorization (u,v) of x = uv,
980 so every factorization has at least one repetition.
981
982 If x is a string and (u, v) is a factorization for x, then a *local period* for
983 (u, v) is an integer r such that there is some word w such that |w| = r and w is
984 a repetition for (u, v).
985
986 We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
987 call this *the local period* of (u, v). Provided that x = uv is non-empty, this
988 is well-defined (because each non-empty word has at least one factorization, as
989 noted above).
990
991 It can be proven that the following is an equivalent definition of a local period
992 for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
993 all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
0731742a 994 defined. (i.e., i > 0 and i + r < |x|).
c1a9b12d
SL
995
996 Using the above reformulation, it is easy to prove that
997
998 1 <= local_period(u, v) <= period(uv)
999
1000 A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
1001 *critical factorization*.
1002
1003 The algorithm hinges on the following theorem, which is stated without proof:
1004
1005 **Critical Factorization Theorem** Any word x has at least one critical
1006 factorization (u, v) such that |u| < period(x).
1007
1008 The purpose of maximal_suffix is to find such a critical factorization.
1009
e9174d1e
SL
1010 If the period is short, compute another factorization x = u' v' to use
1011 for reverse search, chosen instead so that |v'| < period(x).
1012
c1a9b12d
SL
1013*/
1014impl TwoWaySearcher {
1015 fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
1016 let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
1017 let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
1018
1019 let (crit_pos, period) =
1020 if crit_pos_false > crit_pos_true {
1021 (crit_pos_false, period_false)
1022 } else {
1023 (crit_pos_true, period_true)
1024 };
1025
c1a9b12d
SL
1026 // A particularly readable explanation of what's going on here can be found
1027 // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
1028 // see the code for "Algorithm CP" on p. 323.
1029 //
1030 // What's going on is we have some critical factorization (u, v) of the
1031 // needle, and we want to determine whether u is a suffix of
1032 // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
1033 // "Algorithm CP2", which is optimized for when the period of the needle
1034 // is large.
1035 if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
e9174d1e
SL
1036 // short period case -- the period is exact
1037 // compute a separate critical factorization for the reversed needle
1038 // x = u' v' where |v'| < period(x).
1039 //
1040 // This is sped up by the period being known already.
1041 // Note that a case like x = "acba" may be factored exactly forwards
1042 // (crit_pos = 1, period = 3) while being factored with approximate
1043 // period in reverse (crit_pos = 2, period = 2). We use the given
1044 // reverse factorization but keep the exact period.
1045 let crit_pos_back = needle.len() - cmp::max(
1046 TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
1047 TwoWaySearcher::reverse_maximal_suffix(needle, period, true));
1048
c1a9b12d 1049 TwoWaySearcher {
3b2f2976
XL
1050 crit_pos,
1051 crit_pos_back,
1052 period,
e9174d1e 1053 byteset: Self::byteset_create(&needle[..period]),
c1a9b12d
SL
1054
1055 position: 0,
3b2f2976 1056 end,
e9174d1e
SL
1057 memory: 0,
1058 memory_back: needle.len(),
c1a9b12d
SL
1059 }
1060 } else {
e9174d1e
SL
1061 // long period case -- we have an approximation to the actual period,
1062 // and don't use memorization.
1063 //
1064 // Approximate the period by lower bound max(|u|, |v|) + 1.
1065 // The critical factorization is efficient to use for both forward and
1066 // reverse search.
1067
c1a9b12d 1068 TwoWaySearcher {
3b2f2976 1069 crit_pos,
e9174d1e 1070 crit_pos_back: crit_pos,
c1a9b12d 1071 period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
e9174d1e 1072 byteset: Self::byteset_create(needle),
c1a9b12d
SL
1073
1074 position: 0,
3b2f2976 1075 end,
e9174d1e
SL
1076 memory: usize::MAX, // Dummy value to signify that the period is long
1077 memory_back: usize::MAX,
c1a9b12d
SL
1078 }
1079 }
1080 }
1081
e9174d1e
SL
1082 #[inline]
1083 fn byteset_create(bytes: &[u8]) -> u64 {
1084 bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
1085 }
1086
3b2f2976 1087 #[inline]
c1a9b12d
SL
1088 fn byteset_contains(&self, byte: u8) -> bool {
1089 (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
1090 }
1091
1092 // One of the main ideas of Two-Way is that we factorize the needle into
1093 // two halves, (u, v), and begin trying to find v in the haystack by scanning
1094 // left to right. If v matches, we try to match u by scanning right to left.
1095 // How far we can jump when we encounter a mismatch is all based on the fact
1096 // that (u, v) is a critical factorization for the needle.
3b2f2976 1097 #[inline]
c1a9b12d
SL
1098 fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
1099 -> S::Output
1100 where S: TwoWayStrategy
1101 {
1102 // `next()` uses `self.position` as its cursor
1103 let old_pos = self.position;
e9174d1e 1104 let needle_last = needle.len() - 1;
c1a9b12d
SL
1105 'search: loop {
1106 // Check that we have room to search in
e9174d1e
SL
1107 // position + needle_last can not overflow if we assume slices
1108 // are bounded by isize's range.
1109 let tail_byte = match haystack.get(self.position + needle_last) {
1110 Some(&b) => b,
1111 None => {
1112 self.position = haystack.len();
1113 return S::rejecting(old_pos, self.position);
1114 }
1115 };
c1a9b12d
SL
1116
1117 if S::use_early_reject() && old_pos != self.position {
1118 return S::rejecting(old_pos, self.position);
1119 }
1120
1121 // Quickly skip by large portions unrelated to our substring
e9174d1e 1122 if !self.byteset_contains(tail_byte) {
c1a9b12d
SL
1123 self.position += needle.len();
1124 if !long_period {
1125 self.memory = 0;
1126 }
1127 continue 'search;
1128 }
1129
1130 // See if the right part of the needle matches
1131 let start = if long_period { self.crit_pos }
1132 else { cmp::max(self.crit_pos, self.memory) };
1133 for i in start..needle.len() {
1134 if needle[i] != haystack[self.position + i] {
1135 self.position += i - self.crit_pos + 1;
1136 if !long_period {
1137 self.memory = 0;
1138 }
1139 continue 'search;
1140 }
1141 }
1142
1143 // See if the left part of the needle matches
1144 let start = if long_period { 0 } else { self.memory };
1145 for i in (start..self.crit_pos).rev() {
1146 if needle[i] != haystack[self.position + i] {
1147 self.position += self.period;
1148 if !long_period {
1149 self.memory = needle.len() - self.period;
1150 }
1151 continue 'search;
1152 }
1153 }
1154
1155 // We have found a match!
1156 let match_pos = self.position;
1157
1158 // Note: add self.period instead of needle.len() to have overlapping matches
1159 self.position += needle.len();
1160 if !long_period {
1161 self.memory = 0; // set to needle.len() - self.period for overlapping matches
1162 }
1163
1164 return S::matching(match_pos, match_pos + needle.len());
1165 }
1166 }
1167
1168 // Follows the ideas in `next()`.
1169 //
e9174d1e 1170 // The definitions are symmetrical, with period(x) = period(reverse(x))
c1a9b12d 1171 // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
e9174d1e
SL
1172 // is a critical factorization, so is (reverse(v), reverse(u)).
1173 //
1174 // For the reverse case we have computed a critical factorization x = u' v'
1175 // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
1176 // thus |v'| < period(x) for the reverse.
c1a9b12d
SL
1177 //
1178 // To search in reverse through the haystack, we search forward through
e9174d1e 1179 // a reversed haystack with a reversed needle, matching first u' and then v'.
c1a9b12d 1180 #[inline]
e9174d1e 1181 fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
c1a9b12d
SL
1182 -> S::Output
1183 where S: TwoWayStrategy
1184 {
1185 // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
1186 // are independent.
1187 let old_end = self.end;
1188 'search: loop {
1189 // Check that we have room to search in
e9174d1e
SL
1190 // end - needle.len() will wrap around when there is no more room,
1191 // but due to slice length limits it can never wrap all the way back
1192 // into the length of haystack.
1193 let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) {
1194 Some(&b) => b,
1195 None => {
1196 self.end = 0;
1197 return S::rejecting(0, old_end);
1198 }
1199 };
c1a9b12d
SL
1200
1201 if S::use_early_reject() && old_end != self.end {
1202 return S::rejecting(self.end, old_end);
1203 }
1204
1205 // Quickly skip by large portions unrelated to our substring
e9174d1e 1206 if !self.byteset_contains(front_byte) {
c1a9b12d 1207 self.end -= needle.len();
e9174d1e
SL
1208 if !long_period {
1209 self.memory_back = needle.len();
1210 }
c1a9b12d
SL
1211 continue 'search;
1212 }
1213
1214 // See if the left part of the needle matches
e9174d1e
SL
1215 let crit = if long_period { self.crit_pos_back }
1216 else { cmp::min(self.crit_pos_back, self.memory_back) };
1217 for i in (0..crit).rev() {
c1a9b12d 1218 if needle[i] != haystack[self.end - needle.len() + i] {
e9174d1e
SL
1219 self.end -= self.crit_pos_back - i;
1220 if !long_period {
1221 self.memory_back = needle.len();
1222 }
c1a9b12d
SL
1223 continue 'search;
1224 }
1225 }
1226
1227 // See if the right part of the needle matches
e9174d1e
SL
1228 let needle_end = if long_period { needle.len() }
1229 else { self.memory_back };
1230 for i in self.crit_pos_back..needle_end {
c1a9b12d
SL
1231 if needle[i] != haystack[self.end - needle.len() + i] {
1232 self.end -= self.period;
e9174d1e
SL
1233 if !long_period {
1234 self.memory_back = self.period;
1235 }
c1a9b12d
SL
1236 continue 'search;
1237 }
1238 }
1239
1240 // We have found a match!
1241 let match_pos = self.end - needle.len();
1242 // Note: sub self.period instead of needle.len() to have overlapping matches
1243 self.end -= needle.len();
e9174d1e
SL
1244 if !long_period {
1245 self.memory_back = needle.len();
1246 }
c1a9b12d
SL
1247
1248 return S::matching(match_pos, match_pos + needle.len());
1249 }
1250 }
1251
e9174d1e
SL
1252 // Compute the maximal suffix of `arr`.
1253 //
1254 // The maximal suffix is a possible critical factorization (u, v) of `arr`.
1255 //
1256 // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1257 // period of v.
1258 //
1259 // `order_greater` determines if lexical order is `<` or `>`. Both
1260 // orders must be computed -- the ordering with the largest `i` gives
1261 // a critical factorization.
1262 //
1263 // For long period cases, the resulting period is not exact (it is too short).
c1a9b12d 1264 #[inline]
e9174d1e
SL
1265 fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1266 let mut left = 0; // Corresponds to i in the paper
1267 let mut right = 1; // Corresponds to j in the paper
1268 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1269 // to match 0-based indexing.
c1a9b12d
SL
1270 let mut period = 1; // Corresponds to p in the paper
1271
e9174d1e
SL
1272 while let Some(&a) = arr.get(right + offset) {
1273 // `left` will be inbounds when `right` is.
1274 let b = arr[left + offset];
1275 if (a < b && !order_greater) || (a > b && order_greater) {
1276 // Suffix is smaller, period is entire prefix so far.
1277 right += offset + 1;
1278 offset = 0;
1279 period = right - left;
1280 } else if a == b {
1281 // Advance through repetition of the current period.
1282 if offset + 1 == period {
1283 right += offset + 1;
1284 offset = 0;
1285 } else {
1286 offset += 1;
1287 }
c1a9b12d 1288 } else {
e9174d1e
SL
1289 // Suffix is larger, start over from current location.
1290 left = right;
1291 right += 1;
1292 offset = 0;
1293 period = 1;
c1a9b12d 1294 }
e9174d1e
SL
1295 }
1296 (left, period)
1297 }
1298
1299 // Compute the maximal suffix of the reverse of `arr`.
1300 //
1301 // The maximal suffix is a possible critical factorization (u', v') of `arr`.
1302 //
1303 // Returns `i` where `i` is the starting index of v', from the back;
3b2f2976 1304 // returns immediately when a period of `known_period` is reached.
e9174d1e
SL
1305 //
1306 // `order_greater` determines if lexical order is `<` or `>`. Both
1307 // orders must be computed -- the ordering with the largest `i` gives
1308 // a critical factorization.
1309 //
1310 // For long period cases, the resulting period is not exact (it is too short).
1311 fn reverse_maximal_suffix(arr: &[u8], known_period: usize,
1312 order_greater: bool) -> usize
1313 {
1314 let mut left = 0; // Corresponds to i in the paper
1315 let mut right = 1; // Corresponds to j in the paper
1316 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1317 // to match 0-based indexing.
1318 let mut period = 1; // Corresponds to p in the paper
1319 let n = arr.len();
1320
1321 while right + offset < n {
1322 let a = arr[n - (1 + right + offset)];
1323 let b = arr[n - (1 + left + offset)];
1324 if (a < b && !order_greater) || (a > b && order_greater) {
c1a9b12d 1325 // Suffix is smaller, period is entire prefix so far.
e9174d1e
SL
1326 right += offset + 1;
1327 offset = 0;
1328 period = right - left;
c1a9b12d
SL
1329 } else if a == b {
1330 // Advance through repetition of the current period.
e9174d1e
SL
1331 if offset + 1 == period {
1332 right += offset + 1;
1333 offset = 0;
c1a9b12d
SL
1334 } else {
1335 offset += 1;
1336 }
1337 } else {
1338 // Suffix is larger, start over from current location.
1339 left = right;
1340 right += 1;
e9174d1e 1341 offset = 0;
c1a9b12d
SL
1342 period = 1;
1343 }
e9174d1e
SL
1344 if period == known_period {
1345 break;
1346 }
c1a9b12d 1347 }
e9174d1e
SL
1348 debug_assert!(period <= known_period);
1349 left
c1a9b12d
SL
1350 }
1351}
1352
1353// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
1354// as possible, or to work in a mode where it emits Rejects relatively quickly.
1355trait TwoWayStrategy {
1356 type Output;
1357 fn use_early_reject() -> bool;
8bb4bdeb
XL
1358 fn rejecting(a: usize, b: usize) -> Self::Output;
1359 fn matching(a: usize, b: usize) -> Self::Output;
c1a9b12d
SL
1360}
1361
1362/// Skip to match intervals as quickly as possible
1363enum MatchOnly { }
1364
1365impl TwoWayStrategy for MatchOnly {
1366 type Output = Option<(usize, usize)>;
1367
1368 #[inline]
1369 fn use_early_reject() -> bool { false }
1370 #[inline]
1371 fn rejecting(_a: usize, _b: usize) -> Self::Output { None }
1372 #[inline]
1373 fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) }
1374}
1375
1376/// Emit Rejects regularly
1377enum RejectAndMatch { }
1378
1379impl TwoWayStrategy for RejectAndMatch {
1380 type Output = SearchStep;
1381
1382 #[inline]
1383 fn use_early_reject() -> bool { true }
1384 #[inline]
1385 fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) }
1386 #[inline]
1387 fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) }
1388}