]>
Commit | Line | Data |
---|---|---|
9346a6ac AL |
1 | //! The string Pattern API. |
2 | //! | |
ba9703b0 XL |
3 | //! The Pattern API provides a generic mechanism for using different pattern |
4 | //! types when searching through a string. | |
5 | //! | |
9fa01778 XL |
6 | //! For more details, see the traits [`Pattern`], [`Searcher`], |
7 | //! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. | |
ba9703b0 XL |
8 | //! |
9 | //! Although this API is unstable, it is exposed via stable APIs on the | |
10 | //! [`str`] type. | |
11 | //! | |
12 | //! # Examples | |
13 | //! | |
14 | //! [`Pattern`] is [implemented][pattern-impls] in the stable API for | |
3dfed10e | 15 | //! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures |
ba9703b0 XL |
16 | //! implementing `FnMut(char) -> bool`. |
17 | //! | |
18 | //! ``` | |
19 | //! let s = "Can you find a needle in a haystack?"; | |
20 | //! | |
21 | //! // &str pattern | |
22 | //! assert_eq!(s.find("you"), Some(4)); | |
23 | //! // char pattern | |
24 | //! assert_eq!(s.find('n'), Some(2)); | |
25 | //! // slice of chars pattern | |
26 | //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); | |
27 | //! // closure pattern | |
28 | //! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); | |
29 | //! ``` | |
30 | //! | |
1b1a35ee | 31 | //! [pattern-impls]: Pattern#implementors |
9346a6ac | 32 | |
dfeec247 XL |
33 | #![unstable( |
34 | feature = "pattern", | |
35 | reason = "API not fully fleshed out and ready to be stabilized", | |
36 | issue = "27721" | |
37 | )] | |
e9174d1e | 38 | |
48663c56 XL |
39 | use crate::cmp; |
40 | use crate::fmt; | |
41 | use crate::slice::memchr; | |
c34b1796 AL |
42 | |
43 | // Pattern | |
44 | ||
45 | /// A string pattern. | |
46 | /// | |
47 | /// A `Pattern<'a>` expresses that the implementing type | |
3dfed10e | 48 | /// can be used as a string pattern for searching in a [`&'a str`][str]. |
c34b1796 AL |
49 | /// |
50 | /// For example, both `'a'` and `"aa"` are patterns that | |
51 | /// would match at index `1` in the string `"baaaab"`. | |
52 | /// | |
53 | /// The trait itself acts as a builder for an associated | |
3dfed10e | 54 | /// [`Searcher`] type, which does the actual work of finding |
c34b1796 | 55 | /// occurrences of the pattern in a string. |
f035d41b XL |
56 | /// |
57 | /// Depending on the type of the pattern, the behaviour of methods like | |
58 | /// [`str::find`] and [`str::contains`] can change. The table below describes | |
59 | /// some of those behaviours. | |
60 | /// | |
61 | /// | Pattern type | Match condition | | |
62 | /// |--------------------------|-------------------------------------------| | |
63 | /// | `&str` | is substring | | |
64 | /// | `char` | is contained in string | | |
65 | /// | `&[char]` | any char in slice is contained in string | | |
66 | /// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | | |
67 | /// | `&&str` | is substring | | |
68 | /// | `&String` | is substring | | |
69 | /// | |
70 | /// # Examples | |
3dfed10e | 71 | /// |
f035d41b XL |
72 | /// ``` |
73 | /// // &str | |
74 | /// assert_eq!("abaaa".find("ba"), Some(1)); | |
75 | /// assert_eq!("abaaa".find("bac"), None); | |
76 | /// | |
77 | /// // char | |
78 | /// assert_eq!("abaaa".find('a'), Some(0)); | |
79 | /// assert_eq!("abaaa".find('b'), Some(1)); | |
80 | /// assert_eq!("abaaa".find('c'), None); | |
81 | /// | |
82 | /// // &[char] | |
83 | /// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); | |
84 | /// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); | |
85 | /// assert_eq!("abaaa".find(&['c', 'd'][..]), None); | |
86 | /// | |
87 | /// // FnMut(char) -> bool | |
88 | /// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); | |
89 | /// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); | |
90 | /// ``` | |
c34b1796 AL |
91 | pub trait Pattern<'a>: Sized { |
92 | /// Associated searcher for this pattern | |
93 | type Searcher: Searcher<'a>; | |
94 | ||
9346a6ac | 95 | /// Constructs the associated searcher from |
c34b1796 AL |
96 | /// `self` and the `haystack` to search in. |
97 | fn into_searcher(self, haystack: &'a str) -> Self::Searcher; | |
98 | ||
9346a6ac | 99 | /// Checks whether the pattern matches anywhere in the haystack |
c34b1796 AL |
100 | #[inline] |
101 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
102 | self.into_searcher(haystack).next_match().is_some() | |
103 | } | |
104 | ||
9346a6ac | 105 | /// Checks whether the pattern matches at the front of the haystack |
c34b1796 AL |
106 | #[inline] |
107 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
dfeec247 | 108 | matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _)) |
c34b1796 AL |
109 | } |
110 | ||
f035d41b XL |
111 | /// Checks whether the pattern matches at the back of the haystack |
112 | #[inline] | |
113 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
114 | where | |
115 | Self::Searcher: ReverseSearcher<'a>, | |
116 | { | |
117 | matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j) | |
118 | } | |
119 | ||
ba9703b0 XL |
120 | /// Removes the pattern from the front of haystack, if it matches. |
121 | #[inline] | |
122 | fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { | |
123 | if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() { | |
124 | debug_assert_eq!( | |
125 | start, 0, | |
126 | "The first search step from Searcher \ | |
127 | must include the first character" | |
128 | ); | |
129 | // SAFETY: `Searcher` is known to return valid indices. | |
130 | unsafe { Some(haystack.get_unchecked(len..)) } | |
131 | } else { | |
132 | None | |
133 | } | |
134 | } | |
135 | ||
ba9703b0 XL |
136 | /// Removes the pattern from the back of haystack, if it matches. |
137 | #[inline] | |
138 | fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> | |
139 | where | |
140 | Self::Searcher: ReverseSearcher<'a>, | |
141 | { | |
142 | if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { | |
143 | debug_assert_eq!( | |
144 | end, | |
145 | haystack.len(), | |
146 | "The first search step from ReverseSearcher \ | |
147 | must include the last character" | |
148 | ); | |
149 | // SAFETY: `Searcher` is known to return valid indices. | |
150 | unsafe { Some(haystack.get_unchecked(..start)) } | |
151 | } else { | |
152 | None | |
153 | } | |
154 | } | |
c34b1796 AL |
155 | } |
156 | ||
157 | // Searcher | |
158 | ||
3dfed10e | 159 | /// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. |
c34b1796 AL |
160 | #[derive(Copy, Clone, Eq, PartialEq, Debug)] |
161 | pub enum SearchStep { | |
162 | /// Expresses that a match of the pattern has been found at | |
163 | /// `haystack[a..b]`. | |
164 | Match(usize, usize), | |
165 | /// Expresses that `haystack[a..b]` has been rejected as a possible match | |
166 | /// of the pattern. | |
167 | /// | |
168 | /// Note that there might be more than one `Reject` between two `Match`es, | |
169 | /// there is no requirement for them to be combined into one. | |
170 | Reject(usize, usize), | |
3b2f2976 | 171 | /// Expresses that every byte of the haystack has been visited, ending |
c34b1796 | 172 | /// the iteration. |
dfeec247 | 173 | Done, |
c34b1796 AL |
174 | } |
175 | ||
176 | /// A searcher for a string pattern. | |
177 | /// | |
178 | /// This trait provides methods for searching for non-overlapping | |
179 | /// matches of a pattern starting from the front (left) of a string. | |
180 | /// | |
181 | /// It will be implemented by associated `Searcher` | |
3dfed10e | 182 | /// types of the [`Pattern`] trait. |
c34b1796 AL |
183 | /// |
184 | /// The trait is marked unsafe because the indices returned by the | |
3dfed10e XL |
185 | /// [`next()`][Searcher::next] methods are required to lie on valid utf8 |
186 | /// boundaries in the haystack. This enables consumers of this trait to | |
c34b1796 AL |
187 | /// slice the haystack without additional runtime checks. |
188 | pub unsafe trait Searcher<'a> { | |
3b2f2976 | 189 | /// Getter for the underlying string to be searched in |
c34b1796 | 190 | /// |
3dfed10e | 191 | /// Will always return the same [`&str`][str]. |
c34b1796 AL |
192 | fn haystack(&self) -> &'a str; |
193 | ||
194 | /// Performs the next search step starting from the front. | |
195 | /// | |
3dfed10e XL |
196 | /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches |
197 | /// the pattern. | |
198 | /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can | |
199 | /// not match the pattern, even partially. | |
200 | /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has | |
201 | /// been visited. | |
c34b1796 | 202 | /// |
3dfed10e XL |
203 | /// The stream of [`Match`][SearchStep::Match] and |
204 | /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] | |
c34b1796 AL |
205 | /// will contain index ranges that are adjacent, non-overlapping, |
206 | /// covering the whole haystack, and laying on utf8 boundaries. | |
207 | /// | |
3dfed10e XL |
208 | /// A [`Match`][SearchStep::Match] result needs to contain the whole matched |
209 | /// pattern, however [`Reject`][SearchStep::Reject] results may be split up | |
210 | /// into arbitrary many adjacent fragments. Both ranges may have zero length. | |
c34b1796 AL |
211 | /// |
212 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
213 | /// might produce the stream | |
214 | /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` | |
215 | fn next(&mut self) -> SearchStep; | |
216 | ||
3dfed10e | 217 | /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. |
ff7c6d11 | 218 | /// |
3dfed10e XL |
219 | /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges |
220 | /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return | |
221 | /// `(start_match, end_match)`, where start_match is the index of where | |
222 | /// the match begins, and end_match is the index after the end of the match. | |
c34b1796 AL |
223 | #[inline] |
224 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
225 | loop { | |
226 | match self.next() { | |
227 | SearchStep::Match(a, b) => return Some((a, b)), | |
228 | SearchStep::Done => return None, | |
229 | _ => continue, | |
230 | } | |
231 | } | |
232 | } | |
233 | ||
3dfed10e XL |
234 | /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] |
235 | /// and [`next_match()`][Searcher::next_match]. | |
ff7c6d11 | 236 | /// |
3dfed10e XL |
237 | /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges |
238 | /// of this and [`next_match`][Searcher::next_match] will overlap. | |
c34b1796 AL |
239 | #[inline] |
240 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
241 | loop { | |
242 | match self.next() { | |
243 | SearchStep::Reject(a, b) => return Some((a, b)), | |
244 | SearchStep::Done => return None, | |
245 | _ => continue, | |
246 | } | |
247 | } | |
248 | } | |
249 | } | |
250 | ||
251 | /// A reverse searcher for a string pattern. | |
252 | /// | |
253 | /// This trait provides methods for searching for non-overlapping | |
254 | /// matches of a pattern starting from the back (right) of a string. | |
255 | /// | |
3dfed10e XL |
256 | /// It will be implemented by associated [`Searcher`] |
257 | /// types of the [`Pattern`] trait if the pattern supports searching | |
c34b1796 AL |
258 | /// for it from the back. |
259 | /// | |
260 | /// The index ranges returned by this trait are not required | |
261 | /// to exactly match those of the forward search in reverse. | |
262 | /// | |
263 | /// For the reason why this trait is marked unsafe, see them | |
3dfed10e | 264 | /// parent trait [`Searcher`]. |
c34b1796 AL |
265 | pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { |
266 | /// Performs the next search step starting from the back. | |
267 | /// | |
3dfed10e XL |
268 | /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` |
269 | /// matches the pattern. | |
270 | /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` | |
271 | /// can not match the pattern, even partially. | |
272 | /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack | |
273 | /// has been visited | |
c34b1796 | 274 | /// |
3dfed10e XL |
275 | /// The stream of [`Match`][SearchStep::Match] and |
276 | /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] | |
c34b1796 AL |
277 | /// will contain index ranges that are adjacent, non-overlapping, |
278 | /// covering the whole haystack, and laying on utf8 boundaries. | |
279 | /// | |
3dfed10e XL |
280 | /// A [`Match`][SearchStep::Match] result needs to contain the whole matched |
281 | /// pattern, however [`Reject`][SearchStep::Reject] results may be split up | |
282 | /// into arbitrary many adjacent fragments. Both ranges may have zero length. | |
c34b1796 AL |
283 | /// |
284 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
285 | /// might produce the stream | |
3dfed10e | 286 | /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. |
c34b1796 AL |
287 | fn next_back(&mut self) -> SearchStep; |
288 | ||
3dfed10e XL |
289 | /// Finds the next [`Match`][SearchStep::Match] result. |
290 | /// See [`next_back()`][ReverseSearcher::next_back]. | |
c34b1796 | 291 | #[inline] |
dfeec247 | 292 | fn next_match_back(&mut self) -> Option<(usize, usize)> { |
c34b1796 AL |
293 | loop { |
294 | match self.next_back() { | |
295 | SearchStep::Match(a, b) => return Some((a, b)), | |
296 | SearchStep::Done => return None, | |
297 | _ => continue, | |
298 | } | |
299 | } | |
300 | } | |
301 | ||
3dfed10e XL |
302 | /// Finds the next [`Reject`][SearchStep::Reject] result. |
303 | /// See [`next_back()`][ReverseSearcher::next_back]. | |
c34b1796 | 304 | #[inline] |
dfeec247 | 305 | fn next_reject_back(&mut self) -> Option<(usize, usize)> { |
c34b1796 AL |
306 | loop { |
307 | match self.next_back() { | |
308 | SearchStep::Reject(a, b) => return Some((a, b)), | |
309 | SearchStep::Done => return None, | |
310 | _ => continue, | |
311 | } | |
312 | } | |
313 | } | |
314 | } | |
315 | ||
3dfed10e XL |
316 | /// A marker trait to express that a [`ReverseSearcher`] |
317 | /// can be used for a [`DoubleEndedIterator`] implementation. | |
c34b1796 | 318 | /// |
3dfed10e | 319 | /// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need |
c34b1796 AL |
320 | /// to follow these conditions: |
321 | /// | |
322 | /// - All results of `next()` need to be identical | |
323 | /// to the results of `next_back()` in reverse order. | |
324 | /// - `next()` and `next_back()` need to behave as | |
325 | /// the two ends of a range of values, that is they | |
326 | /// can not "walk past each other". | |
327 | /// | |
328 | /// # Examples | |
329 | /// | |
330 | /// `char::Searcher` is a `DoubleEndedSearcher` because searching for a | |
3dfed10e | 331 | /// [`char`] only requires looking at one at a time, which behaves the same |
c34b1796 AL |
332 | /// from both ends. |
333 | /// | |
334 | /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because | |
335 | /// the pattern `"aa"` in the haystack `"aaa"` matches as either | |
336 | /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. | |
337 | pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} | |
338 | ||
9346a6ac | 339 | ///////////////////////////////////////////////////////////////////////////// |
ff7c6d11 | 340 | // Impl for char |
9346a6ac | 341 | ///////////////////////////////////////////////////////////////////////////// |
c34b1796 | 342 | |
ff7c6d11 XL |
343 | /// Associated type for `<char as Pattern<'a>>::Searcher`. |
344 | #[derive(Clone, Debug)] | |
345 | pub struct CharSearcher<'a> { | |
346 | haystack: &'a str, | |
347 | // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` | |
348 | // This invariant can be broken *within* next_match and next_match_back, however | |
349 | // they must exit with fingers on valid code point boundaries. | |
ff7c6d11 XL |
350 | /// `finger` is the current byte index of the forward search. |
351 | /// Imagine that it exists before the byte at its index, i.e. | |
83c7162d | 352 | /// `haystack[finger]` is the first byte of the slice we must inspect during |
ff7c6d11 XL |
353 | /// forward searching |
354 | finger: usize, | |
355 | /// `finger_back` is the current byte index of the reverse search. | |
356 | /// Imagine that it exists after the byte at its index, i.e. | |
357 | /// haystack[finger_back - 1] is the last byte of the slice we must inspect during | |
3dfed10e | 358 | /// forward searching (and thus the first byte to be inspected when calling next_back()). |
ff7c6d11 XL |
359 | finger_back: usize, |
360 | /// The character being searched for | |
361 | needle: char, | |
362 | ||
363 | // safety invariant: `utf8_size` must be less than 5 | |
3dfed10e | 364 | /// The number of bytes `needle` takes up when encoded in utf8. |
ff7c6d11 XL |
365 | utf8_size: usize, |
366 | /// A utf8 encoded copy of the `needle` | |
367 | utf8_encoded: [u8; 4], | |
c34b1796 AL |
368 | } |
369 | ||
ff7c6d11 XL |
370 | unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { |
371 | #[inline] | |
372 | fn haystack(&self) -> &'a str { | |
373 | self.haystack | |
374 | } | |
375 | #[inline] | |
376 | fn next(&mut self) -> SearchStep { | |
377 | let old_finger = self.finger; | |
dfeec247 XL |
378 | // SAFETY: 1-4 guarantee safety of `get_unchecked` |
379 | // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries | |
380 | // (this is invariant) | |
381 | // 2. `self.finger >= 0` since it starts at 0 and only increases | |
382 | // 3. `self.finger < self.finger_back` because otherwise the char `iter` | |
383 | // would return `SearchStep::Done` | |
384 | // 4. `self.finger` comes before the end of the haystack because `self.finger_back` | |
385 | // starts at the end and only decreases | |
ff7c6d11 XL |
386 | let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; |
387 | let mut iter = slice.chars(); | |
388 | let old_len = iter.iter.len(); | |
389 | if let Some(ch) = iter.next() { | |
390 | // add byte offset of current character | |
391 | // without re-encoding as utf-8 | |
392 | self.finger += old_len - iter.iter.len(); | |
393 | if ch == self.needle { | |
394 | SearchStep::Match(old_finger, self.finger) | |
395 | } else { | |
396 | SearchStep::Reject(old_finger, self.finger) | |
397 | } | |
398 | } else { | |
399 | SearchStep::Done | |
400 | } | |
401 | } | |
c34b1796 | 402 | #[inline] |
ff7c6d11 XL |
403 | fn next_match(&mut self) -> Option<(usize, usize)> { |
404 | loop { | |
405 | // get the haystack after the last character found | |
60c5eb7d | 406 | let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?; |
ff7c6d11 | 407 | // the last byte of the utf8 encoded needle |
dfeec247 | 408 | // SAFETY: we have an invariant that `utf8_size < 5` |
ff7c6d11 XL |
409 | let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; |
410 | if let Some(index) = memchr::memchr(last_byte, bytes) { | |
411 | // The new finger is the index of the byte we found, | |
412 | // plus one, since we memchr'd for the last byte of the character. | |
413 | // | |
414 | // Note that this doesn't always give us a finger on a UTF8 boundary. | |
415 | // If we *didn't* find our character | |
416 | // we may have indexed to the non-last byte of a 3-byte or 4-byte character. | |
417 | // We can't just skip to the next valid starting byte because a character like | |
418 | // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find | |
419 | // the second byte when searching for the third. | |
420 | // | |
421 | // However, this is totally okay. While we have the invariant that | |
0531ce1d | 422 | // self.finger is on a UTF8 boundary, this invariant is not relied upon |
ff7c6d11 XL |
423 | // within this method (it is relied upon in CharSearcher::next()). |
424 | // | |
425 | // We only exit this method when we reach the end of the string, or if we | |
426 | // find something. When we find something the `finger` will be set | |
427 | // to a UTF8 boundary. | |
428 | self.finger += index + 1; | |
429 | if self.finger >= self.utf8_size { | |
430 | let found_char = self.finger - self.utf8_size; | |
431 | if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { | |
432 | if slice == &self.utf8_encoded[0..self.utf8_size] { | |
433 | return Some((found_char, self.finger)); | |
434 | } | |
435 | } | |
436 | } | |
437 | } else { | |
438 | // found nothing, exit | |
439 | self.finger = self.finger_back; | |
440 | return None; | |
441 | } | |
442 | } | |
443 | } | |
444 | ||
445 | // let next_reject use the default implementation from the Searcher trait | |
446 | } | |
c34b1796 | 447 | |
ff7c6d11 | 448 | unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { |
c34b1796 | 449 | #[inline] |
ff7c6d11 XL |
450 | fn next_back(&mut self) -> SearchStep { |
451 | let old_finger = self.finger_back; | |
dfeec247 | 452 | // SAFETY: see the comment for next() above |
8faf50e0 | 453 | let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; |
ff7c6d11 XL |
454 | let mut iter = slice.chars(); |
455 | let old_len = iter.iter.len(); | |
456 | if let Some(ch) = iter.next_back() { | |
457 | // subtract byte offset of current character | |
458 | // without re-encoding as utf-8 | |
459 | self.finger_back -= old_len - iter.iter.len(); | |
460 | if ch == self.needle { | |
461 | SearchStep::Match(self.finger_back, old_finger) | |
462 | } else { | |
463 | SearchStep::Reject(self.finger_back, old_finger) | |
464 | } | |
465 | } else { | |
466 | SearchStep::Done | |
467 | } | |
468 | } | |
469 | #[inline] | |
470 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
471 | let haystack = self.haystack.as_bytes(); | |
472 | loop { | |
473 | // get the haystack up to but not including the last character searched | |
ba9703b0 | 474 | let bytes = haystack.get(self.finger..self.finger_back)?; |
ff7c6d11 | 475 | // the last byte of the utf8 encoded needle |
dfeec247 | 476 | // SAFETY: we have an invariant that `utf8_size < 5` |
ff7c6d11 XL |
477 | let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; |
478 | if let Some(index) = memchr::memrchr(last_byte, bytes) { | |
479 | // we searched a slice that was offset by self.finger, | |
480 | // add self.finger to recoup the original index | |
481 | let index = self.finger + index; | |
482 | // memrchr will return the index of the byte we wish to | |
483 | // find. In case of an ASCII character, this is indeed | |
484 | // were we wish our new finger to be ("after" the found | |
485 | // char in the paradigm of reverse iteration). For | |
486 | // multibyte chars we need to skip down by the number of more | |
487 | // bytes they have than ASCII | |
488 | let shift = self.utf8_size - 1; | |
489 | if index >= shift { | |
490 | let found_char = index - shift; | |
491 | if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { | |
492 | if slice == &self.utf8_encoded[0..self.utf8_size] { | |
0731742a | 493 | // move finger to before the character found (i.e., at its start index) |
ff7c6d11 XL |
494 | self.finger_back = found_char; |
495 | return Some((self.finger_back, self.finger_back + self.utf8_size)); | |
496 | } | |
497 | } | |
498 | } | |
499 | // We can't use finger_back = index - size + 1 here. If we found the last char | |
500 | // of a different-sized character (or the middle byte of a different character) | |
501 | // we need to bump the finger_back down to `index`. This similarly makes | |
502 | // `finger_back` have the potential to no longer be on a boundary, | |
503 | // but this is OK since we only exit this function on a boundary | |
504 | // or when the haystack has been searched completely. | |
505 | // | |
506 | // Unlike next_match this does not | |
507 | // have the problem of repeated bytes in utf-8 because | |
508 | // we're searching for the last byte, and we can only have | |
509 | // found the last byte when searching in reverse. | |
510 | self.finger_back = index; | |
511 | } else { | |
512 | self.finger_back = self.finger; | |
513 | // found nothing, exit | |
514 | return None; | |
515 | } | |
516 | } | |
517 | } | |
518 | ||
519 | // let next_reject_back use the default implementation from the Searcher trait | |
c34b1796 AL |
520 | } |
521 | ||
ff7c6d11 XL |
522 | impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} |
523 | ||
3dfed10e | 524 | /// Searches for chars that are equal to a given [`char`]. |
ba9703b0 XL |
525 | /// |
526 | /// # Examples | |
527 | /// | |
528 | /// ``` | |
529 | /// assert_eq!("Hello world".find('o'), Some(4)); | |
530 | /// ``` | |
ff7c6d11 XL |
531 | impl<'a> Pattern<'a> for char { |
532 | type Searcher = CharSearcher<'a>; | |
533 | ||
c34b1796 | 534 | #[inline] |
ff7c6d11 XL |
535 | fn into_searcher(self, haystack: &'a str) -> Self::Searcher { |
536 | let mut utf8_encoded = [0; 4]; | |
0731742a | 537 | let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); |
ff7c6d11 XL |
538 | CharSearcher { |
539 | haystack, | |
540 | finger: 0, | |
541 | finger_back: haystack.len(), | |
542 | needle: self, | |
543 | utf8_size, | |
dfeec247 | 544 | utf8_encoded, |
ff7c6d11 XL |
545 | } |
546 | } | |
c34b1796 AL |
547 | |
548 | #[inline] | |
ff7c6d11 XL |
549 | fn is_contained_in(self, haystack: &'a str) -> bool { |
550 | if (self as u32) < 128 { | |
551 | haystack.as_bytes().contains(&(self as u8)) | |
552 | } else { | |
553 | let mut buffer = [0u8; 4]; | |
554 | self.encode_utf8(&mut buffer).is_contained_in(haystack) | |
555 | } | |
556 | } | |
c34b1796 | 557 | |
c34b1796 | 558 | #[inline] |
ff7c6d11 | 559 | fn is_prefix_of(self, haystack: &'a str) -> bool { |
60c5eb7d | 560 | self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack) |
ff7c6d11 XL |
561 | } |
562 | ||
ba9703b0 XL |
563 | #[inline] |
564 | fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { | |
565 | self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack) | |
566 | } | |
567 | ||
ff7c6d11 | 568 | #[inline] |
dfeec247 XL |
569 | fn is_suffix_of(self, haystack: &'a str) -> bool |
570 | where | |
571 | Self::Searcher: ReverseSearcher<'a>, | |
ff7c6d11 | 572 | { |
60c5eb7d | 573 | self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) |
c34b1796 | 574 | } |
ba9703b0 XL |
575 | |
576 | #[inline] | |
577 | fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> | |
578 | where | |
579 | Self::Searcher: ReverseSearcher<'a>, | |
580 | { | |
581 | self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) | |
582 | } | |
ff7c6d11 XL |
583 | } |
584 | ||
585 | ///////////////////////////////////////////////////////////////////////////// | |
586 | // Impl for a MultiCharEq wrapper | |
587 | ///////////////////////////////////////////////////////////////////////////// | |
588 | ||
589 | #[doc(hidden)] | |
590 | trait MultiCharEq { | |
591 | fn matches(&mut self, c: char) -> bool; | |
592 | } | |
c34b1796 | 593 | |
dfeec247 XL |
594 | impl<F> MultiCharEq for F |
595 | where | |
596 | F: FnMut(char) -> bool, | |
597 | { | |
c34b1796 | 598 | #[inline] |
dfeec247 XL |
599 | fn matches(&mut self, c: char) -> bool { |
600 | (*self)(c) | |
601 | } | |
ff7c6d11 XL |
602 | } |
603 | ||
0bf4aa26 | 604 | impl MultiCharEq for &[char] { |
ff7c6d11 XL |
605 | #[inline] |
606 | fn matches(&mut self, c: char) -> bool { | |
dfeec247 | 607 | self.iter().any(|&m| m == c) |
c34b1796 AL |
608 | } |
609 | } | |
610 | ||
ff7c6d11 | 611 | struct MultiCharEqPattern<C: MultiCharEq>(C); |
c34b1796 | 612 | |
54a0048b | 613 | #[derive(Clone, Debug)] |
ff7c6d11 | 614 | struct MultiCharEqSearcher<'a, C: MultiCharEq> { |
c34b1796 AL |
615 | char_eq: C, |
616 | haystack: &'a str, | |
617 | char_indices: super::CharIndices<'a>, | |
c34b1796 AL |
618 | } |
619 | ||
ff7c6d11 XL |
620 | impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> { |
621 | type Searcher = MultiCharEqSearcher<'a, C>; | |
c34b1796 AL |
622 | |
623 | #[inline] | |
ff7c6d11 | 624 | fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { |
dfeec247 | 625 | MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() } |
c34b1796 AL |
626 | } |
627 | } | |
628 | ||
ff7c6d11 | 629 | unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { |
c34b1796 AL |
630 | #[inline] |
631 | fn haystack(&self) -> &'a str { | |
632 | self.haystack | |
633 | } | |
634 | ||
635 | #[inline] | |
636 | fn next(&mut self) -> SearchStep { | |
637 | let s = &mut self.char_indices; | |
638 | // Compare lengths of the internal byte slice iterator | |
639 | // to find length of current char | |
3157f602 | 640 | let pre_len = s.iter.iter.len(); |
c34b1796 | 641 | if let Some((i, c)) = s.next() { |
3157f602 | 642 | let len = s.iter.iter.len(); |
c34b1796 AL |
643 | let char_len = pre_len - len; |
644 | if self.char_eq.matches(c) { | |
645 | return SearchStep::Match(i, i + char_len); | |
646 | } else { | |
647 | return SearchStep::Reject(i, i + char_len); | |
648 | } | |
649 | } | |
650 | SearchStep::Done | |
651 | } | |
652 | } | |
653 | ||
ff7c6d11 | 654 | unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { |
c34b1796 AL |
655 | #[inline] |
656 | fn next_back(&mut self) -> SearchStep { | |
657 | let s = &mut self.char_indices; | |
658 | // Compare lengths of the internal byte slice iterator | |
659 | // to find length of current char | |
3157f602 | 660 | let pre_len = s.iter.iter.len(); |
c34b1796 | 661 | if let Some((i, c)) = s.next_back() { |
3157f602 | 662 | let len = s.iter.iter.len(); |
c34b1796 AL |
663 | let char_len = pre_len - len; |
664 | if self.char_eq.matches(c) { | |
665 | return SearchStep::Match(i, i + char_len); | |
666 | } else { | |
667 | return SearchStep::Reject(i, i + char_len); | |
668 | } | |
669 | } | |
670 | SearchStep::Done | |
671 | } | |
672 | } | |
673 | ||
ff7c6d11 | 674 | impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} |
c34b1796 | 675 | |
9346a6ac AL |
676 | ///////////////////////////////////////////////////////////////////////////// |
677 | ||
678 | macro_rules! pattern_methods { | |
679 | ($t:ty, $pmap:expr, $smap:expr) => { | |
680 | type Searcher = $t; | |
681 | ||
682 | #[inline] | |
683 | fn into_searcher(self, haystack: &'a str) -> $t { | |
684 | ($smap)(($pmap)(self).into_searcher(haystack)) | |
c34b1796 | 685 | } |
9346a6ac | 686 | |
c34b1796 AL |
687 | #[inline] |
688 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
9346a6ac | 689 | ($pmap)(self).is_contained_in(haystack) |
c34b1796 | 690 | } |
9346a6ac | 691 | |
c34b1796 AL |
692 | #[inline] |
693 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
9346a6ac | 694 | ($pmap)(self).is_prefix_of(haystack) |
c34b1796 | 695 | } |
9346a6ac | 696 | |
ba9703b0 XL |
697 | #[inline] |
698 | fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { | |
699 | ($pmap)(self).strip_prefix_of(haystack) | |
700 | } | |
701 | ||
c34b1796 AL |
702 | #[inline] |
703 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
ba9703b0 XL |
704 | where |
705 | $t: ReverseSearcher<'a>, | |
c34b1796 | 706 | { |
9346a6ac | 707 | ($pmap)(self).is_suffix_of(haystack) |
c34b1796 | 708 | } |
ba9703b0 XL |
709 | |
710 | #[inline] | |
711 | fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> | |
712 | where | |
713 | $t: ReverseSearcher<'a>, | |
714 | { | |
715 | ($pmap)(self).strip_suffix_of(haystack) | |
716 | } | |
717 | }; | |
c34b1796 AL |
718 | } |
719 | ||
9346a6ac AL |
720 | macro_rules! searcher_methods { |
721 | (forward) => { | |
722 | #[inline] | |
723 | fn haystack(&self) -> &'a str { | |
724 | self.0.haystack() | |
725 | } | |
726 | #[inline] | |
727 | fn next(&mut self) -> SearchStep { | |
728 | self.0.next() | |
729 | } | |
730 | #[inline] | |
731 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
732 | self.0.next_match() | |
733 | } | |
734 | #[inline] | |
735 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
736 | self.0.next_reject() | |
737 | } | |
738 | }; | |
739 | (reverse) => { | |
740 | #[inline] | |
741 | fn next_back(&mut self) -> SearchStep { | |
742 | self.0.next_back() | |
743 | } | |
744 | #[inline] | |
745 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
746 | self.0.next_match_back() | |
747 | } | |
748 | #[inline] | |
749 | fn next_reject_back(&mut self) -> Option<(usize, usize)> { | |
750 | self.0.next_reject_back() | |
751 | } | |
ba9703b0 | 752 | }; |
c34b1796 AL |
753 | } |
754 | ||
9346a6ac AL |
755 | ///////////////////////////////////////////////////////////////////////////// |
756 | // Impl for &[char] | |
757 | ///////////////////////////////////////////////////////////////////////////// | |
758 | ||
759 | // Todo: Change / Remove due to ambiguity in meaning. | |
760 | ||
761 | /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. | |
54a0048b | 762 | #[derive(Clone, Debug)] |
ff7c6d11 | 763 | pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); |
c34b1796 AL |
764 | |
765 | unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { | |
9346a6ac | 766 | searcher_methods!(forward); |
c34b1796 | 767 | } |
9346a6ac | 768 | |
c34b1796 | 769 | unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { |
9346a6ac | 770 | searcher_methods!(reverse); |
c34b1796 | 771 | } |
c34b1796 | 772 | |
9346a6ac | 773 | impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} |
c34b1796 | 774 | |
3dfed10e | 775 | /// Searches for chars that are equal to any of the [`char`]s in the slice. |
ba9703b0 XL |
776 | /// |
777 | /// # Examples | |
778 | /// | |
779 | /// ``` | |
780 | /// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2)); | |
781 | /// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2)); | |
782 | /// ``` | |
9346a6ac | 783 | impl<'a, 'b> Pattern<'a> for &'b [char] { |
ff7c6d11 | 784 | pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); |
c34b1796 AL |
785 | } |
786 | ||
9346a6ac AL |
787 | ///////////////////////////////////////////////////////////////////////////// |
788 | // Impl for F: FnMut(char) -> bool | |
789 | ///////////////////////////////////////////////////////////////////////////// | |
790 | ||
791 | /// Associated type for `<F as Pattern<'a>>::Searcher`. | |
792 | #[derive(Clone)] | |
ff7c6d11 | 793 | pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher) |
dfeec247 XL |
794 | where |
795 | F: FnMut(char) -> bool; | |
c34b1796 | 796 | |
0bf4aa26 | 797 | impl<F> fmt::Debug for CharPredicateSearcher<'_, F> |
dfeec247 XL |
798 | where |
799 | F: FnMut(char) -> bool, | |
54a0048b | 800 | { |
48663c56 | 801 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
54a0048b SL |
802 | f.debug_struct("CharPredicateSearcher") |
803 | .field("haystack", &self.0.haystack) | |
804 | .field("char_indices", &self.0.char_indices) | |
54a0048b SL |
805 | .finish() |
806 | } | |
807 | } | |
9346a6ac | 808 | unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> |
dfeec247 XL |
809 | where |
810 | F: FnMut(char) -> bool, | |
c34b1796 | 811 | { |
9346a6ac | 812 | searcher_methods!(forward); |
c34b1796 | 813 | } |
9346a6ac AL |
814 | |
815 | unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> | |
dfeec247 XL |
816 | where |
817 | F: FnMut(char) -> bool, | |
c34b1796 | 818 | { |
9346a6ac | 819 | searcher_methods!(reverse); |
c34b1796 | 820 | } |
c34b1796 | 821 | |
dfeec247 | 822 | impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} |
c34b1796 | 823 | |
3dfed10e | 824 | /// Searches for [`char`]s that match the given predicate. |
ba9703b0 XL |
825 | /// |
826 | /// # Examples | |
827 | /// | |
828 | /// ``` | |
829 | /// assert_eq!("Hello world".find(char::is_uppercase), Some(0)); | |
830 | /// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1)); | |
831 | /// ``` | |
dfeec247 XL |
832 | impl<'a, F> Pattern<'a> for F |
833 | where | |
834 | F: FnMut(char) -> bool, | |
835 | { | |
ff7c6d11 | 836 | pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); |
9346a6ac AL |
837 | } |
838 | ||
839 | ///////////////////////////////////////////////////////////////////////////// | |
840 | // Impl for &&str | |
841 | ///////////////////////////////////////////////////////////////////////////// | |
842 | ||
843 | /// Delegates to the `&str` impl. | |
54a0048b | 844 | impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { |
9346a6ac | 845 | pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); |
c34b1796 | 846 | } |
c1a9b12d SL |
847 | |
848 | ///////////////////////////////////////////////////////////////////////////// | |
849 | // Impl for &str | |
850 | ///////////////////////////////////////////////////////////////////////////// | |
851 | ||
852 | /// Non-allocating substring search. | |
853 | /// | |
854 | /// Will handle the pattern `""` as returning empty matches at each character | |
855 | /// boundary. | |
ba9703b0 XL |
856 | /// |
857 | /// # Examples | |
858 | /// | |
859 | /// ``` | |
860 | /// assert_eq!("Hello world".find("world"), Some(6)); | |
861 | /// ``` | |
c1a9b12d SL |
862 | impl<'a, 'b> Pattern<'a> for &'b str { |
863 | type Searcher = StrSearcher<'a, 'b>; | |
864 | ||
865 | #[inline] | |
866 | fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { | |
867 | StrSearcher::new(haystack, self) | |
868 | } | |
869 | ||
ba9703b0 | 870 | /// Checks whether the pattern matches at the front of the haystack. |
c1a9b12d SL |
871 | #[inline] |
872 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
60c5eb7d | 873 | haystack.as_bytes().starts_with(self.as_bytes()) |
c1a9b12d SL |
874 | } |
875 | ||
ba9703b0 XL |
876 | /// Removes the pattern from the front of haystack, if it matches. |
877 | #[inline] | |
878 | fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { | |
879 | if self.is_prefix_of(haystack) { | |
880 | // SAFETY: prefix was just verified to exist. | |
881 | unsafe { Some(haystack.get_unchecked(self.as_bytes().len()..)) } | |
882 | } else { | |
883 | None | |
884 | } | |
885 | } | |
886 | ||
887 | /// Checks whether the pattern matches at the back of the haystack. | |
c1a9b12d SL |
888 | #[inline] |
889 | fn is_suffix_of(self, haystack: &'a str) -> bool { | |
60c5eb7d | 890 | haystack.as_bytes().ends_with(self.as_bytes()) |
c1a9b12d | 891 | } |
ba9703b0 XL |
892 | |
893 | /// Removes the pattern from the back of haystack, if it matches. | |
894 | #[inline] | |
895 | fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { | |
896 | if self.is_suffix_of(haystack) { | |
897 | let i = haystack.len() - self.as_bytes().len(); | |
898 | // SAFETY: suffix was just verified to exist. | |
899 | unsafe { Some(haystack.get_unchecked(..i)) } | |
900 | } else { | |
901 | None | |
902 | } | |
903 | } | |
c1a9b12d SL |
904 | } |
905 | ||
c1a9b12d SL |
906 | ///////////////////////////////////////////////////////////////////////////// |
907 | // Two Way substring searcher | |
908 | ///////////////////////////////////////////////////////////////////////////// | |
909 | ||
910 | #[derive(Clone, Debug)] | |
911 | /// Associated type for `<&str as Pattern<'a>>::Searcher`. | |
912 | pub struct StrSearcher<'a, 'b> { | |
913 | haystack: &'a str, | |
914 | needle: &'b str, | |
915 | ||
916 | searcher: StrSearcherImpl, | |
917 | } | |
918 | ||
919 | #[derive(Clone, Debug)] | |
920 | enum StrSearcherImpl { | |
921 | Empty(EmptyNeedle), | |
922 | TwoWay(TwoWaySearcher), | |
923 | } | |
924 | ||
925 | #[derive(Clone, Debug)] | |
926 | struct EmptyNeedle { | |
927 | position: usize, | |
928 | end: usize, | |
929 | is_match_fw: bool, | |
930 | is_match_bw: bool, | |
931 | } | |
932 | ||
933 | impl<'a, 'b> StrSearcher<'a, 'b> { | |
934 | fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { | |
935 | if needle.is_empty() { | |
936 | StrSearcher { | |
3b2f2976 XL |
937 | haystack, |
938 | needle, | |
c1a9b12d SL |
939 | searcher: StrSearcherImpl::Empty(EmptyNeedle { |
940 | position: 0, | |
941 | end: haystack.len(), | |
942 | is_match_fw: true, | |
943 | is_match_bw: true, | |
944 | }), | |
945 | } | |
946 | } else { | |
947 | StrSearcher { | |
3b2f2976 XL |
948 | haystack, |
949 | needle, | |
dfeec247 XL |
950 | searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new( |
951 | needle.as_bytes(), | |
952 | haystack.len(), | |
953 | )), | |
c1a9b12d SL |
954 | } |
955 | } | |
956 | } | |
957 | } | |
958 | ||
959 | unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { | |
041b39d2 XL |
960 | #[inline] |
961 | fn haystack(&self) -> &'a str { | |
962 | self.haystack | |
963 | } | |
c1a9b12d SL |
964 | |
965 | #[inline] | |
966 | fn next(&mut self) -> SearchStep { | |
967 | match self.searcher { | |
968 | StrSearcherImpl::Empty(ref mut searcher) => { | |
969 | // empty needle rejects every char and matches every empty string between them | |
970 | let is_match = searcher.is_match_fw; | |
971 | searcher.is_match_fw = !searcher.is_match_fw; | |
972 | let pos = searcher.position; | |
973 | match self.haystack[pos..].chars().next() { | |
974 | _ if is_match => SearchStep::Match(pos, pos), | |
975 | None => SearchStep::Done, | |
976 | Some(ch) => { | |
977 | searcher.position += ch.len_utf8(); | |
978 | SearchStep::Reject(pos, searcher.position) | |
979 | } | |
980 | } | |
981 | } | |
982 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
983 | // TwoWaySearcher produces valid *Match* indices that split at char boundaries | |
984 | // as long as it does correct matching and that haystack and needle are | |
985 | // valid UTF-8 | |
986 | // *Rejects* from the algorithm can fall on any indices, but we will walk them | |
987 | // manually to the next character boundary, so that they are utf-8 safe. | |
988 | if searcher.position == self.haystack.len() { | |
989 | return SearchStep::Done; | |
990 | } | |
991 | let is_long = searcher.memory == usize::MAX; | |
dfeec247 XL |
992 | match searcher.next::<RejectAndMatch>( |
993 | self.haystack.as_bytes(), | |
994 | self.needle.as_bytes(), | |
995 | is_long, | |
996 | ) { | |
c1a9b12d SL |
997 | SearchStep::Reject(a, mut b) => { |
998 | // skip to next char boundary | |
999 | while !self.haystack.is_char_boundary(b) { | |
1000 | b += 1; | |
1001 | } | |
1002 | searcher.position = cmp::max(b, searcher.position); | |
1003 | SearchStep::Reject(a, b) | |
1004 | } | |
1005 | otherwise => otherwise, | |
1006 | } | |
1007 | } | |
1008 | } | |
1009 | } | |
1010 | ||
3b2f2976 | 1011 | #[inline] |
c1a9b12d SL |
1012 | fn next_match(&mut self) -> Option<(usize, usize)> { |
1013 | match self.searcher { | |
dfeec247 XL |
1014 | StrSearcherImpl::Empty(..) => loop { |
1015 | match self.next() { | |
1016 | SearchStep::Match(a, b) => return Some((a, b)), | |
1017 | SearchStep::Done => return None, | |
1018 | SearchStep::Reject(..) => {} | |
c1a9b12d | 1019 | } |
dfeec247 | 1020 | }, |
c1a9b12d SL |
1021 | StrSearcherImpl::TwoWay(ref mut searcher) => { |
1022 | let is_long = searcher.memory == usize::MAX; | |
e9174d1e SL |
1023 | // write out `true` and `false` cases to encourage the compiler |
1024 | // to specialize the two cases separately. | |
c1a9b12d | 1025 | if is_long { |
dfeec247 XL |
1026 | searcher.next::<MatchOnly>( |
1027 | self.haystack.as_bytes(), | |
1028 | self.needle.as_bytes(), | |
1029 | true, | |
1030 | ) | |
c1a9b12d | 1031 | } else { |
dfeec247 XL |
1032 | searcher.next::<MatchOnly>( |
1033 | self.haystack.as_bytes(), | |
1034 | self.needle.as_bytes(), | |
1035 | false, | |
1036 | ) | |
c1a9b12d SL |
1037 | } |
1038 | } | |
1039 | } | |
1040 | } | |
c1a9b12d | 1041 | } |
e9174d1e | 1042 | |
c1a9b12d SL |
1043 | unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { |
1044 | #[inline] | |
1045 | fn next_back(&mut self) -> SearchStep { | |
1046 | match self.searcher { | |
1047 | StrSearcherImpl::Empty(ref mut searcher) => { | |
1048 | let is_match = searcher.is_match_bw; | |
1049 | searcher.is_match_bw = !searcher.is_match_bw; | |
1050 | let end = searcher.end; | |
1051 | match self.haystack[..end].chars().next_back() { | |
1052 | _ if is_match => SearchStep::Match(end, end), | |
1053 | None => SearchStep::Done, | |
1054 | Some(ch) => { | |
1055 | searcher.end -= ch.len_utf8(); | |
1056 | SearchStep::Reject(searcher.end, end) | |
1057 | } | |
1058 | } | |
1059 | } | |
1060 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
1061 | if searcher.end == 0 { | |
1062 | return SearchStep::Done; | |
1063 | } | |
e9174d1e | 1064 | let is_long = searcher.memory == usize::MAX; |
dfeec247 XL |
1065 | match searcher.next_back::<RejectAndMatch>( |
1066 | self.haystack.as_bytes(), | |
1067 | self.needle.as_bytes(), | |
1068 | is_long, | |
1069 | ) { | |
c1a9b12d SL |
1070 | SearchStep::Reject(mut a, b) => { |
1071 | // skip to next char boundary | |
1072 | while !self.haystack.is_char_boundary(a) { | |
1073 | a -= 1; | |
1074 | } | |
1075 | searcher.end = cmp::min(a, searcher.end); | |
1076 | SearchStep::Reject(a, b) | |
1077 | } | |
1078 | otherwise => otherwise, | |
1079 | } | |
1080 | } | |
1081 | } | |
1082 | } | |
1083 | ||
1084 | #[inline] | |
1085 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
1086 | match self.searcher { | |
dfeec247 XL |
1087 | StrSearcherImpl::Empty(..) => loop { |
1088 | match self.next_back() { | |
1089 | SearchStep::Match(a, b) => return Some((a, b)), | |
1090 | SearchStep::Done => return None, | |
1091 | SearchStep::Reject(..) => {} | |
c1a9b12d | 1092 | } |
dfeec247 | 1093 | }, |
c1a9b12d | 1094 | StrSearcherImpl::TwoWay(ref mut searcher) => { |
e9174d1e SL |
1095 | let is_long = searcher.memory == usize::MAX; |
1096 | // write out `true` and `false`, like `next_match` | |
1097 | if is_long { | |
dfeec247 XL |
1098 | searcher.next_back::<MatchOnly>( |
1099 | self.haystack.as_bytes(), | |
1100 | self.needle.as_bytes(), | |
1101 | true, | |
1102 | ) | |
e9174d1e | 1103 | } else { |
dfeec247 XL |
1104 | searcher.next_back::<MatchOnly>( |
1105 | self.haystack.as_bytes(), | |
1106 | self.needle.as_bytes(), | |
1107 | false, | |
1108 | ) | |
e9174d1e | 1109 | } |
c1a9b12d SL |
1110 | } |
1111 | } | |
1112 | } | |
1113 | } | |
1114 | ||
e9174d1e | 1115 | /// The internal state of the two-way substring search algorithm. |
c1a9b12d SL |
1116 | #[derive(Clone, Debug)] |
1117 | struct TwoWaySearcher { | |
1118 | // constants | |
e9174d1e | 1119 | /// critical factorization index |
c1a9b12d | 1120 | crit_pos: usize, |
e9174d1e SL |
1121 | /// critical factorization index for reversed needle |
1122 | crit_pos_back: usize, | |
c1a9b12d | 1123 | period: usize, |
e9174d1e SL |
1124 | /// `byteset` is an extension (not part of the two way algorithm); |
1125 | /// it's a 64-bit "fingerprint" where each set bit `j` corresponds | |
1126 | /// to a (byte & 63) == j present in the needle. | |
c1a9b12d SL |
1127 | byteset: u64, |
1128 | ||
1129 | // variables | |
1130 | position: usize, | |
1131 | end: usize, | |
e9174d1e SL |
1132 | /// index into needle before which we have already matched |
1133 | memory: usize, | |
1134 | /// index into needle after which we have already matched | |
1135 | memory_back: usize, | |
c1a9b12d SL |
1136 | } |
1137 | ||
1138 | /* | |
1139 | This is the Two-Way search algorithm, which was introduced in the paper: | |
1140 | Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. | |
1141 | ||
1142 | Here's some background information. | |
1143 | ||
1144 | A *word* is a string of symbols. The *length* of a word should be a familiar | |
1145 | notion, and here we denote it for any word x by |x|. | |
1146 | (We also allow for the possibility of the *empty word*, a word of length zero). | |
1147 | ||
1148 | If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a | |
1149 | *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. | |
1150 | For example, both 1 and 2 are periods for the string "aa". As another example, | |
1151 | the only period of the string "abcd" is 4. | |
1152 | ||
1153 | We denote by period(x) the *smallest* period of x (provided that x is non-empty). | |
1154 | This is always well-defined since every non-empty word x has at least one period, | |
1155 | |x|. We sometimes call this *the period* of x. | |
1156 | ||
1157 | If u, v and x are words such that x = uv, where uv is the concatenation of u and | |
1158 | v, then we say that (u, v) is a *factorization* of x. | |
1159 | ||
1160 | Let (u, v) be a factorization for a word x. Then if w is a non-empty word such | |
1161 | that both of the following hold | |
1162 | ||
1163 | - either w is a suffix of u or u is a suffix of w | |
1164 | - either w is a prefix of v or v is a prefix of w | |
1165 | ||
1166 | then w is said to be a *repetition* for the factorization (u, v). | |
1167 | ||
1168 | Just to unpack this, there are four possibilities here. Let w = "abc". Then we | |
1169 | might have: | |
1170 | ||
1171 | - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") | |
1172 | - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") | |
1173 | - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") | |
1174 | - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") | |
1175 | ||
1176 | Note that the word vu is a repetition for any factorization (u,v) of x = uv, | |
1177 | so every factorization has at least one repetition. | |
1178 | ||
1179 | If x is a string and (u, v) is a factorization for x, then a *local period* for | |
1180 | (u, v) is an integer r such that there is some word w such that |w| = r and w is | |
1181 | a repetition for (u, v). | |
1182 | ||
1183 | We denote by local_period(u, v) the smallest local period of (u, v). We sometimes | |
1184 | call this *the local period* of (u, v). Provided that x = uv is non-empty, this | |
1185 | is well-defined (because each non-empty word has at least one factorization, as | |
1186 | noted above). | |
1187 | ||
1188 | It can be proven that the following is an equivalent definition of a local period | |
1189 | for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for | |
1190 | all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are | |
0731742a | 1191 | defined. (i.e., i > 0 and i + r < |x|). |
c1a9b12d SL |
1192 | |
1193 | Using the above reformulation, it is easy to prove that | |
1194 | ||
1195 | 1 <= local_period(u, v) <= period(uv) | |
1196 | ||
1197 | A factorization (u, v) of x such that local_period(u,v) = period(x) is called a | |
1198 | *critical factorization*. | |
1199 | ||
1200 | The algorithm hinges on the following theorem, which is stated without proof: | |
1201 | ||
1202 | **Critical Factorization Theorem** Any word x has at least one critical | |
1203 | factorization (u, v) such that |u| < period(x). | |
1204 | ||
1205 | The purpose of maximal_suffix is to find such a critical factorization. | |
1206 | ||
e9174d1e SL |
1207 | If the period is short, compute another factorization x = u' v' to use |
1208 | for reverse search, chosen instead so that |v'| < period(x). | |
1209 | ||
c1a9b12d SL |
1210 | */ |
1211 | impl TwoWaySearcher { | |
1212 | fn new(needle: &[u8], end: usize) -> TwoWaySearcher { | |
1213 | let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); | |
1214 | let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); | |
1215 | ||
dfeec247 XL |
1216 | let (crit_pos, period) = if crit_pos_false > crit_pos_true { |
1217 | (crit_pos_false, period_false) | |
1218 | } else { | |
1219 | (crit_pos_true, period_true) | |
1220 | }; | |
c1a9b12d | 1221 | |
c1a9b12d SL |
1222 | // A particularly readable explanation of what's going on here can be found |
1223 | // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically | |
1224 | // see the code for "Algorithm CP" on p. 323. | |
1225 | // | |
1226 | // What's going on is we have some critical factorization (u, v) of the | |
1227 | // needle, and we want to determine whether u is a suffix of | |
1228 | // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use | |
1229 | // "Algorithm CP2", which is optimized for when the period of the needle | |
1230 | // is large. | |
74b04a01 | 1231 | if needle[..crit_pos] == needle[period..period + crit_pos] { |
e9174d1e SL |
1232 | // short period case -- the period is exact |
1233 | // compute a separate critical factorization for the reversed needle | |
1234 | // x = u' v' where |v'| < period(x). | |
1235 | // | |
1236 | // This is sped up by the period being known already. | |
1237 | // Note that a case like x = "acba" may be factored exactly forwards | |
1238 | // (crit_pos = 1, period = 3) while being factored with approximate | |
1239 | // period in reverse (crit_pos = 2, period = 2). We use the given | |
1240 | // reverse factorization but keep the exact period. | |
dfeec247 XL |
1241 | let crit_pos_back = needle.len() |
1242 | - cmp::max( | |
1243 | TwoWaySearcher::reverse_maximal_suffix(needle, period, false), | |
1244 | TwoWaySearcher::reverse_maximal_suffix(needle, period, true), | |
1245 | ); | |
e9174d1e | 1246 | |
c1a9b12d | 1247 | TwoWaySearcher { |
3b2f2976 XL |
1248 | crit_pos, |
1249 | crit_pos_back, | |
1250 | period, | |
e9174d1e | 1251 | byteset: Self::byteset_create(&needle[..period]), |
c1a9b12d SL |
1252 | |
1253 | position: 0, | |
3b2f2976 | 1254 | end, |
e9174d1e SL |
1255 | memory: 0, |
1256 | memory_back: needle.len(), | |
c1a9b12d SL |
1257 | } |
1258 | } else { | |
e9174d1e SL |
1259 | // long period case -- we have an approximation to the actual period, |
1260 | // and don't use memorization. | |
1261 | // | |
1262 | // Approximate the period by lower bound max(|u|, |v|) + 1. | |
1263 | // The critical factorization is efficient to use for both forward and | |
1264 | // reverse search. | |
1265 | ||
c1a9b12d | 1266 | TwoWaySearcher { |
3b2f2976 | 1267 | crit_pos, |
e9174d1e | 1268 | crit_pos_back: crit_pos, |
c1a9b12d | 1269 | period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, |
e9174d1e | 1270 | byteset: Self::byteset_create(needle), |
c1a9b12d SL |
1271 | |
1272 | position: 0, | |
3b2f2976 | 1273 | end, |
e9174d1e SL |
1274 | memory: usize::MAX, // Dummy value to signify that the period is long |
1275 | memory_back: usize::MAX, | |
c1a9b12d SL |
1276 | } |
1277 | } | |
1278 | } | |
1279 | ||
e9174d1e SL |
1280 | #[inline] |
1281 | fn byteset_create(bytes: &[u8]) -> u64 { | |
1282 | bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) | |
1283 | } | |
1284 | ||
3b2f2976 | 1285 | #[inline] |
c1a9b12d SL |
1286 | fn byteset_contains(&self, byte: u8) -> bool { |
1287 | (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 | |
1288 | } | |
1289 | ||
1290 | // One of the main ideas of Two-Way is that we factorize the needle into | |
1291 | // two halves, (u, v), and begin trying to find v in the haystack by scanning | |
1292 | // left to right. If v matches, we try to match u by scanning right to left. | |
1293 | // How far we can jump when we encounter a mismatch is all based on the fact | |
1294 | // that (u, v) is a critical factorization for the needle. | |
3b2f2976 | 1295 | #[inline] |
dfeec247 XL |
1296 | fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output |
1297 | where | |
1298 | S: TwoWayStrategy, | |
c1a9b12d SL |
1299 | { |
1300 | // `next()` uses `self.position` as its cursor | |
1301 | let old_pos = self.position; | |
e9174d1e | 1302 | let needle_last = needle.len() - 1; |
c1a9b12d SL |
1303 | 'search: loop { |
1304 | // Check that we have room to search in | |
e9174d1e SL |
1305 | // position + needle_last can not overflow if we assume slices |
1306 | // are bounded by isize's range. | |
1307 | let tail_byte = match haystack.get(self.position + needle_last) { | |
1308 | Some(&b) => b, | |
1309 | None => { | |
1310 | self.position = haystack.len(); | |
1311 | return S::rejecting(old_pos, self.position); | |
1312 | } | |
1313 | }; | |
c1a9b12d SL |
1314 | |
1315 | if S::use_early_reject() && old_pos != self.position { | |
1316 | return S::rejecting(old_pos, self.position); | |
1317 | } | |
1318 | ||
1319 | // Quickly skip by large portions unrelated to our substring | |
e9174d1e | 1320 | if !self.byteset_contains(tail_byte) { |
c1a9b12d SL |
1321 | self.position += needle.len(); |
1322 | if !long_period { | |
1323 | self.memory = 0; | |
1324 | } | |
1325 | continue 'search; | |
1326 | } | |
1327 | ||
1328 | // See if the right part of the needle matches | |
dfeec247 XL |
1329 | let start = |
1330 | if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; | |
c1a9b12d SL |
1331 | for i in start..needle.len() { |
1332 | if needle[i] != haystack[self.position + i] { | |
1333 | self.position += i - self.crit_pos + 1; | |
1334 | if !long_period { | |
1335 | self.memory = 0; | |
1336 | } | |
1337 | continue 'search; | |
1338 | } | |
1339 | } | |
1340 | ||
1341 | // See if the left part of the needle matches | |
1342 | let start = if long_period { 0 } else { self.memory }; | |
1343 | for i in (start..self.crit_pos).rev() { | |
1344 | if needle[i] != haystack[self.position + i] { | |
1345 | self.position += self.period; | |
1346 | if !long_period { | |
1347 | self.memory = needle.len() - self.period; | |
1348 | } | |
1349 | continue 'search; | |
1350 | } | |
1351 | } | |
1352 | ||
1353 | // We have found a match! | |
1354 | let match_pos = self.position; | |
1355 | ||
1356 | // Note: add self.period instead of needle.len() to have overlapping matches | |
1357 | self.position += needle.len(); | |
1358 | if !long_period { | |
1359 | self.memory = 0; // set to needle.len() - self.period for overlapping matches | |
1360 | } | |
1361 | ||
1362 | return S::matching(match_pos, match_pos + needle.len()); | |
1363 | } | |
1364 | } | |
1365 | ||
1366 | // Follows the ideas in `next()`. | |
1367 | // | |
e9174d1e | 1368 | // The definitions are symmetrical, with period(x) = period(reverse(x)) |
c1a9b12d | 1369 | // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) |
e9174d1e SL |
1370 | // is a critical factorization, so is (reverse(v), reverse(u)). |
1371 | // | |
1372 | // For the reverse case we have computed a critical factorization x = u' v' | |
1373 | // (field `crit_pos_back`). We need |u| < period(x) for the forward case and | |
1374 | // thus |v'| < period(x) for the reverse. | |
c1a9b12d SL |
1375 | // |
1376 | // To search in reverse through the haystack, we search forward through | |
e9174d1e | 1377 | // a reversed haystack with a reversed needle, matching first u' and then v'. |
c1a9b12d | 1378 | #[inline] |
dfeec247 XL |
1379 | fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output |
1380 | where | |
1381 | S: TwoWayStrategy, | |
c1a9b12d SL |
1382 | { |
1383 | // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` | |
1384 | // are independent. | |
1385 | let old_end = self.end; | |
1386 | 'search: loop { | |
1387 | // Check that we have room to search in | |
e9174d1e SL |
1388 | // end - needle.len() will wrap around when there is no more room, |
1389 | // but due to slice length limits it can never wrap all the way back | |
1390 | // into the length of haystack. | |
1391 | let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { | |
1392 | Some(&b) => b, | |
1393 | None => { | |
1394 | self.end = 0; | |
1395 | return S::rejecting(0, old_end); | |
1396 | } | |
1397 | }; | |
c1a9b12d SL |
1398 | |
1399 | if S::use_early_reject() && old_end != self.end { | |
1400 | return S::rejecting(self.end, old_end); | |
1401 | } | |
1402 | ||
1403 | // Quickly skip by large portions unrelated to our substring | |
e9174d1e | 1404 | if !self.byteset_contains(front_byte) { |
c1a9b12d | 1405 | self.end -= needle.len(); |
e9174d1e SL |
1406 | if !long_period { |
1407 | self.memory_back = needle.len(); | |
1408 | } | |
c1a9b12d SL |
1409 | continue 'search; |
1410 | } | |
1411 | ||
1412 | // See if the left part of the needle matches | |
dfeec247 XL |
1413 | let crit = if long_period { |
1414 | self.crit_pos_back | |
1415 | } else { | |
1416 | cmp::min(self.crit_pos_back, self.memory_back) | |
1417 | }; | |
e9174d1e | 1418 | for i in (0..crit).rev() { |
c1a9b12d | 1419 | if needle[i] != haystack[self.end - needle.len() + i] { |
e9174d1e SL |
1420 | self.end -= self.crit_pos_back - i; |
1421 | if !long_period { | |
1422 | self.memory_back = needle.len(); | |
1423 | } | |
c1a9b12d SL |
1424 | continue 'search; |
1425 | } | |
1426 | } | |
1427 | ||
1428 | // See if the right part of the needle matches | |
dfeec247 | 1429 | let needle_end = if long_period { needle.len() } else { self.memory_back }; |
e9174d1e | 1430 | for i in self.crit_pos_back..needle_end { |
c1a9b12d SL |
1431 | if needle[i] != haystack[self.end - needle.len() + i] { |
1432 | self.end -= self.period; | |
e9174d1e SL |
1433 | if !long_period { |
1434 | self.memory_back = self.period; | |
1435 | } | |
c1a9b12d SL |
1436 | continue 'search; |
1437 | } | |
1438 | } | |
1439 | ||
1440 | // We have found a match! | |
1441 | let match_pos = self.end - needle.len(); | |
1442 | // Note: sub self.period instead of needle.len() to have overlapping matches | |
1443 | self.end -= needle.len(); | |
e9174d1e SL |
1444 | if !long_period { |
1445 | self.memory_back = needle.len(); | |
1446 | } | |
c1a9b12d SL |
1447 | |
1448 | return S::matching(match_pos, match_pos + needle.len()); | |
1449 | } | |
1450 | } | |
1451 | ||
e9174d1e SL |
1452 | // Compute the maximal suffix of `arr`. |
1453 | // | |
1454 | // The maximal suffix is a possible critical factorization (u, v) of `arr`. | |
1455 | // | |
1456 | // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the | |
1457 | // period of v. | |
1458 | // | |
1459 | // `order_greater` determines if lexical order is `<` or `>`. Both | |
1460 | // orders must be computed -- the ordering with the largest `i` gives | |
1461 | // a critical factorization. | |
1462 | // | |
1463 | // For long period cases, the resulting period is not exact (it is too short). | |
c1a9b12d | 1464 | #[inline] |
e9174d1e SL |
1465 | fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { |
1466 | let mut left = 0; // Corresponds to i in the paper | |
1467 | let mut right = 1; // Corresponds to j in the paper | |
1468 | let mut offset = 0; // Corresponds to k in the paper, but starting at 0 | |
dfeec247 | 1469 | // to match 0-based indexing. |
c1a9b12d SL |
1470 | let mut period = 1; // Corresponds to p in the paper |
1471 | ||
e9174d1e SL |
1472 | while let Some(&a) = arr.get(right + offset) { |
1473 | // `left` will be inbounds when `right` is. | |
1474 | let b = arr[left + offset]; | |
1475 | if (a < b && !order_greater) || (a > b && order_greater) { | |
1476 | // Suffix is smaller, period is entire prefix so far. | |
1477 | right += offset + 1; | |
1478 | offset = 0; | |
1479 | period = right - left; | |
1480 | } else if a == b { | |
1481 | // Advance through repetition of the current period. | |
1482 | if offset + 1 == period { | |
1483 | right += offset + 1; | |
1484 | offset = 0; | |
1485 | } else { | |
1486 | offset += 1; | |
1487 | } | |
c1a9b12d | 1488 | } else { |
e9174d1e SL |
1489 | // Suffix is larger, start over from current location. |
1490 | left = right; | |
1491 | right += 1; | |
1492 | offset = 0; | |
1493 | period = 1; | |
c1a9b12d | 1494 | } |
e9174d1e SL |
1495 | } |
1496 | (left, period) | |
1497 | } | |
1498 | ||
1499 | // Compute the maximal suffix of the reverse of `arr`. | |
1500 | // | |
1501 | // The maximal suffix is a possible critical factorization (u', v') of `arr`. | |
1502 | // | |
1503 | // Returns `i` where `i` is the starting index of v', from the back; | |
3b2f2976 | 1504 | // returns immediately when a period of `known_period` is reached. |
e9174d1e SL |
1505 | // |
1506 | // `order_greater` determines if lexical order is `<` or `>`. Both | |
1507 | // orders must be computed -- the ordering with the largest `i` gives | |
1508 | // a critical factorization. | |
1509 | // | |
1510 | // For long period cases, the resulting period is not exact (it is too short). | |
dfeec247 | 1511 | fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { |
e9174d1e SL |
1512 | let mut left = 0; // Corresponds to i in the paper |
1513 | let mut right = 1; // Corresponds to j in the paper | |
1514 | let mut offset = 0; // Corresponds to k in the paper, but starting at 0 | |
dfeec247 | 1515 | // to match 0-based indexing. |
e9174d1e SL |
1516 | let mut period = 1; // Corresponds to p in the paper |
1517 | let n = arr.len(); | |
1518 | ||
1519 | while right + offset < n { | |
1520 | let a = arr[n - (1 + right + offset)]; | |
1521 | let b = arr[n - (1 + left + offset)]; | |
1522 | if (a < b && !order_greater) || (a > b && order_greater) { | |
c1a9b12d | 1523 | // Suffix is smaller, period is entire prefix so far. |
e9174d1e SL |
1524 | right += offset + 1; |
1525 | offset = 0; | |
1526 | period = right - left; | |
c1a9b12d SL |
1527 | } else if a == b { |
1528 | // Advance through repetition of the current period. | |
e9174d1e SL |
1529 | if offset + 1 == period { |
1530 | right += offset + 1; | |
1531 | offset = 0; | |
c1a9b12d SL |
1532 | } else { |
1533 | offset += 1; | |
1534 | } | |
1535 | } else { | |
1536 | // Suffix is larger, start over from current location. | |
1537 | left = right; | |
1538 | right += 1; | |
e9174d1e | 1539 | offset = 0; |
c1a9b12d SL |
1540 | period = 1; |
1541 | } | |
e9174d1e SL |
1542 | if period == known_period { |
1543 | break; | |
1544 | } | |
c1a9b12d | 1545 | } |
e9174d1e SL |
1546 | debug_assert!(period <= known_period); |
1547 | left | |
c1a9b12d SL |
1548 | } |
1549 | } | |
1550 | ||
1551 | // TwoWayStrategy allows the algorithm to either skip non-matches as quickly | |
1552 | // as possible, or to work in a mode where it emits Rejects relatively quickly. | |
1553 | trait TwoWayStrategy { | |
1554 | type Output; | |
1555 | fn use_early_reject() -> bool; | |
8bb4bdeb XL |
1556 | fn rejecting(a: usize, b: usize) -> Self::Output; |
1557 | fn matching(a: usize, b: usize) -> Self::Output; | |
c1a9b12d SL |
1558 | } |
1559 | ||
1560 | /// Skip to match intervals as quickly as possible | |
dfeec247 | 1561 | enum MatchOnly {} |
c1a9b12d SL |
1562 | |
1563 | impl TwoWayStrategy for MatchOnly { | |
1564 | type Output = Option<(usize, usize)>; | |
1565 | ||
1566 | #[inline] | |
dfeec247 XL |
1567 | fn use_early_reject() -> bool { |
1568 | false | |
1569 | } | |
c1a9b12d | 1570 | #[inline] |
dfeec247 XL |
1571 | fn rejecting(_a: usize, _b: usize) -> Self::Output { |
1572 | None | |
1573 | } | |
c1a9b12d | 1574 | #[inline] |
dfeec247 XL |
1575 | fn matching(a: usize, b: usize) -> Self::Output { |
1576 | Some((a, b)) | |
1577 | } | |
c1a9b12d SL |
1578 | } |
1579 | ||
1580 | /// Emit Rejects regularly | |
dfeec247 | 1581 | enum RejectAndMatch {} |
c1a9b12d SL |
1582 | |
1583 | impl TwoWayStrategy for RejectAndMatch { | |
1584 | type Output = SearchStep; | |
1585 | ||
1586 | #[inline] | |
dfeec247 XL |
1587 | fn use_early_reject() -> bool { |
1588 | true | |
1589 | } | |
c1a9b12d | 1590 | #[inline] |
dfeec247 XL |
1591 | fn rejecting(a: usize, b: usize) -> Self::Output { |
1592 | SearchStep::Reject(a, b) | |
1593 | } | |
c1a9b12d | 1594 | #[inline] |
dfeec247 XL |
1595 | fn matching(a: usize, b: usize) -> Self::Output { |
1596 | SearchStep::Match(a, b) | |
1597 | } | |
c1a9b12d | 1598 | } |