]>
Commit | Line | Data |
---|---|---|
9346a6ac AL |
1 | //! The string Pattern API. |
2 | //! | |
9fa01778 XL |
3 | //! For more details, see the traits [`Pattern`], [`Searcher`], |
4 | //! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. | |
9346a6ac | 5 | |
60c5eb7d XL |
6 | // ignore-tidy-undocumented-unsafe |
7 | ||
62682a34 | 8 | #![unstable(feature = "pattern", |
e9174d1e SL |
9 | reason = "API not fully fleshed out and ready to be stabilized", |
10 | issue = "27721")] | |
11 | ||
48663c56 XL |
12 | use crate::cmp; |
13 | use crate::fmt; | |
14 | use crate::slice::memchr; | |
15 | use crate::usize; | |
c34b1796 AL |
16 | |
17 | // Pattern | |
18 | ||
19 | /// A string pattern. | |
20 | /// | |
21 | /// A `Pattern<'a>` expresses that the implementing type | |
22 | /// can be used as a string pattern for searching in a `&'a str`. | |
23 | /// | |
24 | /// For example, both `'a'` and `"aa"` are patterns that | |
25 | /// would match at index `1` in the string `"baaaab"`. | |
26 | /// | |
27 | /// The trait itself acts as a builder for an associated | |
28 | /// `Searcher` type, which does the actual work of finding | |
29 | /// occurrences of the pattern in a string. | |
30 | pub trait Pattern<'a>: Sized { | |
31 | /// Associated searcher for this pattern | |
32 | type Searcher: Searcher<'a>; | |
33 | ||
9346a6ac | 34 | /// Constructs the associated searcher from |
c34b1796 AL |
35 | /// `self` and the `haystack` to search in. |
36 | fn into_searcher(self, haystack: &'a str) -> Self::Searcher; | |
37 | ||
9346a6ac | 38 | /// Checks whether the pattern matches anywhere in the haystack |
c34b1796 AL |
39 | #[inline] |
40 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
41 | self.into_searcher(haystack).next_match().is_some() | |
42 | } | |
43 | ||
9346a6ac | 44 | /// Checks whether the pattern matches at the front of the haystack |
c34b1796 AL |
45 | #[inline] |
46 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
47 | match self.into_searcher(haystack).next() { | |
48 | SearchStep::Match(0, _) => true, | |
49 | _ => false, | |
50 | } | |
51 | } | |
52 | ||
9346a6ac | 53 | /// Checks whether the pattern matches at the back of the haystack |
c34b1796 AL |
54 | #[inline] |
55 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
56 | where Self::Searcher: ReverseSearcher<'a> | |
57 | { | |
58 | match self.into_searcher(haystack).next_back() { | |
59 | SearchStep::Match(_, j) if haystack.len() == j => true, | |
60 | _ => false, | |
61 | } | |
62 | } | |
63 | } | |
64 | ||
65 | // Searcher | |
66 | ||
67 | /// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`. | |
68 | #[derive(Copy, Clone, Eq, PartialEq, Debug)] | |
69 | pub enum SearchStep { | |
70 | /// Expresses that a match of the pattern has been found at | |
71 | /// `haystack[a..b]`. | |
72 | Match(usize, usize), | |
73 | /// Expresses that `haystack[a..b]` has been rejected as a possible match | |
74 | /// of the pattern. | |
75 | /// | |
76 | /// Note that there might be more than one `Reject` between two `Match`es, | |
77 | /// there is no requirement for them to be combined into one. | |
78 | Reject(usize, usize), | |
3b2f2976 | 79 | /// Expresses that every byte of the haystack has been visited, ending |
c34b1796 AL |
80 | /// the iteration. |
81 | Done | |
82 | } | |
83 | ||
84 | /// A searcher for a string pattern. | |
85 | /// | |
86 | /// This trait provides methods for searching for non-overlapping | |
87 | /// matches of a pattern starting from the front (left) of a string. | |
88 | /// | |
89 | /// It will be implemented by associated `Searcher` | |
90 | /// types of the `Pattern` trait. | |
91 | /// | |
92 | /// The trait is marked unsafe because the indices returned by the | |
93 | /// `next()` methods are required to lie on valid utf8 boundaries in | |
94 | /// the haystack. This enables consumers of this trait to | |
95 | /// slice the haystack without additional runtime checks. | |
96 | pub unsafe trait Searcher<'a> { | |
3b2f2976 | 97 | /// Getter for the underlying string to be searched in |
c34b1796 AL |
98 | /// |
99 | /// Will always return the same `&str` | |
100 | fn haystack(&self) -> &'a str; | |
101 | ||
102 | /// Performs the next search step starting from the front. | |
103 | /// | |
104 | /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. | |
105 | /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the | |
106 | /// pattern, even partially. | |
107 | /// - Returns `Done` if every byte of the haystack has been visited | |
108 | /// | |
109 | /// The stream of `Match` and `Reject` values up to a `Done` | |
110 | /// will contain index ranges that are adjacent, non-overlapping, | |
111 | /// covering the whole haystack, and laying on utf8 boundaries. | |
112 | /// | |
113 | /// A `Match` result needs to contain the whole matched pattern, | |
114 | /// however `Reject` results may be split up into arbitrary | |
115 | /// many adjacent fragments. Both ranges may have zero length. | |
116 | /// | |
117 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
118 | /// might produce the stream | |
119 | /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` | |
120 | fn next(&mut self) -> SearchStep; | |
121 | ||
9fa01778 | 122 | /// Finds the next `Match` result. See `next()` |
ff7c6d11 XL |
123 | /// |
124 | /// Unlike next(), there is no guarantee that the returned ranges | |
125 | /// of this and next_reject will overlap. This will return (start_match, end_match), | |
126 | /// where start_match is the index of where the match begins, and end_match is | |
127 | /// the index after the end of the match. | |
c34b1796 AL |
128 | #[inline] |
129 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
130 | loop { | |
131 | match self.next() { | |
132 | SearchStep::Match(a, b) => return Some((a, b)), | |
133 | SearchStep::Done => return None, | |
134 | _ => continue, | |
135 | } | |
136 | } | |
137 | } | |
138 | ||
9fa01778 | 139 | /// Finds the next `Reject` result. See `next()` and `next_match()` |
ff7c6d11 XL |
140 | /// |
141 | /// Unlike next(), there is no guarantee that the returned ranges | |
142 | /// of this and next_match will overlap. | |
c34b1796 AL |
143 | #[inline] |
144 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
145 | loop { | |
146 | match self.next() { | |
147 | SearchStep::Reject(a, b) => return Some((a, b)), | |
148 | SearchStep::Done => return None, | |
149 | _ => continue, | |
150 | } | |
151 | } | |
152 | } | |
153 | } | |
154 | ||
155 | /// A reverse searcher for a string pattern. | |
156 | /// | |
157 | /// This trait provides methods for searching for non-overlapping | |
158 | /// matches of a pattern starting from the back (right) of a string. | |
159 | /// | |
160 | /// It will be implemented by associated `Searcher` | |
161 | /// types of the `Pattern` trait if the pattern supports searching | |
162 | /// for it from the back. | |
163 | /// | |
164 | /// The index ranges returned by this trait are not required | |
165 | /// to exactly match those of the forward search in reverse. | |
166 | /// | |
167 | /// For the reason why this trait is marked unsafe, see them | |
168 | /// parent trait `Searcher`. | |
169 | pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { | |
170 | /// Performs the next search step starting from the back. | |
171 | /// | |
172 | /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. | |
173 | /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the | |
174 | /// pattern, even partially. | |
175 | /// - Returns `Done` if every byte of the haystack has been visited | |
176 | /// | |
177 | /// The stream of `Match` and `Reject` values up to a `Done` | |
178 | /// will contain index ranges that are adjacent, non-overlapping, | |
179 | /// covering the whole haystack, and laying on utf8 boundaries. | |
180 | /// | |
181 | /// A `Match` result needs to contain the whole matched pattern, | |
182 | /// however `Reject` results may be split up into arbitrary | |
183 | /// many adjacent fragments. Both ranges may have zero length. | |
184 | /// | |
185 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
186 | /// might produce the stream | |
187 | /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]` | |
188 | fn next_back(&mut self) -> SearchStep; | |
189 | ||
9fa01778 | 190 | /// Finds the next `Match` result. See `next_back()` |
c34b1796 AL |
191 | #[inline] |
192 | fn next_match_back(&mut self) -> Option<(usize, usize)>{ | |
193 | loop { | |
194 | match self.next_back() { | |
195 | SearchStep::Match(a, b) => return Some((a, b)), | |
196 | SearchStep::Done => return None, | |
197 | _ => continue, | |
198 | } | |
199 | } | |
200 | } | |
201 | ||
9fa01778 | 202 | /// Finds the next `Reject` result. See `next_back()` |
c34b1796 AL |
203 | #[inline] |
204 | fn next_reject_back(&mut self) -> Option<(usize, usize)>{ | |
205 | loop { | |
206 | match self.next_back() { | |
207 | SearchStep::Reject(a, b) => return Some((a, b)), | |
208 | SearchStep::Done => return None, | |
209 | _ => continue, | |
210 | } | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | /// A marker trait to express that a `ReverseSearcher` | |
216 | /// can be used for a `DoubleEndedIterator` implementation. | |
217 | /// | |
218 | /// For this, the impl of `Searcher` and `ReverseSearcher` need | |
219 | /// to follow these conditions: | |
220 | /// | |
221 | /// - All results of `next()` need to be identical | |
222 | /// to the results of `next_back()` in reverse order. | |
223 | /// - `next()` and `next_back()` need to behave as | |
224 | /// the two ends of a range of values, that is they | |
225 | /// can not "walk past each other". | |
226 | /// | |
227 | /// # Examples | |
228 | /// | |
229 | /// `char::Searcher` is a `DoubleEndedSearcher` because searching for a | |
230 | /// `char` only requires looking at one at a time, which behaves the same | |
231 | /// from both ends. | |
232 | /// | |
233 | /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because | |
234 | /// the pattern `"aa"` in the haystack `"aaa"` matches as either | |
235 | /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. | |
236 | pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} | |
237 | ||
ff7c6d11 | 238 | |
9346a6ac | 239 | ///////////////////////////////////////////////////////////////////////////// |
ff7c6d11 | 240 | // Impl for char |
9346a6ac | 241 | ///////////////////////////////////////////////////////////////////////////// |
c34b1796 | 242 | |
ff7c6d11 XL |
243 | /// Associated type for `<char as Pattern<'a>>::Searcher`. |
244 | #[derive(Clone, Debug)] | |
245 | pub struct CharSearcher<'a> { | |
246 | haystack: &'a str, | |
247 | // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` | |
248 | // This invariant can be broken *within* next_match and next_match_back, however | |
249 | // they must exit with fingers on valid code point boundaries. | |
250 | ||
251 | /// `finger` is the current byte index of the forward search. | |
252 | /// Imagine that it exists before the byte at its index, i.e. | |
83c7162d | 253 | /// `haystack[finger]` is the first byte of the slice we must inspect during |
ff7c6d11 XL |
254 | /// forward searching |
255 | finger: usize, | |
256 | /// `finger_back` is the current byte index of the reverse search. | |
257 | /// Imagine that it exists after the byte at its index, i.e. | |
258 | /// haystack[finger_back - 1] is the last byte of the slice we must inspect during | |
259 | /// forward searching (and thus the first byte to be inspected when calling next_back()) | |
260 | finger_back: usize, | |
261 | /// The character being searched for | |
262 | needle: char, | |
263 | ||
264 | // safety invariant: `utf8_size` must be less than 5 | |
265 | /// The number of bytes `needle` takes up when encoded in utf8 | |
266 | utf8_size: usize, | |
267 | /// A utf8 encoded copy of the `needle` | |
268 | utf8_encoded: [u8; 4], | |
c34b1796 AL |
269 | } |
270 | ||
ff7c6d11 XL |
271 | unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { |
272 | #[inline] | |
273 | fn haystack(&self) -> &'a str { | |
274 | self.haystack | |
275 | } | |
276 | #[inline] | |
277 | fn next(&mut self) -> SearchStep { | |
278 | let old_finger = self.finger; | |
279 | let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; | |
280 | let mut iter = slice.chars(); | |
281 | let old_len = iter.iter.len(); | |
282 | if let Some(ch) = iter.next() { | |
283 | // add byte offset of current character | |
284 | // without re-encoding as utf-8 | |
285 | self.finger += old_len - iter.iter.len(); | |
286 | if ch == self.needle { | |
287 | SearchStep::Match(old_finger, self.finger) | |
288 | } else { | |
289 | SearchStep::Reject(old_finger, self.finger) | |
290 | } | |
291 | } else { | |
292 | SearchStep::Done | |
293 | } | |
294 | } | |
c34b1796 | 295 | #[inline] |
ff7c6d11 XL |
296 | fn next_match(&mut self) -> Option<(usize, usize)> { |
297 | loop { | |
298 | // get the haystack after the last character found | |
60c5eb7d | 299 | let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?; |
ff7c6d11 XL |
300 | // the last byte of the utf8 encoded needle |
301 | let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; | |
302 | if let Some(index) = memchr::memchr(last_byte, bytes) { | |
303 | // The new finger is the index of the byte we found, | |
304 | // plus one, since we memchr'd for the last byte of the character. | |
305 | // | |
306 | // Note that this doesn't always give us a finger on a UTF8 boundary. | |
307 | // If we *didn't* find our character | |
308 | // we may have indexed to the non-last byte of a 3-byte or 4-byte character. | |
309 | // We can't just skip to the next valid starting byte because a character like | |
310 | // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find | |
311 | // the second byte when searching for the third. | |
312 | // | |
313 | // However, this is totally okay. While we have the invariant that | |
0531ce1d | 314 | // self.finger is on a UTF8 boundary, this invariant is not relied upon |
ff7c6d11 XL |
315 | // within this method (it is relied upon in CharSearcher::next()). |
316 | // | |
317 | // We only exit this method when we reach the end of the string, or if we | |
318 | // find something. When we find something the `finger` will be set | |
319 | // to a UTF8 boundary. | |
320 | self.finger += index + 1; | |
321 | if self.finger >= self.utf8_size { | |
322 | let found_char = self.finger - self.utf8_size; | |
323 | if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { | |
324 | if slice == &self.utf8_encoded[0..self.utf8_size] { | |
325 | return Some((found_char, self.finger)); | |
326 | } | |
327 | } | |
328 | } | |
329 | } else { | |
330 | // found nothing, exit | |
331 | self.finger = self.finger_back; | |
332 | return None; | |
333 | } | |
334 | } | |
335 | } | |
336 | ||
337 | // let next_reject use the default implementation from the Searcher trait | |
338 | } | |
c34b1796 | 339 | |
ff7c6d11 | 340 | unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { |
c34b1796 | 341 | #[inline] |
ff7c6d11 XL |
342 | fn next_back(&mut self) -> SearchStep { |
343 | let old_finger = self.finger_back; | |
8faf50e0 | 344 | let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; |
ff7c6d11 XL |
345 | let mut iter = slice.chars(); |
346 | let old_len = iter.iter.len(); | |
347 | if let Some(ch) = iter.next_back() { | |
348 | // subtract byte offset of current character | |
349 | // without re-encoding as utf-8 | |
350 | self.finger_back -= old_len - iter.iter.len(); | |
351 | if ch == self.needle { | |
352 | SearchStep::Match(self.finger_back, old_finger) | |
353 | } else { | |
354 | SearchStep::Reject(self.finger_back, old_finger) | |
355 | } | |
356 | } else { | |
357 | SearchStep::Done | |
358 | } | |
359 | } | |
360 | #[inline] | |
361 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
362 | let haystack = self.haystack.as_bytes(); | |
363 | loop { | |
364 | // get the haystack up to but not including the last character searched | |
365 | let bytes = if let Some(slice) = haystack.get(self.finger..self.finger_back) { | |
366 | slice | |
367 | } else { | |
368 | return None; | |
369 | }; | |
370 | // the last byte of the utf8 encoded needle | |
371 | let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; | |
372 | if let Some(index) = memchr::memrchr(last_byte, bytes) { | |
373 | // we searched a slice that was offset by self.finger, | |
374 | // add self.finger to recoup the original index | |
375 | let index = self.finger + index; | |
376 | // memrchr will return the index of the byte we wish to | |
377 | // find. In case of an ASCII character, this is indeed | |
378 | // were we wish our new finger to be ("after" the found | |
379 | // char in the paradigm of reverse iteration). For | |
380 | // multibyte chars we need to skip down by the number of more | |
381 | // bytes they have than ASCII | |
382 | let shift = self.utf8_size - 1; | |
383 | if index >= shift { | |
384 | let found_char = index - shift; | |
385 | if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { | |
386 | if slice == &self.utf8_encoded[0..self.utf8_size] { | |
0731742a | 387 | // move finger to before the character found (i.e., at its start index) |
ff7c6d11 XL |
388 | self.finger_back = found_char; |
389 | return Some((self.finger_back, self.finger_back + self.utf8_size)); | |
390 | } | |
391 | } | |
392 | } | |
393 | // We can't use finger_back = index - size + 1 here. If we found the last char | |
394 | // of a different-sized character (or the middle byte of a different character) | |
395 | // we need to bump the finger_back down to `index`. This similarly makes | |
396 | // `finger_back` have the potential to no longer be on a boundary, | |
397 | // but this is OK since we only exit this function on a boundary | |
398 | // or when the haystack has been searched completely. | |
399 | // | |
400 | // Unlike next_match this does not | |
401 | // have the problem of repeated bytes in utf-8 because | |
402 | // we're searching for the last byte, and we can only have | |
403 | // found the last byte when searching in reverse. | |
404 | self.finger_back = index; | |
405 | } else { | |
406 | self.finger_back = self.finger; | |
407 | // found nothing, exit | |
408 | return None; | |
409 | } | |
410 | } | |
411 | } | |
412 | ||
413 | // let next_reject_back use the default implementation from the Searcher trait | |
c34b1796 AL |
414 | } |
415 | ||
ff7c6d11 XL |
416 | impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} |
417 | ||
418 | /// Searches for chars that are equal to a given char | |
419 | impl<'a> Pattern<'a> for char { | |
420 | type Searcher = CharSearcher<'a>; | |
421 | ||
c34b1796 | 422 | #[inline] |
ff7c6d11 XL |
423 | fn into_searcher(self, haystack: &'a str) -> Self::Searcher { |
424 | let mut utf8_encoded = [0; 4]; | |
0731742a | 425 | let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); |
ff7c6d11 XL |
426 | CharSearcher { |
427 | haystack, | |
428 | finger: 0, | |
429 | finger_back: haystack.len(), | |
430 | needle: self, | |
431 | utf8_size, | |
432 | utf8_encoded | |
433 | } | |
434 | } | |
c34b1796 AL |
435 | |
436 | #[inline] | |
ff7c6d11 XL |
437 | fn is_contained_in(self, haystack: &'a str) -> bool { |
438 | if (self as u32) < 128 { | |
439 | haystack.as_bytes().contains(&(self as u8)) | |
440 | } else { | |
441 | let mut buffer = [0u8; 4]; | |
442 | self.encode_utf8(&mut buffer).is_contained_in(haystack) | |
443 | } | |
444 | } | |
c34b1796 | 445 | |
c34b1796 | 446 | #[inline] |
ff7c6d11 | 447 | fn is_prefix_of(self, haystack: &'a str) -> bool { |
60c5eb7d | 448 | self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack) |
ff7c6d11 XL |
449 | } |
450 | ||
451 | #[inline] | |
452 | fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> | |
453 | { | |
60c5eb7d | 454 | self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) |
c34b1796 | 455 | } |
ff7c6d11 XL |
456 | } |
457 | ||
458 | ///////////////////////////////////////////////////////////////////////////// | |
459 | // Impl for a MultiCharEq wrapper | |
460 | ///////////////////////////////////////////////////////////////////////////// | |
461 | ||
462 | #[doc(hidden)] | |
463 | trait MultiCharEq { | |
464 | fn matches(&mut self, c: char) -> bool; | |
465 | } | |
c34b1796 | 466 | |
ff7c6d11 | 467 | impl<F> MultiCharEq for F where F: FnMut(char) -> bool { |
c34b1796 | 468 | #[inline] |
ff7c6d11 XL |
469 | fn matches(&mut self, c: char) -> bool { (*self)(c) } |
470 | } | |
471 | ||
0bf4aa26 | 472 | impl MultiCharEq for &[char] { |
ff7c6d11 XL |
473 | #[inline] |
474 | fn matches(&mut self, c: char) -> bool { | |
475 | self.iter().any(|&m| { m == c }) | |
c34b1796 AL |
476 | } |
477 | } | |
478 | ||
ff7c6d11 | 479 | struct MultiCharEqPattern<C: MultiCharEq>(C); |
c34b1796 | 480 | |
54a0048b | 481 | #[derive(Clone, Debug)] |
ff7c6d11 | 482 | struct MultiCharEqSearcher<'a, C: MultiCharEq> { |
c34b1796 AL |
483 | char_eq: C, |
484 | haystack: &'a str, | |
485 | char_indices: super::CharIndices<'a>, | |
c34b1796 AL |
486 | } |
487 | ||
ff7c6d11 XL |
488 | impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> { |
489 | type Searcher = MultiCharEqSearcher<'a, C>; | |
c34b1796 AL |
490 | |
491 | #[inline] | |
ff7c6d11 XL |
492 | fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { |
493 | MultiCharEqSearcher { | |
3b2f2976 | 494 | haystack, |
c34b1796 AL |
495 | char_eq: self.0, |
496 | char_indices: haystack.char_indices(), | |
497 | } | |
498 | } | |
499 | } | |
500 | ||
ff7c6d11 | 501 | unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { |
c34b1796 AL |
502 | #[inline] |
503 | fn haystack(&self) -> &'a str { | |
504 | self.haystack | |
505 | } | |
506 | ||
507 | #[inline] | |
508 | fn next(&mut self) -> SearchStep { | |
509 | let s = &mut self.char_indices; | |
510 | // Compare lengths of the internal byte slice iterator | |
511 | // to find length of current char | |
3157f602 | 512 | let pre_len = s.iter.iter.len(); |
c34b1796 | 513 | if let Some((i, c)) = s.next() { |
3157f602 | 514 | let len = s.iter.iter.len(); |
c34b1796 AL |
515 | let char_len = pre_len - len; |
516 | if self.char_eq.matches(c) { | |
517 | return SearchStep::Match(i, i + char_len); | |
518 | } else { | |
519 | return SearchStep::Reject(i, i + char_len); | |
520 | } | |
521 | } | |
522 | SearchStep::Done | |
523 | } | |
524 | } | |
525 | ||
ff7c6d11 | 526 | unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { |
c34b1796 AL |
527 | #[inline] |
528 | fn next_back(&mut self) -> SearchStep { | |
529 | let s = &mut self.char_indices; | |
530 | // Compare lengths of the internal byte slice iterator | |
531 | // to find length of current char | |
3157f602 | 532 | let pre_len = s.iter.iter.len(); |
c34b1796 | 533 | if let Some((i, c)) = s.next_back() { |
3157f602 | 534 | let len = s.iter.iter.len(); |
c34b1796 AL |
535 | let char_len = pre_len - len; |
536 | if self.char_eq.matches(c) { | |
537 | return SearchStep::Match(i, i + char_len); | |
538 | } else { | |
539 | return SearchStep::Reject(i, i + char_len); | |
540 | } | |
541 | } | |
542 | SearchStep::Done | |
543 | } | |
544 | } | |
545 | ||
ff7c6d11 | 546 | impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} |
c34b1796 | 547 | |
9346a6ac AL |
548 | ///////////////////////////////////////////////////////////////////////////// |
549 | ||
550 | macro_rules! pattern_methods { | |
551 | ($t:ty, $pmap:expr, $smap:expr) => { | |
552 | type Searcher = $t; | |
553 | ||
554 | #[inline] | |
555 | fn into_searcher(self, haystack: &'a str) -> $t { | |
556 | ($smap)(($pmap)(self).into_searcher(haystack)) | |
c34b1796 | 557 | } |
9346a6ac | 558 | |
c34b1796 AL |
559 | #[inline] |
560 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
9346a6ac | 561 | ($pmap)(self).is_contained_in(haystack) |
c34b1796 | 562 | } |
9346a6ac | 563 | |
c34b1796 AL |
564 | #[inline] |
565 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
9346a6ac | 566 | ($pmap)(self).is_prefix_of(haystack) |
c34b1796 | 567 | } |
9346a6ac | 568 | |
c34b1796 AL |
569 | #[inline] |
570 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
9346a6ac | 571 | where $t: ReverseSearcher<'a> |
c34b1796 | 572 | { |
9346a6ac | 573 | ($pmap)(self).is_suffix_of(haystack) |
c34b1796 AL |
574 | } |
575 | } | |
576 | } | |
577 | ||
9346a6ac AL |
578 | macro_rules! searcher_methods { |
579 | (forward) => { | |
580 | #[inline] | |
581 | fn haystack(&self) -> &'a str { | |
582 | self.0.haystack() | |
583 | } | |
584 | #[inline] | |
585 | fn next(&mut self) -> SearchStep { | |
586 | self.0.next() | |
587 | } | |
588 | #[inline] | |
589 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
590 | self.0.next_match() | |
591 | } | |
592 | #[inline] | |
593 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
594 | self.0.next_reject() | |
595 | } | |
596 | }; | |
597 | (reverse) => { | |
598 | #[inline] | |
599 | fn next_back(&mut self) -> SearchStep { | |
600 | self.0.next_back() | |
601 | } | |
602 | #[inline] | |
603 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
604 | self.0.next_match_back() | |
605 | } | |
606 | #[inline] | |
607 | fn next_reject_back(&mut self) -> Option<(usize, usize)> { | |
608 | self.0.next_reject_back() | |
609 | } | |
610 | } | |
c34b1796 AL |
611 | } |
612 | ||
9346a6ac AL |
613 | ///////////////////////////////////////////////////////////////////////////// |
614 | // Impl for &[char] | |
615 | ///////////////////////////////////////////////////////////////////////////// | |
616 | ||
617 | // Todo: Change / Remove due to ambiguity in meaning. | |
618 | ||
619 | /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. | |
54a0048b | 620 | #[derive(Clone, Debug)] |
ff7c6d11 | 621 | pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); |
c34b1796 AL |
622 | |
623 | unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { | |
9346a6ac | 624 | searcher_methods!(forward); |
c34b1796 | 625 | } |
9346a6ac | 626 | |
c34b1796 | 627 | unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { |
9346a6ac | 628 | searcher_methods!(reverse); |
c34b1796 | 629 | } |
c34b1796 | 630 | |
9346a6ac | 631 | impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} |
c34b1796 | 632 | |
9346a6ac AL |
633 | /// Searches for chars that are equal to any of the chars in the array |
634 | impl<'a, 'b> Pattern<'a> for &'b [char] { | |
ff7c6d11 | 635 | pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); |
c34b1796 AL |
636 | } |
637 | ||
9346a6ac AL |
638 | ///////////////////////////////////////////////////////////////////////////// |
639 | // Impl for F: FnMut(char) -> bool | |
640 | ///////////////////////////////////////////////////////////////////////////// | |
641 | ||
642 | /// Associated type for `<F as Pattern<'a>>::Searcher`. | |
643 | #[derive(Clone)] | |
ff7c6d11 | 644 | pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher) |
9346a6ac | 645 | where F: FnMut(char) -> bool; |
c34b1796 | 646 | |
0bf4aa26 | 647 | impl<F> fmt::Debug for CharPredicateSearcher<'_, F> |
54a0048b SL |
648 | where F: FnMut(char) -> bool |
649 | { | |
48663c56 | 650 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
54a0048b SL |
651 | f.debug_struct("CharPredicateSearcher") |
652 | .field("haystack", &self.0.haystack) | |
653 | .field("char_indices", &self.0.char_indices) | |
54a0048b SL |
654 | .finish() |
655 | } | |
656 | } | |
9346a6ac | 657 | unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> |
c34b1796 AL |
658 | where F: FnMut(char) -> bool |
659 | { | |
9346a6ac | 660 | searcher_methods!(forward); |
c34b1796 | 661 | } |
9346a6ac AL |
662 | |
663 | unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> | |
c34b1796 AL |
664 | where F: FnMut(char) -> bool |
665 | { | |
9346a6ac | 666 | searcher_methods!(reverse); |
c34b1796 | 667 | } |
c34b1796 | 668 | |
9346a6ac AL |
669 | impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> |
670 | where F: FnMut(char) -> bool {} | |
c34b1796 | 671 | |
9346a6ac AL |
672 | /// Searches for chars that match the given predicate |
673 | impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { | |
ff7c6d11 | 674 | pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); |
9346a6ac AL |
675 | } |
676 | ||
677 | ///////////////////////////////////////////////////////////////////////////// | |
678 | // Impl for &&str | |
679 | ///////////////////////////////////////////////////////////////////////////// | |
680 | ||
681 | /// Delegates to the `&str` impl. | |
54a0048b | 682 | impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { |
9346a6ac | 683 | pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); |
c34b1796 | 684 | } |
c1a9b12d SL |
685 | |
686 | ///////////////////////////////////////////////////////////////////////////// | |
687 | // Impl for &str | |
688 | ///////////////////////////////////////////////////////////////////////////// | |
689 | ||
690 | /// Non-allocating substring search. | |
691 | /// | |
692 | /// Will handle the pattern `""` as returning empty matches at each character | |
693 | /// boundary. | |
694 | impl<'a, 'b> Pattern<'a> for &'b str { | |
695 | type Searcher = StrSearcher<'a, 'b>; | |
696 | ||
697 | #[inline] | |
698 | fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { | |
699 | StrSearcher::new(haystack, self) | |
700 | } | |
701 | ||
702 | /// Checks whether the pattern matches at the front of the haystack | |
703 | #[inline] | |
704 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
60c5eb7d | 705 | haystack.as_bytes().starts_with(self.as_bytes()) |
c1a9b12d SL |
706 | } |
707 | ||
708 | /// Checks whether the pattern matches at the back of the haystack | |
709 | #[inline] | |
710 | fn is_suffix_of(self, haystack: &'a str) -> bool { | |
60c5eb7d | 711 | haystack.as_bytes().ends_with(self.as_bytes()) |
c1a9b12d SL |
712 | } |
713 | } | |
714 | ||
715 | ||
716 | ///////////////////////////////////////////////////////////////////////////// | |
717 | // Two Way substring searcher | |
718 | ///////////////////////////////////////////////////////////////////////////// | |
719 | ||
720 | #[derive(Clone, Debug)] | |
721 | /// Associated type for `<&str as Pattern<'a>>::Searcher`. | |
722 | pub struct StrSearcher<'a, 'b> { | |
723 | haystack: &'a str, | |
724 | needle: &'b str, | |
725 | ||
726 | searcher: StrSearcherImpl, | |
727 | } | |
728 | ||
729 | #[derive(Clone, Debug)] | |
730 | enum StrSearcherImpl { | |
731 | Empty(EmptyNeedle), | |
732 | TwoWay(TwoWaySearcher), | |
733 | } | |
734 | ||
735 | #[derive(Clone, Debug)] | |
736 | struct EmptyNeedle { | |
737 | position: usize, | |
738 | end: usize, | |
739 | is_match_fw: bool, | |
740 | is_match_bw: bool, | |
741 | } | |
742 | ||
743 | impl<'a, 'b> StrSearcher<'a, 'b> { | |
744 | fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { | |
745 | if needle.is_empty() { | |
746 | StrSearcher { | |
3b2f2976 XL |
747 | haystack, |
748 | needle, | |
c1a9b12d SL |
749 | searcher: StrSearcherImpl::Empty(EmptyNeedle { |
750 | position: 0, | |
751 | end: haystack.len(), | |
752 | is_match_fw: true, | |
753 | is_match_bw: true, | |
754 | }), | |
755 | } | |
756 | } else { | |
757 | StrSearcher { | |
3b2f2976 XL |
758 | haystack, |
759 | needle, | |
c1a9b12d SL |
760 | searcher: StrSearcherImpl::TwoWay( |
761 | TwoWaySearcher::new(needle.as_bytes(), haystack.len()) | |
762 | ), | |
763 | } | |
764 | } | |
765 | } | |
766 | } | |
767 | ||
768 | unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { | |
041b39d2 XL |
769 | #[inline] |
770 | fn haystack(&self) -> &'a str { | |
771 | self.haystack | |
772 | } | |
c1a9b12d SL |
773 | |
774 | #[inline] | |
775 | fn next(&mut self) -> SearchStep { | |
776 | match self.searcher { | |
777 | StrSearcherImpl::Empty(ref mut searcher) => { | |
778 | // empty needle rejects every char and matches every empty string between them | |
779 | let is_match = searcher.is_match_fw; | |
780 | searcher.is_match_fw = !searcher.is_match_fw; | |
781 | let pos = searcher.position; | |
782 | match self.haystack[pos..].chars().next() { | |
783 | _ if is_match => SearchStep::Match(pos, pos), | |
784 | None => SearchStep::Done, | |
785 | Some(ch) => { | |
786 | searcher.position += ch.len_utf8(); | |
787 | SearchStep::Reject(pos, searcher.position) | |
788 | } | |
789 | } | |
790 | } | |
791 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
792 | // TwoWaySearcher produces valid *Match* indices that split at char boundaries | |
793 | // as long as it does correct matching and that haystack and needle are | |
794 | // valid UTF-8 | |
795 | // *Rejects* from the algorithm can fall on any indices, but we will walk them | |
796 | // manually to the next character boundary, so that they are utf-8 safe. | |
797 | if searcher.position == self.haystack.len() { | |
798 | return SearchStep::Done; | |
799 | } | |
800 | let is_long = searcher.memory == usize::MAX; | |
801 | match searcher.next::<RejectAndMatch>(self.haystack.as_bytes(), | |
802 | self.needle.as_bytes(), | |
803 | is_long) | |
804 | { | |
805 | SearchStep::Reject(a, mut b) => { | |
806 | // skip to next char boundary | |
807 | while !self.haystack.is_char_boundary(b) { | |
808 | b += 1; | |
809 | } | |
810 | searcher.position = cmp::max(b, searcher.position); | |
811 | SearchStep::Reject(a, b) | |
812 | } | |
813 | otherwise => otherwise, | |
814 | } | |
815 | } | |
816 | } | |
817 | } | |
818 | ||
3b2f2976 | 819 | #[inline] |
c1a9b12d SL |
820 | fn next_match(&mut self) -> Option<(usize, usize)> { |
821 | match self.searcher { | |
822 | StrSearcherImpl::Empty(..) => { | |
823 | loop { | |
824 | match self.next() { | |
825 | SearchStep::Match(a, b) => return Some((a, b)), | |
826 | SearchStep::Done => return None, | |
827 | SearchStep::Reject(..) => { } | |
828 | } | |
829 | } | |
830 | } | |
831 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
832 | let is_long = searcher.memory == usize::MAX; | |
e9174d1e SL |
833 | // write out `true` and `false` cases to encourage the compiler |
834 | // to specialize the two cases separately. | |
c1a9b12d SL |
835 | if is_long { |
836 | searcher.next::<MatchOnly>(self.haystack.as_bytes(), | |
837 | self.needle.as_bytes(), | |
838 | true) | |
839 | } else { | |
840 | searcher.next::<MatchOnly>(self.haystack.as_bytes(), | |
841 | self.needle.as_bytes(), | |
842 | false) | |
843 | } | |
844 | } | |
845 | } | |
846 | } | |
c1a9b12d | 847 | } |
e9174d1e | 848 | |
c1a9b12d SL |
849 | unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { |
850 | #[inline] | |
851 | fn next_back(&mut self) -> SearchStep { | |
852 | match self.searcher { | |
853 | StrSearcherImpl::Empty(ref mut searcher) => { | |
854 | let is_match = searcher.is_match_bw; | |
855 | searcher.is_match_bw = !searcher.is_match_bw; | |
856 | let end = searcher.end; | |
857 | match self.haystack[..end].chars().next_back() { | |
858 | _ if is_match => SearchStep::Match(end, end), | |
859 | None => SearchStep::Done, | |
860 | Some(ch) => { | |
861 | searcher.end -= ch.len_utf8(); | |
862 | SearchStep::Reject(searcher.end, end) | |
863 | } | |
864 | } | |
865 | } | |
866 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
867 | if searcher.end == 0 { | |
868 | return SearchStep::Done; | |
869 | } | |
e9174d1e | 870 | let is_long = searcher.memory == usize::MAX; |
c1a9b12d | 871 | match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(), |
e9174d1e SL |
872 | self.needle.as_bytes(), |
873 | is_long) | |
c1a9b12d SL |
874 | { |
875 | SearchStep::Reject(mut a, b) => { | |
876 | // skip to next char boundary | |
877 | while !self.haystack.is_char_boundary(a) { | |
878 | a -= 1; | |
879 | } | |
880 | searcher.end = cmp::min(a, searcher.end); | |
881 | SearchStep::Reject(a, b) | |
882 | } | |
883 | otherwise => otherwise, | |
884 | } | |
885 | } | |
886 | } | |
887 | } | |
888 | ||
889 | #[inline] | |
890 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
891 | match self.searcher { | |
892 | StrSearcherImpl::Empty(..) => { | |
893 | loop { | |
894 | match self.next_back() { | |
895 | SearchStep::Match(a, b) => return Some((a, b)), | |
896 | SearchStep::Done => return None, | |
897 | SearchStep::Reject(..) => { } | |
898 | } | |
899 | } | |
900 | } | |
901 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
e9174d1e SL |
902 | let is_long = searcher.memory == usize::MAX; |
903 | // write out `true` and `false`, like `next_match` | |
904 | if is_long { | |
905 | searcher.next_back::<MatchOnly>(self.haystack.as_bytes(), | |
906 | self.needle.as_bytes(), | |
907 | true) | |
908 | } else { | |
909 | searcher.next_back::<MatchOnly>(self.haystack.as_bytes(), | |
910 | self.needle.as_bytes(), | |
911 | false) | |
912 | } | |
c1a9b12d SL |
913 | } |
914 | } | |
915 | } | |
916 | } | |
917 | ||
e9174d1e | 918 | /// The internal state of the two-way substring search algorithm. |
c1a9b12d SL |
919 | #[derive(Clone, Debug)] |
920 | struct TwoWaySearcher { | |
921 | // constants | |
e9174d1e | 922 | /// critical factorization index |
c1a9b12d | 923 | crit_pos: usize, |
e9174d1e SL |
924 | /// critical factorization index for reversed needle |
925 | crit_pos_back: usize, | |
c1a9b12d | 926 | period: usize, |
e9174d1e SL |
927 | /// `byteset` is an extension (not part of the two way algorithm); |
928 | /// it's a 64-bit "fingerprint" where each set bit `j` corresponds | |
929 | /// to a (byte & 63) == j present in the needle. | |
c1a9b12d SL |
930 | byteset: u64, |
931 | ||
932 | // variables | |
933 | position: usize, | |
934 | end: usize, | |
e9174d1e SL |
935 | /// index into needle before which we have already matched |
936 | memory: usize, | |
937 | /// index into needle after which we have already matched | |
938 | memory_back: usize, | |
c1a9b12d SL |
939 | } |
940 | ||
941 | /* | |
942 | This is the Two-Way search algorithm, which was introduced in the paper: | |
943 | Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. | |
944 | ||
945 | Here's some background information. | |
946 | ||
947 | A *word* is a string of symbols. The *length* of a word should be a familiar | |
948 | notion, and here we denote it for any word x by |x|. | |
949 | (We also allow for the possibility of the *empty word*, a word of length zero). | |
950 | ||
951 | If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a | |
952 | *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. | |
953 | For example, both 1 and 2 are periods for the string "aa". As another example, | |
954 | the only period of the string "abcd" is 4. | |
955 | ||
956 | We denote by period(x) the *smallest* period of x (provided that x is non-empty). | |
957 | This is always well-defined since every non-empty word x has at least one period, | |
958 | |x|. We sometimes call this *the period* of x. | |
959 | ||
960 | If u, v and x are words such that x = uv, where uv is the concatenation of u and | |
961 | v, then we say that (u, v) is a *factorization* of x. | |
962 | ||
963 | Let (u, v) be a factorization for a word x. Then if w is a non-empty word such | |
964 | that both of the following hold | |
965 | ||
966 | - either w is a suffix of u or u is a suffix of w | |
967 | - either w is a prefix of v or v is a prefix of w | |
968 | ||
969 | then w is said to be a *repetition* for the factorization (u, v). | |
970 | ||
971 | Just to unpack this, there are four possibilities here. Let w = "abc". Then we | |
972 | might have: | |
973 | ||
974 | - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") | |
975 | - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") | |
976 | - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") | |
977 | - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") | |
978 | ||
979 | Note that the word vu is a repetition for any factorization (u,v) of x = uv, | |
980 | so every factorization has at least one repetition. | |
981 | ||
982 | If x is a string and (u, v) is a factorization for x, then a *local period* for | |
983 | (u, v) is an integer r such that there is some word w such that |w| = r and w is | |
984 | a repetition for (u, v). | |
985 | ||
986 | We denote by local_period(u, v) the smallest local period of (u, v). We sometimes | |
987 | call this *the local period* of (u, v). Provided that x = uv is non-empty, this | |
988 | is well-defined (because each non-empty word has at least one factorization, as | |
989 | noted above). | |
990 | ||
991 | It can be proven that the following is an equivalent definition of a local period | |
992 | for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for | |
993 | all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are | |
0731742a | 994 | defined. (i.e., i > 0 and i + r < |x|). |
c1a9b12d SL |
995 | |
996 | Using the above reformulation, it is easy to prove that | |
997 | ||
998 | 1 <= local_period(u, v) <= period(uv) | |
999 | ||
1000 | A factorization (u, v) of x such that local_period(u,v) = period(x) is called a | |
1001 | *critical factorization*. | |
1002 | ||
1003 | The algorithm hinges on the following theorem, which is stated without proof: | |
1004 | ||
1005 | **Critical Factorization Theorem** Any word x has at least one critical | |
1006 | factorization (u, v) such that |u| < period(x). | |
1007 | ||
1008 | The purpose of maximal_suffix is to find such a critical factorization. | |
1009 | ||
e9174d1e SL |
1010 | If the period is short, compute another factorization x = u' v' to use |
1011 | for reverse search, chosen instead so that |v'| < period(x). | |
1012 | ||
c1a9b12d SL |
1013 | */ |
1014 | impl TwoWaySearcher { | |
1015 | fn new(needle: &[u8], end: usize) -> TwoWaySearcher { | |
1016 | let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); | |
1017 | let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); | |
1018 | ||
1019 | let (crit_pos, period) = | |
1020 | if crit_pos_false > crit_pos_true { | |
1021 | (crit_pos_false, period_false) | |
1022 | } else { | |
1023 | (crit_pos_true, period_true) | |
1024 | }; | |
1025 | ||
c1a9b12d SL |
1026 | // A particularly readable explanation of what's going on here can be found |
1027 | // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically | |
1028 | // see the code for "Algorithm CP" on p. 323. | |
1029 | // | |
1030 | // What's going on is we have some critical factorization (u, v) of the | |
1031 | // needle, and we want to determine whether u is a suffix of | |
1032 | // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use | |
1033 | // "Algorithm CP2", which is optimized for when the period of the needle | |
1034 | // is large. | |
1035 | if &needle[..crit_pos] == &needle[period.. period + crit_pos] { | |
e9174d1e SL |
1036 | // short period case -- the period is exact |
1037 | // compute a separate critical factorization for the reversed needle | |
1038 | // x = u' v' where |v'| < period(x). | |
1039 | // | |
1040 | // This is sped up by the period being known already. | |
1041 | // Note that a case like x = "acba" may be factored exactly forwards | |
1042 | // (crit_pos = 1, period = 3) while being factored with approximate | |
1043 | // period in reverse (crit_pos = 2, period = 2). We use the given | |
1044 | // reverse factorization but keep the exact period. | |
1045 | let crit_pos_back = needle.len() - cmp::max( | |
1046 | TwoWaySearcher::reverse_maximal_suffix(needle, period, false), | |
1047 | TwoWaySearcher::reverse_maximal_suffix(needle, period, true)); | |
1048 | ||
c1a9b12d | 1049 | TwoWaySearcher { |
3b2f2976 XL |
1050 | crit_pos, |
1051 | crit_pos_back, | |
1052 | period, | |
e9174d1e | 1053 | byteset: Self::byteset_create(&needle[..period]), |
c1a9b12d SL |
1054 | |
1055 | position: 0, | |
3b2f2976 | 1056 | end, |
e9174d1e SL |
1057 | memory: 0, |
1058 | memory_back: needle.len(), | |
c1a9b12d SL |
1059 | } |
1060 | } else { | |
e9174d1e SL |
1061 | // long period case -- we have an approximation to the actual period, |
1062 | // and don't use memorization. | |
1063 | // | |
1064 | // Approximate the period by lower bound max(|u|, |v|) + 1. | |
1065 | // The critical factorization is efficient to use for both forward and | |
1066 | // reverse search. | |
1067 | ||
c1a9b12d | 1068 | TwoWaySearcher { |
3b2f2976 | 1069 | crit_pos, |
e9174d1e | 1070 | crit_pos_back: crit_pos, |
c1a9b12d | 1071 | period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, |
e9174d1e | 1072 | byteset: Self::byteset_create(needle), |
c1a9b12d SL |
1073 | |
1074 | position: 0, | |
3b2f2976 | 1075 | end, |
e9174d1e SL |
1076 | memory: usize::MAX, // Dummy value to signify that the period is long |
1077 | memory_back: usize::MAX, | |
c1a9b12d SL |
1078 | } |
1079 | } | |
1080 | } | |
1081 | ||
e9174d1e SL |
1082 | #[inline] |
1083 | fn byteset_create(bytes: &[u8]) -> u64 { | |
1084 | bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) | |
1085 | } | |
1086 | ||
3b2f2976 | 1087 | #[inline] |
c1a9b12d SL |
1088 | fn byteset_contains(&self, byte: u8) -> bool { |
1089 | (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 | |
1090 | } | |
1091 | ||
1092 | // One of the main ideas of Two-Way is that we factorize the needle into | |
1093 | // two halves, (u, v), and begin trying to find v in the haystack by scanning | |
1094 | // left to right. If v matches, we try to match u by scanning right to left. | |
1095 | // How far we can jump when we encounter a mismatch is all based on the fact | |
1096 | // that (u, v) is a critical factorization for the needle. | |
3b2f2976 | 1097 | #[inline] |
c1a9b12d SL |
1098 | fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) |
1099 | -> S::Output | |
1100 | where S: TwoWayStrategy | |
1101 | { | |
1102 | // `next()` uses `self.position` as its cursor | |
1103 | let old_pos = self.position; | |
e9174d1e | 1104 | let needle_last = needle.len() - 1; |
c1a9b12d SL |
1105 | 'search: loop { |
1106 | // Check that we have room to search in | |
e9174d1e SL |
1107 | // position + needle_last can not overflow if we assume slices |
1108 | // are bounded by isize's range. | |
1109 | let tail_byte = match haystack.get(self.position + needle_last) { | |
1110 | Some(&b) => b, | |
1111 | None => { | |
1112 | self.position = haystack.len(); | |
1113 | return S::rejecting(old_pos, self.position); | |
1114 | } | |
1115 | }; | |
c1a9b12d SL |
1116 | |
1117 | if S::use_early_reject() && old_pos != self.position { | |
1118 | return S::rejecting(old_pos, self.position); | |
1119 | } | |
1120 | ||
1121 | // Quickly skip by large portions unrelated to our substring | |
e9174d1e | 1122 | if !self.byteset_contains(tail_byte) { |
c1a9b12d SL |
1123 | self.position += needle.len(); |
1124 | if !long_period { | |
1125 | self.memory = 0; | |
1126 | } | |
1127 | continue 'search; | |
1128 | } | |
1129 | ||
1130 | // See if the right part of the needle matches | |
1131 | let start = if long_period { self.crit_pos } | |
1132 | else { cmp::max(self.crit_pos, self.memory) }; | |
1133 | for i in start..needle.len() { | |
1134 | if needle[i] != haystack[self.position + i] { | |
1135 | self.position += i - self.crit_pos + 1; | |
1136 | if !long_period { | |
1137 | self.memory = 0; | |
1138 | } | |
1139 | continue 'search; | |
1140 | } | |
1141 | } | |
1142 | ||
1143 | // See if the left part of the needle matches | |
1144 | let start = if long_period { 0 } else { self.memory }; | |
1145 | for i in (start..self.crit_pos).rev() { | |
1146 | if needle[i] != haystack[self.position + i] { | |
1147 | self.position += self.period; | |
1148 | if !long_period { | |
1149 | self.memory = needle.len() - self.period; | |
1150 | } | |
1151 | continue 'search; | |
1152 | } | |
1153 | } | |
1154 | ||
1155 | // We have found a match! | |
1156 | let match_pos = self.position; | |
1157 | ||
1158 | // Note: add self.period instead of needle.len() to have overlapping matches | |
1159 | self.position += needle.len(); | |
1160 | if !long_period { | |
1161 | self.memory = 0; // set to needle.len() - self.period for overlapping matches | |
1162 | } | |
1163 | ||
1164 | return S::matching(match_pos, match_pos + needle.len()); | |
1165 | } | |
1166 | } | |
1167 | ||
1168 | // Follows the ideas in `next()`. | |
1169 | // | |
e9174d1e | 1170 | // The definitions are symmetrical, with period(x) = period(reverse(x)) |
c1a9b12d | 1171 | // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) |
e9174d1e SL |
1172 | // is a critical factorization, so is (reverse(v), reverse(u)). |
1173 | // | |
1174 | // For the reverse case we have computed a critical factorization x = u' v' | |
1175 | // (field `crit_pos_back`). We need |u| < period(x) for the forward case and | |
1176 | // thus |v'| < period(x) for the reverse. | |
c1a9b12d SL |
1177 | // |
1178 | // To search in reverse through the haystack, we search forward through | |
e9174d1e | 1179 | // a reversed haystack with a reversed needle, matching first u' and then v'. |
c1a9b12d | 1180 | #[inline] |
e9174d1e | 1181 | fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) |
c1a9b12d SL |
1182 | -> S::Output |
1183 | where S: TwoWayStrategy | |
1184 | { | |
1185 | // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` | |
1186 | // are independent. | |
1187 | let old_end = self.end; | |
1188 | 'search: loop { | |
1189 | // Check that we have room to search in | |
e9174d1e SL |
1190 | // end - needle.len() will wrap around when there is no more room, |
1191 | // but due to slice length limits it can never wrap all the way back | |
1192 | // into the length of haystack. | |
1193 | let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { | |
1194 | Some(&b) => b, | |
1195 | None => { | |
1196 | self.end = 0; | |
1197 | return S::rejecting(0, old_end); | |
1198 | } | |
1199 | }; | |
c1a9b12d SL |
1200 | |
1201 | if S::use_early_reject() && old_end != self.end { | |
1202 | return S::rejecting(self.end, old_end); | |
1203 | } | |
1204 | ||
1205 | // Quickly skip by large portions unrelated to our substring | |
e9174d1e | 1206 | if !self.byteset_contains(front_byte) { |
c1a9b12d | 1207 | self.end -= needle.len(); |
e9174d1e SL |
1208 | if !long_period { |
1209 | self.memory_back = needle.len(); | |
1210 | } | |
c1a9b12d SL |
1211 | continue 'search; |
1212 | } | |
1213 | ||
1214 | // See if the left part of the needle matches | |
e9174d1e SL |
1215 | let crit = if long_period { self.crit_pos_back } |
1216 | else { cmp::min(self.crit_pos_back, self.memory_back) }; | |
1217 | for i in (0..crit).rev() { | |
c1a9b12d | 1218 | if needle[i] != haystack[self.end - needle.len() + i] { |
e9174d1e SL |
1219 | self.end -= self.crit_pos_back - i; |
1220 | if !long_period { | |
1221 | self.memory_back = needle.len(); | |
1222 | } | |
c1a9b12d SL |
1223 | continue 'search; |
1224 | } | |
1225 | } | |
1226 | ||
1227 | // See if the right part of the needle matches | |
e9174d1e SL |
1228 | let needle_end = if long_period { needle.len() } |
1229 | else { self.memory_back }; | |
1230 | for i in self.crit_pos_back..needle_end { | |
c1a9b12d SL |
1231 | if needle[i] != haystack[self.end - needle.len() + i] { |
1232 | self.end -= self.period; | |
e9174d1e SL |
1233 | if !long_period { |
1234 | self.memory_back = self.period; | |
1235 | } | |
c1a9b12d SL |
1236 | continue 'search; |
1237 | } | |
1238 | } | |
1239 | ||
1240 | // We have found a match! | |
1241 | let match_pos = self.end - needle.len(); | |
1242 | // Note: sub self.period instead of needle.len() to have overlapping matches | |
1243 | self.end -= needle.len(); | |
e9174d1e SL |
1244 | if !long_period { |
1245 | self.memory_back = needle.len(); | |
1246 | } | |
c1a9b12d SL |
1247 | |
1248 | return S::matching(match_pos, match_pos + needle.len()); | |
1249 | } | |
1250 | } | |
1251 | ||
e9174d1e SL |
1252 | // Compute the maximal suffix of `arr`. |
1253 | // | |
1254 | // The maximal suffix is a possible critical factorization (u, v) of `arr`. | |
1255 | // | |
1256 | // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the | |
1257 | // period of v. | |
1258 | // | |
1259 | // `order_greater` determines if lexical order is `<` or `>`. Both | |
1260 | // orders must be computed -- the ordering with the largest `i` gives | |
1261 | // a critical factorization. | |
1262 | // | |
1263 | // For long period cases, the resulting period is not exact (it is too short). | |
c1a9b12d | 1264 | #[inline] |
e9174d1e SL |
1265 | fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { |
1266 | let mut left = 0; // Corresponds to i in the paper | |
1267 | let mut right = 1; // Corresponds to j in the paper | |
1268 | let mut offset = 0; // Corresponds to k in the paper, but starting at 0 | |
1269 | // to match 0-based indexing. | |
c1a9b12d SL |
1270 | let mut period = 1; // Corresponds to p in the paper |
1271 | ||
e9174d1e SL |
1272 | while let Some(&a) = arr.get(right + offset) { |
1273 | // `left` will be inbounds when `right` is. | |
1274 | let b = arr[left + offset]; | |
1275 | if (a < b && !order_greater) || (a > b && order_greater) { | |
1276 | // Suffix is smaller, period is entire prefix so far. | |
1277 | right += offset + 1; | |
1278 | offset = 0; | |
1279 | period = right - left; | |
1280 | } else if a == b { | |
1281 | // Advance through repetition of the current period. | |
1282 | if offset + 1 == period { | |
1283 | right += offset + 1; | |
1284 | offset = 0; | |
1285 | } else { | |
1286 | offset += 1; | |
1287 | } | |
c1a9b12d | 1288 | } else { |
e9174d1e SL |
1289 | // Suffix is larger, start over from current location. |
1290 | left = right; | |
1291 | right += 1; | |
1292 | offset = 0; | |
1293 | period = 1; | |
c1a9b12d | 1294 | } |
e9174d1e SL |
1295 | } |
1296 | (left, period) | |
1297 | } | |
1298 | ||
1299 | // Compute the maximal suffix of the reverse of `arr`. | |
1300 | // | |
1301 | // The maximal suffix is a possible critical factorization (u', v') of `arr`. | |
1302 | // | |
1303 | // Returns `i` where `i` is the starting index of v', from the back; | |
3b2f2976 | 1304 | // returns immediately when a period of `known_period` is reached. |
e9174d1e SL |
1305 | // |
1306 | // `order_greater` determines if lexical order is `<` or `>`. Both | |
1307 | // orders must be computed -- the ordering with the largest `i` gives | |
1308 | // a critical factorization. | |
1309 | // | |
1310 | // For long period cases, the resulting period is not exact (it is too short). | |
1311 | fn reverse_maximal_suffix(arr: &[u8], known_period: usize, | |
1312 | order_greater: bool) -> usize | |
1313 | { | |
1314 | let mut left = 0; // Corresponds to i in the paper | |
1315 | let mut right = 1; // Corresponds to j in the paper | |
1316 | let mut offset = 0; // Corresponds to k in the paper, but starting at 0 | |
1317 | // to match 0-based indexing. | |
1318 | let mut period = 1; // Corresponds to p in the paper | |
1319 | let n = arr.len(); | |
1320 | ||
1321 | while right + offset < n { | |
1322 | let a = arr[n - (1 + right + offset)]; | |
1323 | let b = arr[n - (1 + left + offset)]; | |
1324 | if (a < b && !order_greater) || (a > b && order_greater) { | |
c1a9b12d | 1325 | // Suffix is smaller, period is entire prefix so far. |
e9174d1e SL |
1326 | right += offset + 1; |
1327 | offset = 0; | |
1328 | period = right - left; | |
c1a9b12d SL |
1329 | } else if a == b { |
1330 | // Advance through repetition of the current period. | |
e9174d1e SL |
1331 | if offset + 1 == period { |
1332 | right += offset + 1; | |
1333 | offset = 0; | |
c1a9b12d SL |
1334 | } else { |
1335 | offset += 1; | |
1336 | } | |
1337 | } else { | |
1338 | // Suffix is larger, start over from current location. | |
1339 | left = right; | |
1340 | right += 1; | |
e9174d1e | 1341 | offset = 0; |
c1a9b12d SL |
1342 | period = 1; |
1343 | } | |
e9174d1e SL |
1344 | if period == known_period { |
1345 | break; | |
1346 | } | |
c1a9b12d | 1347 | } |
e9174d1e SL |
1348 | debug_assert!(period <= known_period); |
1349 | left | |
c1a9b12d SL |
1350 | } |
1351 | } | |
1352 | ||
1353 | // TwoWayStrategy allows the algorithm to either skip non-matches as quickly | |
1354 | // as possible, or to work in a mode where it emits Rejects relatively quickly. | |
1355 | trait TwoWayStrategy { | |
1356 | type Output; | |
1357 | fn use_early_reject() -> bool; | |
8bb4bdeb XL |
1358 | fn rejecting(a: usize, b: usize) -> Self::Output; |
1359 | fn matching(a: usize, b: usize) -> Self::Output; | |
c1a9b12d SL |
1360 | } |
1361 | ||
1362 | /// Skip to match intervals as quickly as possible | |
1363 | enum MatchOnly { } | |
1364 | ||
1365 | impl TwoWayStrategy for MatchOnly { | |
1366 | type Output = Option<(usize, usize)>; | |
1367 | ||
1368 | #[inline] | |
1369 | fn use_early_reject() -> bool { false } | |
1370 | #[inline] | |
1371 | fn rejecting(_a: usize, _b: usize) -> Self::Output { None } | |
1372 | #[inline] | |
1373 | fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) } | |
1374 | } | |
1375 | ||
1376 | /// Emit Rejects regularly | |
1377 | enum RejectAndMatch { } | |
1378 | ||
1379 | impl TwoWayStrategy for RejectAndMatch { | |
1380 | type Output = SearchStep; | |
1381 | ||
1382 | #[inline] | |
1383 | fn use_early_reject() -> bool { true } | |
1384 | #[inline] | |
1385 | fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) } | |
1386 | #[inline] | |
1387 | fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) } | |
1388 | } |