]>
Commit | Line | Data |
---|---|---|
c34b1796 AL |
1 | // Copyright 2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
9346a6ac AL |
11 | //! The string Pattern API. |
12 | //! | |
13 | //! For more details, see the traits `Pattern`, `Searcher`, | |
14 | //! `ReverseSearcher` and `DoubleEndedSearcher`. | |
15 | ||
62682a34 SL |
16 | #![unstable(feature = "pattern", |
17 | reason = "API not fully fleshed out and ready to be stabilized")] | |
18 | ||
c34b1796 | 19 | use prelude::*; |
c1a9b12d SL |
20 | use cmp; |
21 | use usize; | |
c34b1796 AL |
22 | |
23 | // Pattern | |
24 | ||
25 | /// A string pattern. | |
26 | /// | |
27 | /// A `Pattern<'a>` expresses that the implementing type | |
28 | /// can be used as a string pattern for searching in a `&'a str`. | |
29 | /// | |
30 | /// For example, both `'a'` and `"aa"` are patterns that | |
31 | /// would match at index `1` in the string `"baaaab"`. | |
32 | /// | |
33 | /// The trait itself acts as a builder for an associated | |
34 | /// `Searcher` type, which does the actual work of finding | |
35 | /// occurrences of the pattern in a string. | |
36 | pub trait Pattern<'a>: Sized { | |
37 | /// Associated searcher for this pattern | |
38 | type Searcher: Searcher<'a>; | |
39 | ||
9346a6ac | 40 | /// Constructs the associated searcher from |
c34b1796 AL |
41 | /// `self` and the `haystack` to search in. |
42 | fn into_searcher(self, haystack: &'a str) -> Self::Searcher; | |
43 | ||
9346a6ac | 44 | /// Checks whether the pattern matches anywhere in the haystack |
c34b1796 AL |
45 | #[inline] |
46 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
47 | self.into_searcher(haystack).next_match().is_some() | |
48 | } | |
49 | ||
9346a6ac | 50 | /// Checks whether the pattern matches at the front of the haystack |
c34b1796 AL |
51 | #[inline] |
52 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
53 | match self.into_searcher(haystack).next() { | |
54 | SearchStep::Match(0, _) => true, | |
55 | _ => false, | |
56 | } | |
57 | } | |
58 | ||
9346a6ac | 59 | /// Checks whether the pattern matches at the back of the haystack |
c34b1796 AL |
60 | #[inline] |
61 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
62 | where Self::Searcher: ReverseSearcher<'a> | |
63 | { | |
64 | match self.into_searcher(haystack).next_back() { | |
65 | SearchStep::Match(_, j) if haystack.len() == j => true, | |
66 | _ => false, | |
67 | } | |
68 | } | |
69 | } | |
70 | ||
71 | // Searcher | |
72 | ||
73 | /// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`. | |
74 | #[derive(Copy, Clone, Eq, PartialEq, Debug)] | |
75 | pub enum SearchStep { | |
76 | /// Expresses that a match of the pattern has been found at | |
77 | /// `haystack[a..b]`. | |
78 | Match(usize, usize), | |
79 | /// Expresses that `haystack[a..b]` has been rejected as a possible match | |
80 | /// of the pattern. | |
81 | /// | |
82 | /// Note that there might be more than one `Reject` between two `Match`es, | |
83 | /// there is no requirement for them to be combined into one. | |
84 | Reject(usize, usize), | |
85 | /// Expresses that every byte of the haystack has been visted, ending | |
86 | /// the iteration. | |
87 | Done | |
88 | } | |
89 | ||
90 | /// A searcher for a string pattern. | |
91 | /// | |
92 | /// This trait provides methods for searching for non-overlapping | |
93 | /// matches of a pattern starting from the front (left) of a string. | |
94 | /// | |
95 | /// It will be implemented by associated `Searcher` | |
96 | /// types of the `Pattern` trait. | |
97 | /// | |
98 | /// The trait is marked unsafe because the indices returned by the | |
99 | /// `next()` methods are required to lie on valid utf8 boundaries in | |
100 | /// the haystack. This enables consumers of this trait to | |
101 | /// slice the haystack without additional runtime checks. | |
102 | pub unsafe trait Searcher<'a> { | |
103 | /// Getter for the underlaying string to be searched in | |
104 | /// | |
105 | /// Will always return the same `&str` | |
106 | fn haystack(&self) -> &'a str; | |
107 | ||
108 | /// Performs the next search step starting from the front. | |
109 | /// | |
110 | /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. | |
111 | /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the | |
112 | /// pattern, even partially. | |
113 | /// - Returns `Done` if every byte of the haystack has been visited | |
114 | /// | |
115 | /// The stream of `Match` and `Reject` values up to a `Done` | |
116 | /// will contain index ranges that are adjacent, non-overlapping, | |
117 | /// covering the whole haystack, and laying on utf8 boundaries. | |
118 | /// | |
119 | /// A `Match` result needs to contain the whole matched pattern, | |
120 | /// however `Reject` results may be split up into arbitrary | |
121 | /// many adjacent fragments. Both ranges may have zero length. | |
122 | /// | |
123 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
124 | /// might produce the stream | |
125 | /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` | |
126 | fn next(&mut self) -> SearchStep; | |
127 | ||
128 | /// Find the next `Match` result. See `next()` | |
129 | #[inline] | |
130 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
131 | loop { | |
132 | match self.next() { | |
133 | SearchStep::Match(a, b) => return Some((a, b)), | |
134 | SearchStep::Done => return None, | |
135 | _ => continue, | |
136 | } | |
137 | } | |
138 | } | |
139 | ||
140 | /// Find the next `Reject` result. See `next()` | |
141 | #[inline] | |
142 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
143 | loop { | |
144 | match self.next() { | |
145 | SearchStep::Reject(a, b) => return Some((a, b)), | |
146 | SearchStep::Done => return None, | |
147 | _ => continue, | |
148 | } | |
149 | } | |
150 | } | |
151 | } | |
152 | ||
153 | /// A reverse searcher for a string pattern. | |
154 | /// | |
155 | /// This trait provides methods for searching for non-overlapping | |
156 | /// matches of a pattern starting from the back (right) of a string. | |
157 | /// | |
158 | /// It will be implemented by associated `Searcher` | |
159 | /// types of the `Pattern` trait if the pattern supports searching | |
160 | /// for it from the back. | |
161 | /// | |
162 | /// The index ranges returned by this trait are not required | |
163 | /// to exactly match those of the forward search in reverse. | |
164 | /// | |
165 | /// For the reason why this trait is marked unsafe, see them | |
166 | /// parent trait `Searcher`. | |
167 | pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { | |
168 | /// Performs the next search step starting from the back. | |
169 | /// | |
170 | /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. | |
171 | /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the | |
172 | /// pattern, even partially. | |
173 | /// - Returns `Done` if every byte of the haystack has been visited | |
174 | /// | |
175 | /// The stream of `Match` and `Reject` values up to a `Done` | |
176 | /// will contain index ranges that are adjacent, non-overlapping, | |
177 | /// covering the whole haystack, and laying on utf8 boundaries. | |
178 | /// | |
179 | /// A `Match` result needs to contain the whole matched pattern, | |
180 | /// however `Reject` results may be split up into arbitrary | |
181 | /// many adjacent fragments. Both ranges may have zero length. | |
182 | /// | |
183 | /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` | |
184 | /// might produce the stream | |
185 | /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]` | |
186 | fn next_back(&mut self) -> SearchStep; | |
187 | ||
188 | /// Find the next `Match` result. See `next_back()` | |
189 | #[inline] | |
190 | fn next_match_back(&mut self) -> Option<(usize, usize)>{ | |
191 | loop { | |
192 | match self.next_back() { | |
193 | SearchStep::Match(a, b) => return Some((a, b)), | |
194 | SearchStep::Done => return None, | |
195 | _ => continue, | |
196 | } | |
197 | } | |
198 | } | |
199 | ||
200 | /// Find the next `Reject` result. See `next_back()` | |
201 | #[inline] | |
202 | fn next_reject_back(&mut self) -> Option<(usize, usize)>{ | |
203 | loop { | |
204 | match self.next_back() { | |
205 | SearchStep::Reject(a, b) => return Some((a, b)), | |
206 | SearchStep::Done => return None, | |
207 | _ => continue, | |
208 | } | |
209 | } | |
210 | } | |
211 | } | |
212 | ||
213 | /// A marker trait to express that a `ReverseSearcher` | |
214 | /// can be used for a `DoubleEndedIterator` implementation. | |
215 | /// | |
216 | /// For this, the impl of `Searcher` and `ReverseSearcher` need | |
217 | /// to follow these conditions: | |
218 | /// | |
219 | /// - All results of `next()` need to be identical | |
220 | /// to the results of `next_back()` in reverse order. | |
221 | /// - `next()` and `next_back()` need to behave as | |
222 | /// the two ends of a range of values, that is they | |
223 | /// can not "walk past each other". | |
224 | /// | |
225 | /// # Examples | |
226 | /// | |
227 | /// `char::Searcher` is a `DoubleEndedSearcher` because searching for a | |
228 | /// `char` only requires looking at one at a time, which behaves the same | |
229 | /// from both ends. | |
230 | /// | |
231 | /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because | |
232 | /// the pattern `"aa"` in the haystack `"aaa"` matches as either | |
233 | /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. | |
234 | pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} | |
235 | ||
9346a6ac | 236 | ///////////////////////////////////////////////////////////////////////////// |
c34b1796 | 237 | // Impl for a CharEq wrapper |
9346a6ac | 238 | ///////////////////////////////////////////////////////////////////////////// |
c34b1796 AL |
239 | |
240 | #[doc(hidden)] | |
241 | trait CharEq { | |
242 | fn matches(&mut self, char) -> bool; | |
243 | fn only_ascii(&self) -> bool; | |
244 | } | |
245 | ||
246 | impl CharEq for char { | |
247 | #[inline] | |
248 | fn matches(&mut self, c: char) -> bool { *self == c } | |
249 | ||
250 | #[inline] | |
251 | fn only_ascii(&self) -> bool { (*self as u32) < 128 } | |
252 | } | |
253 | ||
254 | impl<F> CharEq for F where F: FnMut(char) -> bool { | |
255 | #[inline] | |
256 | fn matches(&mut self, c: char) -> bool { (*self)(c) } | |
257 | ||
258 | #[inline] | |
259 | fn only_ascii(&self) -> bool { false } | |
260 | } | |
261 | ||
262 | impl<'a> CharEq for &'a [char] { | |
263 | #[inline] | |
264 | fn matches(&mut self, c: char) -> bool { | |
265 | self.iter().any(|&m| { let mut m = m; m.matches(c) }) | |
266 | } | |
267 | ||
268 | #[inline] | |
269 | fn only_ascii(&self) -> bool { | |
270 | self.iter().all(|m| m.only_ascii()) | |
271 | } | |
272 | } | |
273 | ||
274 | struct CharEqPattern<C: CharEq>(C); | |
275 | ||
9346a6ac | 276 | #[derive(Clone)] |
c34b1796 AL |
277 | struct CharEqSearcher<'a, C: CharEq> { |
278 | char_eq: C, | |
279 | haystack: &'a str, | |
280 | char_indices: super::CharIndices<'a>, | |
281 | #[allow(dead_code)] | |
282 | ascii_only: bool, | |
283 | } | |
284 | ||
285 | impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> { | |
286 | type Searcher = CharEqSearcher<'a, C>; | |
287 | ||
288 | #[inline] | |
289 | fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { | |
290 | CharEqSearcher { | |
291 | ascii_only: self.0.only_ascii(), | |
292 | haystack: haystack, | |
293 | char_eq: self.0, | |
294 | char_indices: haystack.char_indices(), | |
295 | } | |
296 | } | |
297 | } | |
298 | ||
299 | unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { | |
300 | #[inline] | |
301 | fn haystack(&self) -> &'a str { | |
302 | self.haystack | |
303 | } | |
304 | ||
305 | #[inline] | |
306 | fn next(&mut self) -> SearchStep { | |
307 | let s = &mut self.char_indices; | |
308 | // Compare lengths of the internal byte slice iterator | |
309 | // to find length of current char | |
310 | let (pre_len, _) = s.iter.iter.size_hint(); | |
311 | if let Some((i, c)) = s.next() { | |
312 | let (len, _) = s.iter.iter.size_hint(); | |
313 | let char_len = pre_len - len; | |
314 | if self.char_eq.matches(c) { | |
315 | return SearchStep::Match(i, i + char_len); | |
316 | } else { | |
317 | return SearchStep::Reject(i, i + char_len); | |
318 | } | |
319 | } | |
320 | SearchStep::Done | |
321 | } | |
322 | } | |
323 | ||
324 | unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { | |
325 | #[inline] | |
326 | fn next_back(&mut self) -> SearchStep { | |
327 | let s = &mut self.char_indices; | |
328 | // Compare lengths of the internal byte slice iterator | |
329 | // to find length of current char | |
330 | let (pre_len, _) = s.iter.iter.size_hint(); | |
331 | if let Some((i, c)) = s.next_back() { | |
332 | let (len, _) = s.iter.iter.size_hint(); | |
333 | let char_len = pre_len - len; | |
334 | if self.char_eq.matches(c) { | |
335 | return SearchStep::Match(i, i + char_len); | |
336 | } else { | |
337 | return SearchStep::Reject(i, i + char_len); | |
338 | } | |
339 | } | |
340 | SearchStep::Done | |
341 | } | |
342 | } | |
343 | ||
344 | impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} | |
345 | ||
9346a6ac AL |
346 | ///////////////////////////////////////////////////////////////////////////// |
347 | ||
348 | macro_rules! pattern_methods { | |
349 | ($t:ty, $pmap:expr, $smap:expr) => { | |
350 | type Searcher = $t; | |
351 | ||
352 | #[inline] | |
353 | fn into_searcher(self, haystack: &'a str) -> $t { | |
354 | ($smap)(($pmap)(self).into_searcher(haystack)) | |
c34b1796 | 355 | } |
9346a6ac | 356 | |
c34b1796 AL |
357 | #[inline] |
358 | fn is_contained_in(self, haystack: &'a str) -> bool { | |
9346a6ac | 359 | ($pmap)(self).is_contained_in(haystack) |
c34b1796 | 360 | } |
9346a6ac | 361 | |
c34b1796 AL |
362 | #[inline] |
363 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
9346a6ac | 364 | ($pmap)(self).is_prefix_of(haystack) |
c34b1796 | 365 | } |
9346a6ac | 366 | |
c34b1796 AL |
367 | #[inline] |
368 | fn is_suffix_of(self, haystack: &'a str) -> bool | |
9346a6ac | 369 | where $t: ReverseSearcher<'a> |
c34b1796 | 370 | { |
9346a6ac | 371 | ($pmap)(self).is_suffix_of(haystack) |
c34b1796 AL |
372 | } |
373 | } | |
374 | } | |
375 | ||
9346a6ac AL |
376 | macro_rules! searcher_methods { |
377 | (forward) => { | |
378 | #[inline] | |
379 | fn haystack(&self) -> &'a str { | |
380 | self.0.haystack() | |
381 | } | |
382 | #[inline] | |
383 | fn next(&mut self) -> SearchStep { | |
384 | self.0.next() | |
385 | } | |
386 | #[inline] | |
387 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
388 | self.0.next_match() | |
389 | } | |
390 | #[inline] | |
391 | fn next_reject(&mut self) -> Option<(usize, usize)> { | |
392 | self.0.next_reject() | |
393 | } | |
394 | }; | |
395 | (reverse) => { | |
396 | #[inline] | |
397 | fn next_back(&mut self) -> SearchStep { | |
398 | self.0.next_back() | |
399 | } | |
400 | #[inline] | |
401 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
402 | self.0.next_match_back() | |
403 | } | |
404 | #[inline] | |
405 | fn next_reject_back(&mut self) -> Option<(usize, usize)> { | |
406 | self.0.next_reject_back() | |
407 | } | |
408 | } | |
c34b1796 AL |
409 | } |
410 | ||
9346a6ac AL |
411 | ///////////////////////////////////////////////////////////////////////////// |
412 | // Impl for char | |
413 | ///////////////////////////////////////////////////////////////////////////// | |
414 | ||
415 | /// Associated type for `<char as Pattern<'a>>::Searcher`. | |
416 | #[derive(Clone)] | |
417 | pub struct CharSearcher<'a>(<CharEqPattern<char> as Pattern<'a>>::Searcher); | |
c34b1796 AL |
418 | |
419 | unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { | |
9346a6ac | 420 | searcher_methods!(forward); |
c34b1796 | 421 | } |
9346a6ac | 422 | |
c34b1796 | 423 | unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { |
9346a6ac | 424 | searcher_methods!(reverse); |
c34b1796 | 425 | } |
c34b1796 | 426 | |
9346a6ac | 427 | impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} |
c34b1796 | 428 | |
9346a6ac AL |
429 | /// Searches for chars that are equal to a given char |
430 | impl<'a> Pattern<'a> for char { | |
431 | pattern_methods!(CharSearcher<'a>, CharEqPattern, CharSearcher); | |
c34b1796 AL |
432 | } |
433 | ||
9346a6ac AL |
434 | ///////////////////////////////////////////////////////////////////////////// |
435 | // Impl for &[char] | |
436 | ///////////////////////////////////////////////////////////////////////////// | |
437 | ||
438 | // Todo: Change / Remove due to ambiguity in meaning. | |
439 | ||
440 | /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. | |
441 | #[derive(Clone)] | |
442 | pub struct CharSliceSearcher<'a, 'b>(<CharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); | |
c34b1796 AL |
443 | |
444 | unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { | |
9346a6ac | 445 | searcher_methods!(forward); |
c34b1796 | 446 | } |
9346a6ac | 447 | |
c34b1796 | 448 | unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { |
9346a6ac | 449 | searcher_methods!(reverse); |
c34b1796 | 450 | } |
c34b1796 | 451 | |
9346a6ac | 452 | impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} |
c34b1796 | 453 | |
9346a6ac AL |
454 | /// Searches for chars that are equal to any of the chars in the array |
455 | impl<'a, 'b> Pattern<'a> for &'b [char] { | |
456 | pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher); | |
c34b1796 AL |
457 | } |
458 | ||
9346a6ac AL |
459 | ///////////////////////////////////////////////////////////////////////////// |
460 | // Impl for F: FnMut(char) -> bool | |
461 | ///////////////////////////////////////////////////////////////////////////// | |
462 | ||
463 | /// Associated type for `<F as Pattern<'a>>::Searcher`. | |
464 | #[derive(Clone)] | |
465 | pub struct CharPredicateSearcher<'a, F>(<CharEqPattern<F> as Pattern<'a>>::Searcher) | |
466 | where F: FnMut(char) -> bool; | |
c34b1796 | 467 | |
9346a6ac | 468 | unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> |
c34b1796 AL |
469 | where F: FnMut(char) -> bool |
470 | { | |
9346a6ac | 471 | searcher_methods!(forward); |
c34b1796 | 472 | } |
9346a6ac AL |
473 | |
474 | unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> | |
c34b1796 AL |
475 | where F: FnMut(char) -> bool |
476 | { | |
9346a6ac | 477 | searcher_methods!(reverse); |
c34b1796 | 478 | } |
c34b1796 | 479 | |
9346a6ac AL |
480 | impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> |
481 | where F: FnMut(char) -> bool {} | |
c34b1796 | 482 | |
9346a6ac AL |
483 | /// Searches for chars that match the given predicate |
484 | impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { | |
485 | pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher); | |
486 | } | |
487 | ||
488 | ///////////////////////////////////////////////////////////////////////////// | |
489 | // Impl for &&str | |
490 | ///////////////////////////////////////////////////////////////////////////// | |
491 | ||
492 | /// Delegates to the `&str` impl. | |
c34b1796 | 493 | impl<'a, 'b> Pattern<'a> for &'b &'b str { |
9346a6ac | 494 | pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); |
c34b1796 | 495 | } |
c1a9b12d SL |
496 | |
497 | ///////////////////////////////////////////////////////////////////////////// | |
498 | // Impl for &str | |
499 | ///////////////////////////////////////////////////////////////////////////// | |
500 | ||
501 | /// Non-allocating substring search. | |
502 | /// | |
503 | /// Will handle the pattern `""` as returning empty matches at each character | |
504 | /// boundary. | |
505 | impl<'a, 'b> Pattern<'a> for &'b str { | |
506 | type Searcher = StrSearcher<'a, 'b>; | |
507 | ||
508 | #[inline] | |
509 | fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { | |
510 | StrSearcher::new(haystack, self) | |
511 | } | |
512 | ||
513 | /// Checks whether the pattern matches at the front of the haystack | |
514 | #[inline] | |
515 | fn is_prefix_of(self, haystack: &'a str) -> bool { | |
516 | haystack.is_char_boundary(self.len()) && | |
517 | self == &haystack[..self.len()] | |
518 | } | |
519 | ||
520 | /// Checks whether the pattern matches at the back of the haystack | |
521 | #[inline] | |
522 | fn is_suffix_of(self, haystack: &'a str) -> bool { | |
523 | self.len() <= haystack.len() && | |
524 | haystack.is_char_boundary(haystack.len() - self.len()) && | |
525 | self == &haystack[haystack.len() - self.len()..] | |
526 | } | |
527 | } | |
528 | ||
529 | ||
530 | ///////////////////////////////////////////////////////////////////////////// | |
531 | // Two Way substring searcher | |
532 | ///////////////////////////////////////////////////////////////////////////// | |
533 | ||
534 | #[derive(Clone, Debug)] | |
535 | /// Associated type for `<&str as Pattern<'a>>::Searcher`. | |
536 | pub struct StrSearcher<'a, 'b> { | |
537 | haystack: &'a str, | |
538 | needle: &'b str, | |
539 | ||
540 | searcher: StrSearcherImpl, | |
541 | } | |
542 | ||
543 | #[derive(Clone, Debug)] | |
544 | enum StrSearcherImpl { | |
545 | Empty(EmptyNeedle), | |
546 | TwoWay(TwoWaySearcher), | |
547 | } | |
548 | ||
549 | #[derive(Clone, Debug)] | |
550 | struct EmptyNeedle { | |
551 | position: usize, | |
552 | end: usize, | |
553 | is_match_fw: bool, | |
554 | is_match_bw: bool, | |
555 | } | |
556 | ||
557 | impl<'a, 'b> StrSearcher<'a, 'b> { | |
558 | fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { | |
559 | if needle.is_empty() { | |
560 | StrSearcher { | |
561 | haystack: haystack, | |
562 | needle: needle, | |
563 | searcher: StrSearcherImpl::Empty(EmptyNeedle { | |
564 | position: 0, | |
565 | end: haystack.len(), | |
566 | is_match_fw: true, | |
567 | is_match_bw: true, | |
568 | }), | |
569 | } | |
570 | } else { | |
571 | StrSearcher { | |
572 | haystack: haystack, | |
573 | needle: needle, | |
574 | searcher: StrSearcherImpl::TwoWay( | |
575 | TwoWaySearcher::new(needle.as_bytes(), haystack.len()) | |
576 | ), | |
577 | } | |
578 | } | |
579 | } | |
580 | } | |
581 | ||
582 | unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { | |
583 | fn haystack(&self) -> &'a str { self.haystack } | |
584 | ||
585 | #[inline] | |
586 | fn next(&mut self) -> SearchStep { | |
587 | match self.searcher { | |
588 | StrSearcherImpl::Empty(ref mut searcher) => { | |
589 | // empty needle rejects every char and matches every empty string between them | |
590 | let is_match = searcher.is_match_fw; | |
591 | searcher.is_match_fw = !searcher.is_match_fw; | |
592 | let pos = searcher.position; | |
593 | match self.haystack[pos..].chars().next() { | |
594 | _ if is_match => SearchStep::Match(pos, pos), | |
595 | None => SearchStep::Done, | |
596 | Some(ch) => { | |
597 | searcher.position += ch.len_utf8(); | |
598 | SearchStep::Reject(pos, searcher.position) | |
599 | } | |
600 | } | |
601 | } | |
602 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
603 | // TwoWaySearcher produces valid *Match* indices that split at char boundaries | |
604 | // as long as it does correct matching and that haystack and needle are | |
605 | // valid UTF-8 | |
606 | // *Rejects* from the algorithm can fall on any indices, but we will walk them | |
607 | // manually to the next character boundary, so that they are utf-8 safe. | |
608 | if searcher.position == self.haystack.len() { | |
609 | return SearchStep::Done; | |
610 | } | |
611 | let is_long = searcher.memory == usize::MAX; | |
612 | match searcher.next::<RejectAndMatch>(self.haystack.as_bytes(), | |
613 | self.needle.as_bytes(), | |
614 | is_long) | |
615 | { | |
616 | SearchStep::Reject(a, mut b) => { | |
617 | // skip to next char boundary | |
618 | while !self.haystack.is_char_boundary(b) { | |
619 | b += 1; | |
620 | } | |
621 | searcher.position = cmp::max(b, searcher.position); | |
622 | SearchStep::Reject(a, b) | |
623 | } | |
624 | otherwise => otherwise, | |
625 | } | |
626 | } | |
627 | } | |
628 | } | |
629 | ||
630 | #[inline(always)] | |
631 | fn next_match(&mut self) -> Option<(usize, usize)> { | |
632 | match self.searcher { | |
633 | StrSearcherImpl::Empty(..) => { | |
634 | loop { | |
635 | match self.next() { | |
636 | SearchStep::Match(a, b) => return Some((a, b)), | |
637 | SearchStep::Done => return None, | |
638 | SearchStep::Reject(..) => { } | |
639 | } | |
640 | } | |
641 | } | |
642 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
643 | let is_long = searcher.memory == usize::MAX; | |
644 | if is_long { | |
645 | searcher.next::<MatchOnly>(self.haystack.as_bytes(), | |
646 | self.needle.as_bytes(), | |
647 | true) | |
648 | } else { | |
649 | searcher.next::<MatchOnly>(self.haystack.as_bytes(), | |
650 | self.needle.as_bytes(), | |
651 | false) | |
652 | } | |
653 | } | |
654 | } | |
655 | } | |
656 | ||
657 | } | |
658 | unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { | |
659 | #[inline] | |
660 | fn next_back(&mut self) -> SearchStep { | |
661 | match self.searcher { | |
662 | StrSearcherImpl::Empty(ref mut searcher) => { | |
663 | let is_match = searcher.is_match_bw; | |
664 | searcher.is_match_bw = !searcher.is_match_bw; | |
665 | let end = searcher.end; | |
666 | match self.haystack[..end].chars().next_back() { | |
667 | _ if is_match => SearchStep::Match(end, end), | |
668 | None => SearchStep::Done, | |
669 | Some(ch) => { | |
670 | searcher.end -= ch.len_utf8(); | |
671 | SearchStep::Reject(searcher.end, end) | |
672 | } | |
673 | } | |
674 | } | |
675 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
676 | if searcher.end == 0 { | |
677 | return SearchStep::Done; | |
678 | } | |
679 | match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(), | |
680 | self.needle.as_bytes()) | |
681 | { | |
682 | SearchStep::Reject(mut a, b) => { | |
683 | // skip to next char boundary | |
684 | while !self.haystack.is_char_boundary(a) { | |
685 | a -= 1; | |
686 | } | |
687 | searcher.end = cmp::min(a, searcher.end); | |
688 | SearchStep::Reject(a, b) | |
689 | } | |
690 | otherwise => otherwise, | |
691 | } | |
692 | } | |
693 | } | |
694 | } | |
695 | ||
696 | #[inline] | |
697 | fn next_match_back(&mut self) -> Option<(usize, usize)> { | |
698 | match self.searcher { | |
699 | StrSearcherImpl::Empty(..) => { | |
700 | loop { | |
701 | match self.next_back() { | |
702 | SearchStep::Match(a, b) => return Some((a, b)), | |
703 | SearchStep::Done => return None, | |
704 | SearchStep::Reject(..) => { } | |
705 | } | |
706 | } | |
707 | } | |
708 | StrSearcherImpl::TwoWay(ref mut searcher) => { | |
709 | searcher.next_back::<MatchOnly>(self.haystack.as_bytes(), | |
710 | self.needle.as_bytes()) | |
711 | } | |
712 | } | |
713 | } | |
714 | } | |
715 | ||
716 | /// The internal state of an iterator that searches for matches of a substring | |
717 | /// within a larger string using two-way search | |
718 | #[derive(Clone, Debug)] | |
719 | struct TwoWaySearcher { | |
720 | // constants | |
721 | crit_pos: usize, | |
722 | period: usize, | |
723 | byteset: u64, | |
724 | ||
725 | // variables | |
726 | position: usize, | |
727 | end: usize, | |
728 | memory: usize | |
729 | } | |
730 | ||
731 | /* | |
732 | This is the Two-Way search algorithm, which was introduced in the paper: | |
733 | Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. | |
734 | ||
735 | Here's some background information. | |
736 | ||
737 | A *word* is a string of symbols. The *length* of a word should be a familiar | |
738 | notion, and here we denote it for any word x by |x|. | |
739 | (We also allow for the possibility of the *empty word*, a word of length zero). | |
740 | ||
741 | If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a | |
742 | *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. | |
743 | For example, both 1 and 2 are periods for the string "aa". As another example, | |
744 | the only period of the string "abcd" is 4. | |
745 | ||
746 | We denote by period(x) the *smallest* period of x (provided that x is non-empty). | |
747 | This is always well-defined since every non-empty word x has at least one period, | |
748 | |x|. We sometimes call this *the period* of x. | |
749 | ||
750 | If u, v and x are words such that x = uv, where uv is the concatenation of u and | |
751 | v, then we say that (u, v) is a *factorization* of x. | |
752 | ||
753 | Let (u, v) be a factorization for a word x. Then if w is a non-empty word such | |
754 | that both of the following hold | |
755 | ||
756 | - either w is a suffix of u or u is a suffix of w | |
757 | - either w is a prefix of v or v is a prefix of w | |
758 | ||
759 | then w is said to be a *repetition* for the factorization (u, v). | |
760 | ||
761 | Just to unpack this, there are four possibilities here. Let w = "abc". Then we | |
762 | might have: | |
763 | ||
764 | - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") | |
765 | - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") | |
766 | - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") | |
767 | - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") | |
768 | ||
769 | Note that the word vu is a repetition for any factorization (u,v) of x = uv, | |
770 | so every factorization has at least one repetition. | |
771 | ||
772 | If x is a string and (u, v) is a factorization for x, then a *local period* for | |
773 | (u, v) is an integer r such that there is some word w such that |w| = r and w is | |
774 | a repetition for (u, v). | |
775 | ||
776 | We denote by local_period(u, v) the smallest local period of (u, v). We sometimes | |
777 | call this *the local period* of (u, v). Provided that x = uv is non-empty, this | |
778 | is well-defined (because each non-empty word has at least one factorization, as | |
779 | noted above). | |
780 | ||
781 | It can be proven that the following is an equivalent definition of a local period | |
782 | for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for | |
783 | all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are | |
784 | defined. (i.e. i > 0 and i + r < |x|). | |
785 | ||
786 | Using the above reformulation, it is easy to prove that | |
787 | ||
788 | 1 <= local_period(u, v) <= period(uv) | |
789 | ||
790 | A factorization (u, v) of x such that local_period(u,v) = period(x) is called a | |
791 | *critical factorization*. | |
792 | ||
793 | The algorithm hinges on the following theorem, which is stated without proof: | |
794 | ||
795 | **Critical Factorization Theorem** Any word x has at least one critical | |
796 | factorization (u, v) such that |u| < period(x). | |
797 | ||
798 | The purpose of maximal_suffix is to find such a critical factorization. | |
799 | ||
800 | */ | |
801 | impl TwoWaySearcher { | |
802 | fn new(needle: &[u8], end: usize) -> TwoWaySearcher { | |
803 | let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); | |
804 | let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); | |
805 | ||
806 | let (crit_pos, period) = | |
807 | if crit_pos_false > crit_pos_true { | |
808 | (crit_pos_false, period_false) | |
809 | } else { | |
810 | (crit_pos_true, period_true) | |
811 | }; | |
812 | ||
813 | // This isn't in the original algorithm, as far as I'm aware. | |
814 | let byteset = needle.iter() | |
815 | .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); | |
816 | ||
817 | // A particularly readable explanation of what's going on here can be found | |
818 | // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically | |
819 | // see the code for "Algorithm CP" on p. 323. | |
820 | // | |
821 | // What's going on is we have some critical factorization (u, v) of the | |
822 | // needle, and we want to determine whether u is a suffix of | |
823 | // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use | |
824 | // "Algorithm CP2", which is optimized for when the period of the needle | |
825 | // is large. | |
826 | if &needle[..crit_pos] == &needle[period.. period + crit_pos] { | |
827 | // short period case | |
828 | TwoWaySearcher { | |
829 | crit_pos: crit_pos, | |
830 | period: period, | |
831 | byteset: byteset, | |
832 | ||
833 | position: 0, | |
834 | end: end, | |
835 | memory: 0 | |
836 | } | |
837 | } else { | |
838 | // long period case | |
839 | // we have an approximation to the actual period, and don't use memory. | |
840 | TwoWaySearcher { | |
841 | crit_pos: crit_pos, | |
842 | period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, | |
843 | byteset: byteset, | |
844 | ||
845 | position: 0, | |
846 | end: end, | |
847 | memory: usize::MAX // Dummy value to signify that the period is long | |
848 | } | |
849 | } | |
850 | } | |
851 | ||
852 | #[inline(always)] | |
853 | fn byteset_contains(&self, byte: u8) -> bool { | |
854 | (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 | |
855 | } | |
856 | ||
857 | // One of the main ideas of Two-Way is that we factorize the needle into | |
858 | // two halves, (u, v), and begin trying to find v in the haystack by scanning | |
859 | // left to right. If v matches, we try to match u by scanning right to left. | |
860 | // How far we can jump when we encounter a mismatch is all based on the fact | |
861 | // that (u, v) is a critical factorization for the needle. | |
862 | #[inline(always)] | |
863 | fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) | |
864 | -> S::Output | |
865 | where S: TwoWayStrategy | |
866 | { | |
867 | // `next()` uses `self.position` as its cursor | |
868 | let old_pos = self.position; | |
869 | 'search: loop { | |
870 | // Check that we have room to search in | |
871 | if needle.len() > haystack.len() - self.position { | |
872 | self.position = haystack.len(); | |
873 | return S::rejecting(old_pos, self.position); | |
874 | } | |
875 | ||
876 | if S::use_early_reject() && old_pos != self.position { | |
877 | return S::rejecting(old_pos, self.position); | |
878 | } | |
879 | ||
880 | // Quickly skip by large portions unrelated to our substring | |
881 | if !self.byteset_contains(haystack[self.position + needle.len() - 1]) { | |
882 | self.position += needle.len(); | |
883 | if !long_period { | |
884 | self.memory = 0; | |
885 | } | |
886 | continue 'search; | |
887 | } | |
888 | ||
889 | // See if the right part of the needle matches | |
890 | let start = if long_period { self.crit_pos } | |
891 | else { cmp::max(self.crit_pos, self.memory) }; | |
892 | for i in start..needle.len() { | |
893 | if needle[i] != haystack[self.position + i] { | |
894 | self.position += i - self.crit_pos + 1; | |
895 | if !long_period { | |
896 | self.memory = 0; | |
897 | } | |
898 | continue 'search; | |
899 | } | |
900 | } | |
901 | ||
902 | // See if the left part of the needle matches | |
903 | let start = if long_period { 0 } else { self.memory }; | |
904 | for i in (start..self.crit_pos).rev() { | |
905 | if needle[i] != haystack[self.position + i] { | |
906 | self.position += self.period; | |
907 | if !long_period { | |
908 | self.memory = needle.len() - self.period; | |
909 | } | |
910 | continue 'search; | |
911 | } | |
912 | } | |
913 | ||
914 | // We have found a match! | |
915 | let match_pos = self.position; | |
916 | ||
917 | // Note: add self.period instead of needle.len() to have overlapping matches | |
918 | self.position += needle.len(); | |
919 | if !long_period { | |
920 | self.memory = 0; // set to needle.len() - self.period for overlapping matches | |
921 | } | |
922 | ||
923 | return S::matching(match_pos, match_pos + needle.len()); | |
924 | } | |
925 | } | |
926 | ||
927 | // Follows the ideas in `next()`. | |
928 | // | |
929 | // All the definitions are completely symmetrical, with period(x) = period(reverse(x)) | |
930 | // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) | |
931 | // is a critical factorization, so is (reverse(v), reverse(u)). Similarly, | |
932 | // the "period" stored in self.period is the real period if long_period is | |
933 | // false, and so is still valid for a reversed needle, and if long_period is | |
934 | // true, all the algorithm requires is that self.period is less than or | |
935 | // equal to the real period, which must be true for the forward case anyway. | |
936 | // | |
937 | // To search in reverse through the haystack, we search forward through | |
938 | // a reversed haystack with a reversed needle, and the above paragraph shows | |
939 | // that the precomputed parameters can be left alone. | |
940 | #[inline] | |
941 | fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8]) | |
942 | -> S::Output | |
943 | where S: TwoWayStrategy | |
944 | { | |
945 | // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` | |
946 | // are independent. | |
947 | let old_end = self.end; | |
948 | 'search: loop { | |
949 | // Check that we have room to search in | |
950 | if needle.len() > self.end { | |
951 | self.end = 0; | |
952 | return S::rejecting(0, old_end); | |
953 | } | |
954 | ||
955 | if S::use_early_reject() && old_end != self.end { | |
956 | return S::rejecting(self.end, old_end); | |
957 | } | |
958 | ||
959 | // Quickly skip by large portions unrelated to our substring | |
960 | if !self.byteset_contains(haystack[self.end - needle.len()]) { | |
961 | self.end -= needle.len(); | |
962 | continue 'search; | |
963 | } | |
964 | ||
965 | // See if the left part of the needle matches | |
966 | for i in (0..self.crit_pos).rev() { | |
967 | if needle[i] != haystack[self.end - needle.len() + i] { | |
968 | self.end -= self.crit_pos - i; | |
969 | continue 'search; | |
970 | } | |
971 | } | |
972 | ||
973 | // See if the right part of the needle matches | |
974 | for i in self.crit_pos..needle.len() { | |
975 | if needle[i] != haystack[self.end - needle.len() + i] { | |
976 | self.end -= self.period; | |
977 | continue 'search; | |
978 | } | |
979 | } | |
980 | ||
981 | // We have found a match! | |
982 | let match_pos = self.end - needle.len(); | |
983 | // Note: sub self.period instead of needle.len() to have overlapping matches | |
984 | self.end -= needle.len(); | |
985 | ||
986 | return S::matching(match_pos, match_pos + needle.len()); | |
987 | } | |
988 | } | |
989 | ||
990 | // Computes a critical factorization (u, v) of `arr`. | |
991 | // Specifically, returns (i, p), where i is the starting index of v in some | |
992 | // critical factorization (u, v) and p = period(v) | |
993 | #[inline] | |
994 | fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { | |
995 | let mut left: usize = !0; // Corresponds to i in the paper | |
996 | let mut right = 0; // Corresponds to j in the paper | |
997 | let mut offset = 1; // Corresponds to k in the paper | |
998 | let mut period = 1; // Corresponds to p in the paper | |
999 | ||
1000 | while right + offset < arr.len() { | |
1001 | let a; | |
1002 | let b; | |
1003 | if reversed { | |
1004 | a = arr[left.wrapping_add(offset)]; | |
1005 | b = arr[right + offset]; | |
1006 | } else { | |
1007 | a = arr[right + offset]; | |
1008 | b = arr[left.wrapping_add(offset)]; | |
1009 | } | |
1010 | if a < b { | |
1011 | // Suffix is smaller, period is entire prefix so far. | |
1012 | right += offset; | |
1013 | offset = 1; | |
1014 | period = right.wrapping_sub(left); | |
1015 | } else if a == b { | |
1016 | // Advance through repetition of the current period. | |
1017 | if offset == period { | |
1018 | right += offset; | |
1019 | offset = 1; | |
1020 | } else { | |
1021 | offset += 1; | |
1022 | } | |
1023 | } else { | |
1024 | // Suffix is larger, start over from current location. | |
1025 | left = right; | |
1026 | right += 1; | |
1027 | offset = 1; | |
1028 | period = 1; | |
1029 | } | |
1030 | } | |
1031 | (left.wrapping_add(1), period) | |
1032 | } | |
1033 | } | |
1034 | ||
1035 | // TwoWayStrategy allows the algorithm to either skip non-matches as quickly | |
1036 | // as possible, or to work in a mode where it emits Rejects relatively quickly. | |
1037 | trait TwoWayStrategy { | |
1038 | type Output; | |
1039 | fn use_early_reject() -> bool; | |
1040 | fn rejecting(usize, usize) -> Self::Output; | |
1041 | fn matching(usize, usize) -> Self::Output; | |
1042 | } | |
1043 | ||
1044 | /// Skip to match intervals as quickly as possible | |
1045 | enum MatchOnly { } | |
1046 | ||
1047 | impl TwoWayStrategy for MatchOnly { | |
1048 | type Output = Option<(usize, usize)>; | |
1049 | ||
1050 | #[inline] | |
1051 | fn use_early_reject() -> bool { false } | |
1052 | #[inline] | |
1053 | fn rejecting(_a: usize, _b: usize) -> Self::Output { None } | |
1054 | #[inline] | |
1055 | fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) } | |
1056 | } | |
1057 | ||
1058 | /// Emit Rejects regularly | |
1059 | enum RejectAndMatch { } | |
1060 | ||
1061 | impl TwoWayStrategy for RejectAndMatch { | |
1062 | type Output = SearchStep; | |
1063 | ||
1064 | #[inline] | |
1065 | fn use_early_reject() -> bool { true } | |
1066 | #[inline] | |
1067 | fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) } | |
1068 | #[inline] | |
1069 | fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) } | |
1070 | } |