]>
Commit | Line | Data |
---|---|---|
2c00a5a8 XL |
1 | //! Parallel iterator types for [strings][std::str] |
2 | //! | |
3 | //! You will rarely need to interact with this module directly unless you need | |
4 | //! to name one of the iterator types. | |
5 | //! | |
6 | //! Note: [`ParallelString::par_split()`] and [`par_split_terminator()`] | |
7 | //! reference a `Pattern` trait which is not visible outside this crate. | |
8 | //! This trait is intentionally kept private, for use only by Rayon itself. | |
9 | //! It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`. | |
10 | //! | |
11 | //! [`ParallelString::par_split()`]: trait.ParallelString.html#method.par_split | |
12 | //! [`par_split_terminator()`]: trait.ParallelString.html#method.par_split_terminator | |
13 | //! | |
14 | //! [std::str]: https://doc.rust-lang.org/stable/std/str/ | |
15 | ||
16 | use iter::*; | |
17 | use iter::plumbing::*; | |
18 | use split_producer::*; | |
19 | ||
20 | ||
21 | /// Test if a byte is the start of a UTF-8 character. | |
22 | /// (extracted from `str::is_char_boundary`) | |
23 | #[inline] | |
24 | fn is_char_boundary(b: u8) -> bool { | |
25 | // This is bit magic equivalent to: b < 128 || b >= 192 | |
26 | (b as i8) >= -0x40 | |
27 | } | |
28 | ||
29 | /// Find the index of a character boundary near the midpoint. | |
30 | #[inline] | |
31 | fn find_char_midpoint(chars: &str) -> usize { | |
32 | let mid = chars.len() / 2; | |
33 | ||
34 | // We want to split near the midpoint, but we need to find an actual | |
35 | // character boundary. So we look at the raw bytes, first scanning | |
36 | // forward from the midpoint for a boundary, then trying backward. | |
37 | let (left, right) = chars.as_bytes().split_at(mid); | |
38 | right.iter() | |
39 | .cloned() | |
40 | .position(is_char_boundary) | |
41 | .map(|i| mid + i) | |
42 | .or_else(|| left.iter().cloned().rposition(is_char_boundary)) | |
43 | .unwrap_or(0) | |
44 | } | |
45 | ||
46 | ||
47 | /// Parallel extensions for strings. | |
48 | pub trait ParallelString { | |
49 | /// Returns a plain string slice, which is used to implement the rest of | |
50 | /// the parallel methods. | |
51 | fn as_parallel_string(&self) -> &str; | |
52 | ||
53 | /// Returns a parallel iterator over the characters of a string. | |
54 | /// | |
55 | /// # Examples | |
56 | /// | |
57 | /// ``` | |
58 | /// use rayon::prelude::*; | |
59 | /// let max = "hello".par_chars().max_by_key(|c| *c as i32); | |
60 | /// assert_eq!(Some('o'), max); | |
61 | /// ``` | |
62 | fn par_chars(&self) -> Chars { | |
63 | Chars { chars: self.as_parallel_string() } | |
64 | } | |
65 | ||
66 | /// Returns a parallel iterator over substrings separated by a | |
67 | /// given character or predicate, similar to `str::split`. | |
68 | /// | |
69 | /// Note: the `Pattern` trait is private, for use only by Rayon itself. | |
70 | /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`. | |
71 | /// | |
72 | /// # Examples | |
73 | /// | |
74 | /// ``` | |
75 | /// use rayon::prelude::*; | |
76 | /// let total = "1, 2, buckle, 3, 4, door" | |
77 | /// .par_split(',') | |
78 | /// .filter_map(|s| s.trim().parse::<i32>().ok()) | |
79 | /// .sum(); | |
80 | /// assert_eq!(10, total); | |
81 | /// ``` | |
82 | fn par_split<P: Pattern>(&self, separator: P) -> Split<P> { | |
83 | Split::new(self.as_parallel_string(), separator) | |
84 | } | |
85 | ||
86 | /// Returns a parallel iterator over substrings terminated by a | |
87 | /// given character or predicate, similar to `str::split_terminator`. | |
88 | /// It's equivalent to `par_split`, except it doesn't produce an empty | |
89 | /// substring after a trailing terminator. | |
90 | /// | |
91 | /// Note: the `Pattern` trait is private, for use only by Rayon itself. | |
92 | /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`. | |
93 | /// | |
94 | /// # Examples | |
95 | /// | |
96 | /// ``` | |
97 | /// use rayon::prelude::*; | |
98 | /// let parts: Vec<_> = "((1 + 3) * 2)" | |
99 | /// .par_split_terminator(|c| c == '(' || c == ')') | |
100 | /// .collect(); | |
101 | /// assert_eq!(vec!["", "", "1 + 3", " * 2"], parts); | |
102 | /// ``` | |
103 | fn par_split_terminator<P: Pattern>(&self, terminator: P) -> SplitTerminator<P> { | |
104 | SplitTerminator::new(self.as_parallel_string(), terminator) | |
105 | } | |
106 | ||
107 | /// Returns a parallel iterator over the lines of a string, ending with an | |
108 | /// optional carriage return and with a newline (`\r\n` or just `\n`). | |
109 | /// The final line ending is optional, and line endings are not included in | |
110 | /// the output strings. | |
111 | /// | |
112 | /// # Examples | |
113 | /// | |
114 | /// ``` | |
115 | /// use rayon::prelude::*; | |
116 | /// let lengths: Vec<_> = "hello world\nfizbuzz" | |
117 | /// .par_lines() | |
118 | /// .map(|l| l.len()) | |
119 | /// .collect(); | |
120 | /// assert_eq!(vec![11, 7], lengths); | |
121 | /// ``` | |
122 | fn par_lines(&self) -> Lines { | |
123 | Lines(self.as_parallel_string()) | |
124 | } | |
125 | ||
126 | /// Returns a parallel iterator over the sub-slices of a string that are | |
127 | /// separated by any amount of whitespace. | |
128 | /// | |
129 | /// As with `str::split_whitespace`, 'whitespace' is defined according to | |
130 | /// the terms of the Unicode Derived Core Property `White_Space`. | |
131 | /// | |
132 | /// # Examples | |
133 | /// | |
134 | /// ``` | |
135 | /// use rayon::prelude::*; | |
136 | /// let longest = "which is the longest word?" | |
137 | /// .par_split_whitespace() | |
138 | /// .max_by_key(|word| word.len()); | |
139 | /// assert_eq!(Some("longest"), longest); | |
140 | /// ``` | |
141 | fn par_split_whitespace(&self) -> SplitWhitespace { | |
142 | SplitWhitespace(self.as_parallel_string()) | |
143 | } | |
144 | } | |
145 | ||
146 | impl ParallelString for str { | |
147 | #[inline] | |
148 | fn as_parallel_string(&self) -> &str { | |
149 | self | |
150 | } | |
151 | } | |
152 | ||
153 | ||
154 | // ///////////////////////////////////////////////////////////////////////// | |
155 | ||
156 | /// We hide the `Pattern` trait in a private module, as its API is not meant | |
157 | /// for general consumption. If we could have privacy on trait items, then it | |
158 | /// would be nicer to have its basic existence and implementors public while | |
159 | /// keeping all of the methods private. | |
160 | mod private { | |
161 | use iter::plumbing::Folder; | |
162 | ||
163 | /// Pattern-matching trait for `ParallelString`, somewhat like a mix of | |
164 | /// `std::str::pattern::{Pattern, Searcher}`. | |
165 | /// | |
166 | /// Implementing this trait is not permitted outside of `rayon`. | |
167 | pub trait Pattern: Sized + Sync + Send { | |
168 | private_decl!{} | |
169 | fn find_in(&self, &str) -> Option<usize>; | |
170 | fn rfind_in(&self, &str) -> Option<usize>; | |
171 | fn is_suffix_of(&self, &str) -> bool; | |
172 | fn fold_with<'ch, F>(&self, &'ch str, folder: F, skip_last: bool) -> F | |
173 | where F: Folder<&'ch str>; | |
174 | } | |
175 | } | |
176 | use self::private::Pattern; | |
177 | ||
178 | impl Pattern for char { | |
179 | private_impl!{} | |
180 | ||
181 | #[inline] | |
182 | fn find_in(&self, chars: &str) -> Option<usize> { | |
183 | chars.find(*self) | |
184 | } | |
185 | ||
186 | #[inline] | |
187 | fn rfind_in(&self, chars: &str) -> Option<usize> { | |
188 | chars.rfind(*self) | |
189 | } | |
190 | ||
191 | #[inline] | |
192 | fn is_suffix_of(&self, chars: &str) -> bool { | |
193 | chars.ends_with(*self) | |
194 | } | |
195 | ||
196 | fn fold_with<'ch, F>(&self, chars: &'ch str, folder: F, skip_last: bool) -> F | |
197 | where F: Folder<&'ch str> | |
198 | { | |
199 | let mut split = chars.split(*self); | |
200 | if skip_last { | |
201 | split.next_back(); | |
202 | } | |
203 | folder.consume_iter(split) | |
204 | } | |
205 | } | |
206 | ||
207 | impl<FN: Sync + Send + Fn(char) -> bool> Pattern for FN { | |
208 | private_impl!{} | |
209 | ||
210 | fn find_in(&self, chars: &str) -> Option<usize> { | |
211 | chars.find(self) | |
212 | } | |
213 | ||
214 | fn rfind_in(&self, chars: &str) -> Option<usize> { | |
215 | chars.rfind(self) | |
216 | } | |
217 | ||
218 | fn is_suffix_of(&self, chars: &str) -> bool { | |
219 | chars.ends_with(self) | |
220 | } | |
221 | ||
222 | fn fold_with<'ch, F>(&self, chars: &'ch str, folder: F, skip_last: bool) -> F | |
223 | where F: Folder<&'ch str> | |
224 | { | |
225 | let mut split = chars.split(self); | |
226 | if skip_last { | |
227 | split.next_back(); | |
228 | } | |
229 | folder.consume_iter(split) | |
230 | } | |
231 | } | |
232 | ||
233 | ||
234 | // ///////////////////////////////////////////////////////////////////////// | |
235 | ||
236 | /// Parallel iterator over the characters of a string | |
237 | #[derive(Debug, Clone)] | |
238 | pub struct Chars<'ch> { | |
239 | chars: &'ch str, | |
240 | } | |
241 | ||
242 | struct CharsProducer<'ch> { | |
243 | chars: &'ch str, | |
244 | } | |
245 | ||
246 | impl<'ch> ParallelIterator for Chars<'ch> { | |
247 | type Item = char; | |
248 | ||
249 | fn drive_unindexed<C>(self, consumer: C) -> C::Result | |
250 | where C: UnindexedConsumer<Self::Item> | |
251 | { | |
252 | bridge_unindexed(CharsProducer { chars: self.chars }, consumer) | |
253 | } | |
254 | } | |
255 | ||
256 | impl<'ch> UnindexedProducer for CharsProducer<'ch> { | |
257 | type Item = char; | |
258 | ||
259 | fn split(mut self) -> (Self, Option<Self>) { | |
260 | let index = find_char_midpoint(self.chars); | |
261 | if index > 0 { | |
262 | let (left, right) = self.chars.split_at(index); | |
263 | self.chars = left; | |
264 | (self, Some(CharsProducer { chars: right })) | |
265 | } else { | |
266 | (self, None) | |
267 | } | |
268 | } | |
269 | ||
270 | fn fold_with<F>(self, folder: F) -> F | |
271 | where F: Folder<Self::Item> | |
272 | { | |
273 | folder.consume_iter(self.chars.chars()) | |
274 | } | |
275 | } | |
276 | ||
277 | ||
278 | // ///////////////////////////////////////////////////////////////////////// | |
279 | ||
280 | /// Parallel iterator over substrings separated by a pattern | |
281 | #[derive(Debug, Clone)] | |
282 | pub struct Split<'ch, P: Pattern> { | |
283 | chars: &'ch str, | |
284 | separator: P, | |
285 | } | |
286 | ||
287 | impl<'ch, P: Pattern> Split<'ch, P> { | |
288 | fn new(chars: &'ch str, separator: P) -> Self { | |
289 | Split { | |
290 | chars: chars, | |
291 | separator: separator, | |
292 | } | |
293 | } | |
294 | } | |
295 | ||
296 | impl<'ch, P: Pattern> ParallelIterator for Split<'ch, P> { | |
297 | type Item = &'ch str; | |
298 | ||
299 | fn drive_unindexed<C>(self, consumer: C) -> C::Result | |
300 | where C: UnindexedConsumer<Self::Item> | |
301 | { | |
302 | let producer = SplitProducer::new(self.chars, &self.separator); | |
303 | bridge_unindexed(producer, consumer) | |
304 | } | |
305 | } | |
306 | ||
307 | /// Implement support for `SplitProducer`. | |
308 | impl<'ch, P: Pattern> Fissile<P> for &'ch str { | |
309 | fn length(&self) -> usize { | |
310 | self.len() | |
311 | } | |
312 | ||
313 | fn midpoint(&self, end: usize) -> usize { | |
314 | // First find a suitable UTF-8 boundary. | |
315 | find_char_midpoint(&self[..end]) | |
316 | } | |
317 | ||
318 | fn find(&self, separator: &P, start: usize, end: usize) -> Option<usize> { | |
319 | separator.find_in(&self[start..end]) | |
320 | } | |
321 | ||
322 | fn rfind(&self, separator: &P, end: usize) -> Option<usize> { | |
323 | separator.rfind_in(&self[..end]) | |
324 | } | |
325 | ||
326 | fn split_once(self, index: usize) -> (Self, Self) { | |
327 | let (left, right) = self.split_at(index); | |
328 | let mut right_iter = right.chars(); | |
329 | right_iter.next(); // skip the separator | |
330 | (left, right_iter.as_str()) | |
331 | } | |
332 | ||
333 | fn fold_splits<F>(self, separator: &P, folder: F, skip_last: bool) -> F | |
334 | where F: Folder<Self> | |
335 | { | |
336 | separator.fold_with(self, folder, skip_last) | |
337 | } | |
338 | } | |
339 | ||
340 | ||
341 | // ///////////////////////////////////////////////////////////////////////// | |
342 | ||
343 | /// Parallel iterator over substrings separated by a terminator pattern | |
344 | #[derive(Debug, Clone)] | |
345 | pub struct SplitTerminator<'ch, P: Pattern> { | |
346 | chars: &'ch str, | |
347 | terminator: P, | |
348 | } | |
349 | ||
350 | struct SplitTerminatorProducer<'ch, 'sep, P: Pattern + 'sep> { | |
351 | splitter: SplitProducer<'sep, P, &'ch str>, | |
352 | skip_last: bool, | |
353 | } | |
354 | ||
355 | impl<'ch, P: Pattern> SplitTerminator<'ch, P> { | |
356 | fn new(chars: &'ch str, terminator: P) -> Self { | |
357 | SplitTerminator { | |
358 | chars: chars, | |
359 | terminator: terminator, | |
360 | } | |
361 | } | |
362 | } | |
363 | ||
364 | impl<'ch, 'sep, P: Pattern + 'sep> SplitTerminatorProducer<'ch, 'sep, P> { | |
365 | fn new(chars: &'ch str, terminator: &'sep P) -> Self { | |
366 | SplitTerminatorProducer { | |
367 | splitter: SplitProducer::new(chars, terminator), | |
368 | skip_last: chars.is_empty() || terminator.is_suffix_of(chars), | |
369 | } | |
370 | } | |
371 | } | |
372 | ||
373 | impl<'ch, P: Pattern> ParallelIterator for SplitTerminator<'ch, P> { | |
374 | type Item = &'ch str; | |
375 | ||
376 | fn drive_unindexed<C>(self, consumer: C) -> C::Result | |
377 | where C: UnindexedConsumer<Self::Item> | |
378 | { | |
379 | let producer = SplitTerminatorProducer::new(self.chars, &self.terminator); | |
380 | bridge_unindexed(producer, consumer) | |
381 | } | |
382 | } | |
383 | ||
384 | impl<'ch, 'sep, P: Pattern + 'sep> UnindexedProducer for SplitTerminatorProducer<'ch, 'sep, P> { | |
385 | type Item = &'ch str; | |
386 | ||
387 | fn split(mut self) -> (Self, Option<Self>) { | |
388 | let (left, right) = self.splitter.split(); | |
389 | self.splitter = left; | |
390 | let right = right.map(|right| { | |
391 | let skip_last = self.skip_last; | |
392 | self.skip_last = false; | |
393 | SplitTerminatorProducer { | |
394 | splitter: right, | |
395 | skip_last: skip_last, | |
396 | } | |
397 | }); | |
398 | (self, right) | |
399 | } | |
400 | ||
401 | fn fold_with<F>(self, folder: F) -> F | |
402 | where F: Folder<Self::Item> | |
403 | { | |
404 | self.splitter.fold_with(folder, self.skip_last) | |
405 | } | |
406 | } | |
407 | ||
408 | ||
409 | // ///////////////////////////////////////////////////////////////////////// | |
410 | ||
411 | /// Parallel iterator over lines in a string | |
412 | #[derive(Debug, Clone)] | |
413 | pub struct Lines<'ch>(&'ch str); | |
414 | ||
415 | impl<'ch> ParallelIterator for Lines<'ch> { | |
416 | type Item = &'ch str; | |
417 | ||
418 | fn drive_unindexed<C>(self, consumer: C) -> C::Result | |
419 | where C: UnindexedConsumer<Self::Item> | |
420 | { | |
421 | self.0 | |
422 | .par_split_terminator('\n') | |
423 | .map(|line| if line.ends_with('\r') { | |
424 | &line[..line.len() - 1] | |
425 | } else { | |
426 | line | |
427 | }) | |
428 | .drive_unindexed(consumer) | |
429 | } | |
430 | } | |
431 | ||
432 | ||
433 | // ///////////////////////////////////////////////////////////////////////// | |
434 | ||
435 | /// Parallel iterator over substrings separated by whitespace | |
436 | #[derive(Debug, Clone)] | |
437 | pub struct SplitWhitespace<'ch>(&'ch str); | |
438 | ||
439 | impl<'ch> ParallelIterator for SplitWhitespace<'ch> { | |
440 | type Item = &'ch str; | |
441 | ||
442 | fn drive_unindexed<C>(self, consumer: C) -> C::Result | |
443 | where C: UnindexedConsumer<Self::Item> | |
444 | { | |
445 | self.0 | |
446 | .par_split(char::is_whitespace) | |
447 | .filter(|string| !string.is_empty()) | |
448 | .drive_unindexed(consumer) | |
449 | } | |
450 | } |