]>
Commit | Line | Data |
---|---|---|
60c5eb7d XL |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use core::cmp; | |
12 | use core::iter::Filter; | |
13 | ||
14 | // All of the logic for forward iteration over sentences | |
15 | mod fwd { | |
136023e0 | 16 | use crate::tables::sentence::SentenceCat; |
60c5eb7d XL |
17 | use core::cmp; |
18 | ||
// One slot of the sentence-break state machine. Every variant names a class
// of already-consumed input, following the rule table at
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
    // Start of text.
    Sot,
    // End of text.
    Eot,
    // Any input that has no dedicated variant below.
    Other,
    CR,
    LF,
    Sep,
    ATerm,
    // Either an Upper or a Lower character.
    UpperLower,
    // A run of one or more Close characters.
    ClosePlus,
    // A run of one or more Sp characters.
    SpPlus,
    STerm,
}
35 | ||
36 | #[derive(Clone, PartialEq, Eq)] | |
37 | struct SentenceBreaksState(pub [StatePart; 4]); | |
38 | ||
39 | const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ | |
40 | StatePart::Sot, | |
41 | StatePart::Sot, | |
42 | StatePart::Sot, | |
43 | StatePart::Sot | |
44 | ]); | |
45 | ||
46 | #[derive(Clone)] | |
47 | pub struct SentenceBreaks<'a> { | |
48 | pub string: &'a str, | |
49 | pos: usize, | |
50 | state: SentenceBreaksState | |
51 | } | |
52 | ||
53 | impl SentenceBreaksState { | |
54 | // Attempt to advance the internal state by one part | |
55 | // Whitespace and some punctutation will be collapsed | |
56 | fn next(&self, cat: SentenceCat) -> SentenceBreaksState { | |
57 | let &SentenceBreaksState(parts) = self; | |
58 | let parts = match (parts[3], cat) { | |
59 | (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, | |
60 | (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, | |
61 | _ => [ | |
62 | parts[1], | |
63 | parts[2], | |
64 | parts[3], | |
65 | match cat { | |
66 | SentenceCat::SC_CR => StatePart::CR, | |
67 | SentenceCat::SC_LF => StatePart::LF, | |
68 | SentenceCat::SC_Sep => StatePart::Sep, | |
69 | SentenceCat::SC_ATerm => StatePart::ATerm, | |
70 | SentenceCat::SC_Upper | | |
71 | SentenceCat::SC_Lower => StatePart::UpperLower, | |
72 | SentenceCat::SC_Close => StatePart::ClosePlus, | |
73 | SentenceCat::SC_Sp => StatePart::SpPlus, | |
74 | SentenceCat::SC_STerm => StatePart::STerm, | |
75 | _ => StatePart::Other | |
76 | } | |
77 | ] | |
78 | }; | |
79 | SentenceBreaksState(parts) | |
80 | } | |
81 | ||
82 | fn end(&self) -> SentenceBreaksState { | |
83 | let &SentenceBreaksState(parts) = self; | |
84 | SentenceBreaksState([ | |
85 | parts[1], | |
86 | parts[2], | |
87 | parts[3], | |
88 | StatePart::Eot | |
89 | ]) | |
90 | } | |
91 | ||
92 | // Helper function to check if state head matches a single `StatePart` | |
93 | fn match1(&self, part: StatePart) -> bool { | |
94 | let &SentenceBreaksState(parts) = self; | |
95 | part == parts[3] | |
96 | } | |
97 | ||
98 | // Helper function to check if first two `StateParts` in state match | |
99 | // the given two | |
100 | fn match2(&self, part1: StatePart, part2: StatePart) -> bool { | |
101 | let &SentenceBreaksState(parts) = self; | |
102 | part1 == parts[2] && part2 == parts[3] | |
103 | } | |
104 | } | |
105 | ||
106 | // https://unicode.org/reports/tr29/#SB8 | |
107 | // TODO cache this, it is currently quadratic | |
108 | fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { | |
109 | let &SentenceBreaksState(parts) = state; | |
110 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; | |
111 | if parts[idx] == StatePart::ClosePlus { idx -= 1 } | |
112 | ||
113 | if parts[idx] == StatePart::ATerm { | |
136023e0 | 114 | use crate::tables::sentence as se; |
60c5eb7d XL |
115 | |
116 | for next_char in ahead.chars() { | |
117 | //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower | |
5869c6ff | 118 | match se::sentence_category(next_char).2 { |
60c5eb7d XL |
119 | se::SC_Lower => return true, |
120 | se::SC_OLetter | | |
121 | se::SC_Upper | | |
122 | se::SC_Sep | se::SC_CR | se::SC_LF | | |
123 | se::SC_STerm | se::SC_ATerm => return false, | |
124 | _ => continue | |
125 | } | |
126 | } | |
127 | } | |
128 | ||
129 | false | |
130 | } | |
131 | ||
132 | // https://unicode.org/reports/tr29/#SB8a | |
133 | fn match_sb8a(state: &SentenceBreaksState) -> bool { | |
134 | // SATerm Close* Sp* | |
135 | let &SentenceBreaksState(parts) = state; | |
136 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; | |
137 | if parts[idx] == StatePart::ClosePlus { idx -= 1 } | |
138 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm | |
139 | } | |
140 | ||
141 | // https://unicode.org/reports/tr29/#SB9 | |
142 | fn match_sb9(state: &SentenceBreaksState) -> bool { | |
143 | // SATerm Close* | |
144 | let &SentenceBreaksState(parts) = state; | |
145 | let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 }; | |
146 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm | |
147 | } | |
148 | ||
149 | // https://unicode.org/reports/tr29/#SB11 | |
150 | fn match_sb11(state: &SentenceBreaksState) -> bool { | |
151 | // SATerm Close* Sp* ParaSep? | |
152 | let &SentenceBreaksState(parts) = state; | |
153 | let mut idx = match parts[3] { | |
154 | StatePart::Sep | | |
155 | StatePart::CR | | |
156 | StatePart::LF => 2, | |
157 | _ => 3 | |
158 | }; | |
159 | ||
160 | if parts[idx] == StatePart::SpPlus { idx -= 1 } | |
161 | if parts[idx] == StatePart::ClosePlus { idx -= 1} | |
162 | ||
163 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm | |
164 | } | |
165 | ||
166 | impl<'a> Iterator for SentenceBreaks<'a> { | |
167 | // Returns the index of the character which follows a break | |
168 | type Item = usize; | |
169 | ||
170 | #[inline] | |
171 | fn size_hint(&self) -> (usize, Option<usize>) { | |
172 | let slen = self.string.len(); | |
173 | // A sentence could be one character | |
174 | (cmp::min(slen, 2), Some(slen + 1)) | |
175 | } | |
176 | ||
177 | #[inline] | |
178 | fn next(&mut self) -> Option<usize> { | |
136023e0 | 179 | use crate::tables::sentence as se; |
60c5eb7d XL |
180 | |
181 | for next_char in self.string[self.pos..].chars() { | |
182 | let position_before = self.pos; | |
183 | let state_before = self.state.clone(); | |
184 | ||
5869c6ff | 185 | let next_cat = se::sentence_category(next_char).2; |
60c5eb7d XL |
186 | |
187 | self.pos += next_char.len_utf8(); | |
188 | self.state = self.state.next(next_cat); | |
189 | ||
190 | match next_cat { | |
191 | // SB1 https://unicode.org/reports/tr29/#SB1 | |
192 | _ if state_before.match1(StatePart::Sot) => | |
193 | return Some(position_before), | |
194 | ||
195 | // SB2 is handled when inner iterator (chars) is finished | |
196 | ||
197 | // SB3 https://unicode.org/reports/tr29/#SB3 | |
198 | SentenceCat::SC_LF if state_before.match1(StatePart::CR) => | |
199 | continue, | |
200 | ||
201 | // SB4 https://unicode.org/reports/tr29/#SB4 | |
202 | _ if state_before.match1(StatePart::Sep) | |
203 | || state_before.match1(StatePart::CR) | |
204 | || state_before.match1(StatePart::LF) | |
205 | => return Some(position_before), | |
206 | ||
207 | // SB5 https://unicode.org/reports/tr29/#SB5 | |
208 | SentenceCat::SC_Extend | | |
209 | SentenceCat::SC_Format => self.state = state_before, | |
210 | ||
211 | // SB6 https://unicode.org/reports/tr29/#SB6 | |
212 | SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => | |
213 | continue, | |
214 | ||
215 | // SB7 https://unicode.org/reports/tr29/#SB7 | |
216 | SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => | |
217 | continue, | |
218 | ||
219 | // SB8 https://unicode.org/reports/tr29/#SB8 | |
220 | _ if match_sb8(&state_before, &self.string[position_before..]) => | |
221 | continue, | |
222 | ||
223 | // SB8a https://unicode.org/reports/tr29/#SB8a | |
224 | SentenceCat::SC_SContinue | | |
225 | SentenceCat::SC_STerm | | |
226 | SentenceCat::SC_ATerm if match_sb8a(&state_before) => | |
227 | continue, | |
228 | ||
229 | // SB9 https://unicode.org/reports/tr29/#SB9 | |
230 | SentenceCat::SC_Close | | |
231 | SentenceCat::SC_Sp | | |
232 | SentenceCat::SC_Sep | | |
233 | SentenceCat::SC_CR | | |
234 | SentenceCat::SC_LF if match_sb9(&state_before) => | |
235 | continue, | |
236 | ||
237 | // SB10 https://unicode.org/reports/tr29/#SB10 | |
238 | SentenceCat::SC_Sp | | |
239 | SentenceCat::SC_Sep | | |
240 | SentenceCat::SC_CR | | |
241 | SentenceCat::SC_LF if match_sb8a(&state_before) => | |
242 | continue, | |
243 | ||
244 | // SB11 https://unicode.org/reports/tr29/#SB11 | |
245 | _ if match_sb11(&state_before) => | |
246 | return Some(position_before), | |
247 | ||
248 | // SB998 https://unicode.org/reports/tr29/#SB998 | |
249 | _ => continue | |
250 | } | |
251 | } | |
252 | ||
253 | // SB2 https://unicode.org/reports/tr29/#SB2 | |
254 | if self.state.match1(StatePart::Sot) { | |
255 | None | |
256 | } else if self.state.match1(StatePart::Eot) { | |
257 | None | |
258 | } else { | |
259 | self.state = self.state.end(); | |
260 | Some(self.pos) | |
261 | } | |
262 | } | |
263 | } | |
264 | ||
265 | pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> { | |
266 | SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE } | |
267 | } | |
268 | ||
269 | } | |
270 | ||
271 | /// An iterator over the substrings of a string which, after splitting the string on | |
272 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries), | |
273 | /// contain any characters with the | |
274 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) | |
275 | /// property, or with | |
276 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). | |
5869c6ff XL |
277 | /// |
278 | /// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`] | |
279 | /// trait. See its documentation for more. | |
280 | /// | |
281 | /// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences | |
282 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html | |
60c5eb7d XL |
283 | #[derive(Clone)] |
284 | pub struct UnicodeSentences<'a> { | |
285 | inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>, | |
286 | } | |
287 | ||
288 | /// External iterator for a string's | |
289 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). | |
5869c6ff XL |
290 | /// |
291 | /// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`] | |
292 | /// trait. See its documentation for more. | |
293 | /// | |
294 | /// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds | |
295 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html | |
60c5eb7d XL |
296 | #[derive(Clone)] |
297 | pub struct USentenceBounds<'a> { | |
298 | iter: fwd::SentenceBreaks<'a>, | |
299 | sentence_start: Option<usize> | |
300 | } | |
301 | ||
302 | /// External iterator for sentence boundaries and byte offsets. | |
5869c6ff XL |
303 | /// |
304 | /// This struct is created by the [`split_sentence_bound_indices`] method on the | |
305 | /// [`UnicodeSegmentation`] trait. See its documentation for more. | |
306 | /// | |
307 | /// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices | |
308 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html | |
60c5eb7d XL |
309 | #[derive(Clone)] |
310 | pub struct USentenceBoundIndices<'a> { | |
311 | start_offset: usize, | |
312 | iter: USentenceBounds<'a>, | |
313 | } | |
314 | ||
315 | #[inline] | |
316 | pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> { | |
317 | USentenceBounds { | |
318 | iter: fwd::new_sentence_breaks(source), | |
319 | sentence_start: None | |
320 | } | |
321 | } | |
322 | ||
323 | #[inline] | |
324 | pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> { | |
325 | USentenceBoundIndices { | |
326 | start_offset: source.as_ptr() as usize, | |
327 | iter: new_sentence_bounds(source) | |
328 | } | |
329 | } | |
330 | ||
331 | #[inline] | |
332 | pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> { | |
333 | use super::UnicodeSegmentation; | |
136023e0 | 334 | use crate::tables::util::is_alphanumeric; |
60c5eb7d XL |
335 | |
336 | fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } | |
337 | let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer | |
338 | ||
339 | UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) } | |
340 | } | |
341 | ||
342 | impl<'a> Iterator for UnicodeSentences<'a> { | |
343 | type Item = &'a str; | |
344 | ||
345 | #[inline] | |
346 | fn next(&mut self) -> Option<&'a str> { self.inner.next() } | |
347 | } | |
348 | ||
349 | impl<'a> Iterator for USentenceBounds<'a> { | |
350 | type Item = &'a str; | |
351 | ||
352 | #[inline] | |
353 | fn size_hint(&self) -> (usize, Option<usize>) { | |
354 | let (lower, upper) = self.iter.size_hint(); | |
355 | (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) | |
356 | } | |
357 | ||
358 | #[inline] | |
359 | fn next(&mut self) -> Option<&'a str> { | |
360 | if self.sentence_start == None { | |
361 | if let Some(start_pos) = self.iter.next() { | |
362 | self.sentence_start = Some(start_pos) | |
363 | } else { | |
364 | return None | |
365 | } | |
366 | } | |
367 | ||
368 | if let Some(break_pos) = self.iter.next() { | |
369 | let start_pos = self.sentence_start.unwrap(); | |
370 | let sentence = &self.iter.string[start_pos..break_pos]; | |
371 | self.sentence_start = Some(break_pos); | |
372 | Some(sentence) | |
373 | } else { | |
374 | None | |
375 | } | |
376 | } | |
377 | } | |
378 | ||
379 | impl<'a> Iterator for USentenceBoundIndices<'a> { | |
380 | type Item = (usize, &'a str); | |
381 | ||
382 | #[inline] | |
383 | fn next(&mut self) -> Option<(usize, &'a str)> { | |
384 | self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s)) | |
385 | } | |
386 | ||
387 | #[inline] | |
388 | fn size_hint(&self) -> (usize, Option<usize>) { | |
389 | self.iter.size_hint() | |
390 | } | |
391 | } |