]> git.proxmox.com Git - rustc.git/blame - vendor/unicode-segmentation/src/sentence.rs
New upstream version 1.55.0+dfsg1
[rustc.git] / vendor / unicode-segmentation / src / sentence.rs
CommitLineData
60c5eb7d
XL
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14// All of the logic for forward iteration over sentences
15mod fwd {
136023e0 16 use crate::tables::sentence::SentenceCat;
60c5eb7d
XL
17 use core::cmp;
18
19 // Describe a parsed part of source string as described in this table:
20 // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21 #[derive(Clone, Copy, PartialEq, Eq)]
22 enum StatePart {
23 Sot,
24 Eot,
25 Other,
26 CR,
27 LF,
28 Sep,
29 ATerm,
30 UpperLower,
31 ClosePlus,
32 SpPlus,
33 STerm
34 }
35
36 #[derive(Clone, PartialEq, Eq)]
37 struct SentenceBreaksState(pub [StatePart; 4]);
38
39 const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40 StatePart::Sot,
41 StatePart::Sot,
42 StatePart::Sot,
43 StatePart::Sot
44 ]);
45
46 #[derive(Clone)]
47 pub struct SentenceBreaks<'a> {
48 pub string: &'a str,
49 pos: usize,
50 state: SentenceBreaksState
51 }
52
53 impl SentenceBreaksState {
54 // Attempt to advance the internal state by one part
55 // Whitespace and some punctutation will be collapsed
56 fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57 let &SentenceBreaksState(parts) = self;
58 let parts = match (parts[3], cat) {
59 (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60 (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61 _ => [
62 parts[1],
63 parts[2],
64 parts[3],
65 match cat {
66 SentenceCat::SC_CR => StatePart::CR,
67 SentenceCat::SC_LF => StatePart::LF,
68 SentenceCat::SC_Sep => StatePart::Sep,
69 SentenceCat::SC_ATerm => StatePart::ATerm,
70 SentenceCat::SC_Upper |
71 SentenceCat::SC_Lower => StatePart::UpperLower,
72 SentenceCat::SC_Close => StatePart::ClosePlus,
73 SentenceCat::SC_Sp => StatePart::SpPlus,
74 SentenceCat::SC_STerm => StatePart::STerm,
75 _ => StatePart::Other
76 }
77 ]
78 };
79 SentenceBreaksState(parts)
80 }
81
82 fn end(&self) -> SentenceBreaksState {
83 let &SentenceBreaksState(parts) = self;
84 SentenceBreaksState([
85 parts[1],
86 parts[2],
87 parts[3],
88 StatePart::Eot
89 ])
90 }
91
92 // Helper function to check if state head matches a single `StatePart`
93 fn match1(&self, part: StatePart) -> bool {
94 let &SentenceBreaksState(parts) = self;
95 part == parts[3]
96 }
97
98 // Helper function to check if first two `StateParts` in state match
99 // the given two
100 fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
101 let &SentenceBreaksState(parts) = self;
102 part1 == parts[2] && part2 == parts[3]
103 }
104 }
105
106 // https://unicode.org/reports/tr29/#SB8
107 // TODO cache this, it is currently quadratic
108 fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
109 let &SentenceBreaksState(parts) = state;
110 let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111 if parts[idx] == StatePart::ClosePlus { idx -= 1 }
112
113 if parts[idx] == StatePart::ATerm {
136023e0 114 use crate::tables::sentence as se;
60c5eb7d
XL
115
116 for next_char in ahead.chars() {
117 //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
5869c6ff 118 match se::sentence_category(next_char).2 {
60c5eb7d
XL
119 se::SC_Lower => return true,
120 se::SC_OLetter |
121 se::SC_Upper |
122 se::SC_Sep | se::SC_CR | se::SC_LF |
123 se::SC_STerm | se::SC_ATerm => return false,
124 _ => continue
125 }
126 }
127 }
128
129 false
130 }
131
132 // https://unicode.org/reports/tr29/#SB8a
133 fn match_sb8a(state: &SentenceBreaksState) -> bool {
134 // SATerm Close* Sp*
135 let &SentenceBreaksState(parts) = state;
136 let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
137 if parts[idx] == StatePart::ClosePlus { idx -= 1 }
138 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
139 }
140
141 // https://unicode.org/reports/tr29/#SB9
142 fn match_sb9(state: &SentenceBreaksState) -> bool {
143 // SATerm Close*
144 let &SentenceBreaksState(parts) = state;
145 let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
146 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
147 }
148
149 // https://unicode.org/reports/tr29/#SB11
150 fn match_sb11(state: &SentenceBreaksState) -> bool {
151 // SATerm Close* Sp* ParaSep?
152 let &SentenceBreaksState(parts) = state;
153 let mut idx = match parts[3] {
154 StatePart::Sep |
155 StatePart::CR |
156 StatePart::LF => 2,
157 _ => 3
158 };
159
160 if parts[idx] == StatePart::SpPlus { idx -= 1 }
161 if parts[idx] == StatePart::ClosePlus { idx -= 1}
162
163 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
164 }
165
166 impl<'a> Iterator for SentenceBreaks<'a> {
167 // Returns the index of the character which follows a break
168 type Item = usize;
169
170 #[inline]
171 fn size_hint(&self) -> (usize, Option<usize>) {
172 let slen = self.string.len();
173 // A sentence could be one character
174 (cmp::min(slen, 2), Some(slen + 1))
175 }
176
177 #[inline]
178 fn next(&mut self) -> Option<usize> {
136023e0 179 use crate::tables::sentence as se;
60c5eb7d
XL
180
181 for next_char in self.string[self.pos..].chars() {
182 let position_before = self.pos;
183 let state_before = self.state.clone();
184
5869c6ff 185 let next_cat = se::sentence_category(next_char).2;
60c5eb7d
XL
186
187 self.pos += next_char.len_utf8();
188 self.state = self.state.next(next_cat);
189
190 match next_cat {
191 // SB1 https://unicode.org/reports/tr29/#SB1
192 _ if state_before.match1(StatePart::Sot) =>
193 return Some(position_before),
194
195 // SB2 is handled when inner iterator (chars) is finished
196
197 // SB3 https://unicode.org/reports/tr29/#SB3
198 SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
199 continue,
200
201 // SB4 https://unicode.org/reports/tr29/#SB4
202 _ if state_before.match1(StatePart::Sep)
203 || state_before.match1(StatePart::CR)
204 || state_before.match1(StatePart::LF)
205 => return Some(position_before),
206
207 // SB5 https://unicode.org/reports/tr29/#SB5
208 SentenceCat::SC_Extend |
209 SentenceCat::SC_Format => self.state = state_before,
210
211 // SB6 https://unicode.org/reports/tr29/#SB6
212 SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
213 continue,
214
215 // SB7 https://unicode.org/reports/tr29/#SB7
216 SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
217 continue,
218
219 // SB8 https://unicode.org/reports/tr29/#SB8
220 _ if match_sb8(&state_before, &self.string[position_before..]) =>
221 continue,
222
223 // SB8a https://unicode.org/reports/tr29/#SB8a
224 SentenceCat::SC_SContinue |
225 SentenceCat::SC_STerm |
226 SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
227 continue,
228
229 // SB9 https://unicode.org/reports/tr29/#SB9
230 SentenceCat::SC_Close |
231 SentenceCat::SC_Sp |
232 SentenceCat::SC_Sep |
233 SentenceCat::SC_CR |
234 SentenceCat::SC_LF if match_sb9(&state_before) =>
235 continue,
236
237 // SB10 https://unicode.org/reports/tr29/#SB10
238 SentenceCat::SC_Sp |
239 SentenceCat::SC_Sep |
240 SentenceCat::SC_CR |
241 SentenceCat::SC_LF if match_sb8a(&state_before) =>
242 continue,
243
244 // SB11 https://unicode.org/reports/tr29/#SB11
245 _ if match_sb11(&state_before) =>
246 return Some(position_before),
247
248 // SB998 https://unicode.org/reports/tr29/#SB998
249 _ => continue
250 }
251 }
252
253 // SB2 https://unicode.org/reports/tr29/#SB2
254 if self.state.match1(StatePart::Sot) {
255 None
256 } else if self.state.match1(StatePart::Eot) {
257 None
258 } else {
259 self.state = self.state.end();
260 Some(self.pos)
261 }
262 }
263 }
264
265 pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
266 SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
267 }
268
269}
270
271/// An iterator over the substrings of a string which, after splitting the string on
272/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
273/// contain any characters with the
274/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
275/// property, or with
276/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
5869c6ff
XL
277///
278/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
279/// trait. See its documentation for more.
280///
281/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
282/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d
XL
283#[derive(Clone)]
284pub struct UnicodeSentences<'a> {
285 inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
286}
287
288/// External iterator for a string's
289/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
5869c6ff
XL
290///
291/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
292/// trait. See its documentation for more.
293///
294/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
295/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d
XL
296#[derive(Clone)]
297pub struct USentenceBounds<'a> {
298 iter: fwd::SentenceBreaks<'a>,
299 sentence_start: Option<usize>
300}
301
302/// External iterator for sentence boundaries and byte offsets.
5869c6ff
XL
303///
304/// This struct is created by the [`split_sentence_bound_indices`] method on the
305/// [`UnicodeSegmentation`] trait. See its documentation for more.
306///
307/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
308/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d
XL
309#[derive(Clone)]
310pub struct USentenceBoundIndices<'a> {
311 start_offset: usize,
312 iter: USentenceBounds<'a>,
313}
314
315#[inline]
316pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
317 USentenceBounds {
318 iter: fwd::new_sentence_breaks(source),
319 sentence_start: None
320 }
321}
322
323#[inline]
324pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
325 USentenceBoundIndices {
326 start_offset: source.as_ptr() as usize,
327 iter: new_sentence_bounds(source)
328 }
329}
330
331#[inline]
332pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
333 use super::UnicodeSegmentation;
136023e0 334 use crate::tables::util::is_alphanumeric;
60c5eb7d
XL
335
336 fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
337 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
338
339 UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
340}
341
342impl<'a> Iterator for UnicodeSentences<'a> {
343 type Item = &'a str;
344
345 #[inline]
346 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
347}
348
349impl<'a> Iterator for USentenceBounds<'a> {
350 type Item = &'a str;
351
352 #[inline]
353 fn size_hint(&self) -> (usize, Option<usize>) {
354 let (lower, upper) = self.iter.size_hint();
355 (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
356 }
357
358 #[inline]
359 fn next(&mut self) -> Option<&'a str> {
360 if self.sentence_start == None {
361 if let Some(start_pos) = self.iter.next() {
362 self.sentence_start = Some(start_pos)
363 } else {
364 return None
365 }
366 }
367
368 if let Some(break_pos) = self.iter.next() {
369 let start_pos = self.sentence_start.unwrap();
370 let sentence = &self.iter.string[start_pos..break_pos];
371 self.sentence_start = Some(break_pos);
372 Some(sentence)
373 } else {
374 None
375 }
376 }
377}
378
379impl<'a> Iterator for USentenceBoundIndices<'a> {
380 type Item = (usize, &'a str);
381
382 #[inline]
383 fn next(&mut self) -> Option<(usize, &'a str)> {
384 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
385 }
386
387 #[inline]
388 fn size_hint(&self) -> (usize, Option<usize>) {
389 self.iter.size_hint()
390 }
391}