]> git.proxmox.com Git - rustc.git/blob - vendor/bstr/src/unicode/sentence.rs
New upstream version 1.64.0+dfsg1
[rustc.git] / vendor / bstr / src / unicode / sentence.rs
1 use regex_automata::DFA;
2
3 use crate::ext_slice::ByteSlice;
4 use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
5 use crate::utf8;
6
/// Iterates over the sentences contained in a byte string.
///
/// The usual way to obtain this iterator is by calling
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// A yielded sentence generally carries its trailing punctuation and
/// whitespace along with it.
///
/// Every sentence consists of at least one codepoint, so the items produced
/// by this iterator are `&str` values. Wherever invalid UTF-8 is found, a
/// replacement codepoint is
/// [substituted](index.html#handling-of-invalid-utf-8) for it.
///
/// Sentence boundaries follow the default sentence boundary rules laid out in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Debug, Clone)]
pub struct Sentences<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
}
25
26 impl<'a> Sentences<'a> {
27 pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
28 Sentences { bs }
29 }
30
31 /// View the underlying data as a subslice of the original data.
32 ///
33 /// The slice returned has the same lifetime as the original slice, and so
34 /// the iterator can continue to be used while this exists.
35 ///
36 /// # Examples
37 ///
38 /// ```
39 /// use bstr::ByteSlice;
40 ///
41 /// let mut it = b"I want this. Not that. Right now.".sentences();
42 ///
43 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
44 /// it.next();
45 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
46 /// it.next();
47 /// it.next();
48 /// assert_eq!(b"", it.as_bytes());
49 /// ```
50 #[inline]
51 pub fn as_bytes(&self) -> &'a [u8] {
52 self.bs
53 }
54 }
55
56 impl<'a> Iterator for Sentences<'a> {
57 type Item = &'a str;
58
59 #[inline]
60 fn next(&mut self) -> Option<&'a str> {
61 let (sentence, size) = decode_sentence(self.bs);
62 if size == 0 {
63 return None;
64 }
65 self.bs = &self.bs[size..];
66 Some(sentence)
67 }
68 }
69
/// Iterates over the sentences in a byte string together with the byte
/// offsets at which each sentence starts and ends.
///
/// The usual way to obtain this iterator is by calling
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// A yielded sentence generally carries its trailing punctuation and
/// whitespace along with it.
///
/// Every sentence consists of at least one codepoint, so each item pairs a
/// `&str` with its start and end byte offsets. Wherever invalid UTF-8 is
/// found, a replacement codepoint is
/// [substituted](index.html#handling-of-invalid-utf-8). As a consequence, the
/// distance between the two offsets is not always the length of the
/// accompanying `&str`. On meeting `\xFF`, for instance, the offsets span a
/// single byte while the `&str` is `"\u{FFFD}"`, which is three bytes long.
/// For input that is entirely valid UTF-8, the offsets always line up exactly
/// with the sentence they accompany.
///
/// Sentence boundaries follow the default sentence boundary rules laid out in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Debug, Clone)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
    /// Offset of `bs` within the data the iterator was created from.
    forward_index: usize,
}
97
98 impl<'a> SentenceIndices<'a> {
99 pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
100 SentenceIndices { bs: bs, forward_index: 0 }
101 }
102
103 /// View the underlying data as a subslice of the original data.
104 ///
105 /// The slice returned has the same lifetime as the original slice, and so
106 /// the iterator can continue to be used while this exists.
107 ///
108 /// # Examples
109 ///
110 /// ```
111 /// use bstr::ByteSlice;
112 ///
113 /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
114 ///
115 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
116 /// it.next();
117 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
118 /// it.next();
119 /// it.next();
120 /// assert_eq!(b"", it.as_bytes());
121 /// ```
122 #[inline]
123 pub fn as_bytes(&self) -> &'a [u8] {
124 self.bs
125 }
126 }
127
128 impl<'a> Iterator for SentenceIndices<'a> {
129 type Item = (usize, usize, &'a str);
130
131 #[inline]
132 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
133 let index = self.forward_index;
134 let (word, size) = decode_sentence(self.bs);
135 if size == 0 {
136 return None;
137 }
138 self.bs = &self.bs[size..];
139 self.forward_index += size;
140 Some((index, index + size, word))
141 }
142 }
143
144 fn decode_sentence(bs: &[u8]) -> (&str, usize) {
145 if bs.is_empty() {
146 ("", 0)
147 } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs) {
148 // Safe because a match can only occur for valid UTF-8.
149 let sentence = unsafe { bs[..end].to_str_unchecked() };
150 (sentence, sentence.len())
151 } else {
152 const INVALID: &'static str = "\u{FFFD}";
153 // No match on non-empty bytes implies we found invalid UTF-8.
154 let (_, size) = utf8::decode_lossy(bs);
155 (INVALID, size)
156 }
157 }
158
#[cfg(test)]
mod tests {
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Checks forward sentence segmentation against every applicable case in
    /// the UCD sentence break test data.
    #[test]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            // Joining the expected sentences gives the unsegmented input.
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        let check = |input: &[u8], expected: &[&str]| {
            assert_eq!(expected.to_vec(), sentences(input));
        };

        check(b"a.. A", &["a.. ", "A"]);
        check(b"a.. a", &["a.. a"]);

        check(b"a... A", &["a... ", "A"]);
        check(b"a... a", &["a... a"]);

        check(b"a...,..., a", &["a...,..., a"]);
    }

    /// Splits `bytes` into sentences and gathers them into a vector.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect::<Vec<_>>()
    }

    /// Views each string as its raw bytes.
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        let mut out = Vec::with_capacity(strs.len());
        for s in strs {
            out.push(s.as_ref().as_bytes());
        }
        out
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Lines that start with `#` (after trimming) or that mention
    /// "surrogate" are skipped; every other line must parse.
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| !(line.starts_with("#") || line.contains("surrogate")))
            .map(|line| line.parse::<SentenceBreakTest>().unwrap())
            .collect()
    }
}
220 }