]>
Commit | Line | Data |
---|---|---|
22758248 XL |
1 | use regex_automata::DFA; |
2 | ||
07e530e8 | 3 | use ext_slice::ByteSlice; |
22758248 XL |
4 | use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; |
5 | use unicode::fsm::word_break_fwd::WORD_BREAK_FWD; | |
6 | use utf8; | |
7 | ||
8 | /// An iterator over words in a byte string. | |
9 | /// | |
10 | /// This iterator is typically constructed by | |
07e530e8 | 11 | /// [`ByteSlice::words`](trait.ByteSlice.html#method.words). |
22758248 XL |
12 | /// |
13 | /// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html) | |
14 | /// iterator, except it only returns elements that contain a "word" character. | |
15 | /// A word character is defined by UTS #18 (Annex C) to be the combination | |
16 | /// of the `Alphabetic` and `Join_Control` properties, along with the | |
17 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. | |
18 | /// | |
19 | /// Since words are made up of one or more codepoints, this iterator yields | |
20 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints | |
21 | /// are [substituted](index.html#handling-of-invalid-utf-8). | |
22 | /// | |
23 | /// This iterator yields words in accordance with the default word boundary | |
24 | /// rules specified in | |
25 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). | |
26 | /// In particular, this may not be suitable for Japanese and Chinese scripts | |
27 | /// that do not use spaces between words. | |
28 | #[derive(Clone, Debug)] | |
29 | pub struct Words<'a>(WordsWithBreaks<'a>); | |
30 | ||
31 | impl<'a> Words<'a> { | |
07e530e8 | 32 | pub(crate) fn new(bs: &'a [u8]) -> Words<'a> { |
22758248 XL |
33 | Words(WordsWithBreaks::new(bs)) |
34 | } | |
35 | ||
36 | /// View the underlying data as a subslice of the original data. | |
37 | /// | |
38 | /// The slice returned has the same lifetime as the original slice, and so | |
39 | /// the iterator can continue to be used while this exists. | |
40 | /// | |
41 | /// # Examples | |
42 | /// | |
43 | /// ``` | |
07e530e8 | 44 | /// use bstr::ByteSlice; |
22758248 | 45 | /// |
07e530e8 | 46 | /// let mut it = b"foo bar baz".words(); |
22758248 | 47 | /// |
07e530e8 | 48 | /// assert_eq!(b"foo bar baz", it.as_bytes()); |
22758248 XL |
49 | /// it.next(); |
50 | /// it.next(); | |
07e530e8 | 51 | /// assert_eq!(b" baz", it.as_bytes()); |
22758248 | 52 | /// it.next(); |
07e530e8 | 53 | /// assert_eq!(b"", it.as_bytes()); |
22758248 XL |
54 | /// ``` |
55 | #[inline] | |
07e530e8 XL |
56 | pub fn as_bytes(&self) -> &'a [u8] { |
57 | self.0.as_bytes() | |
22758248 XL |
58 | } |
59 | } | |
60 | ||
61 | impl<'a> Iterator for Words<'a> { | |
62 | type Item = &'a str; | |
63 | ||
64 | #[inline] | |
65 | fn next(&mut self) -> Option<&'a str> { | |
66 | while let Some(word) = self.0.next() { | |
67 | if SIMPLE_WORD_FWD.is_match(word.as_bytes()) { | |
68 | return Some(word); | |
69 | } | |
70 | } | |
71 | None | |
72 | } | |
73 | } | |
74 | ||
75 | /// An iterator over words in a byte string and their byte index positions. | |
76 | /// | |
77 | /// This iterator is typically constructed by | |
07e530e8 | 78 | /// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices). |
22758248 XL |
79 | /// |
80 | /// This is similar to the | |
81 | /// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator, | |
82 | /// except it only returns elements that contain a "word" character. A | |
83 | /// word character is defined by UTS #18 (Annex C) to be the combination | |
84 | /// of the `Alphabetic` and `Join_Control` properties, along with the | |
85 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. | |
86 | /// | |
87 | /// Since words are made up of one or more codepoints, this iterator | |
88 | /// yields `&str` elements (along with their start and end byte offsets). | |
89 | /// When invalid UTF-8 is encountered, replacement codepoints are | |
90 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the | |
91 | /// indices yielded by this iterator may not correspond to the length of the | |
92 | /// word yielded with those indices. For example, when this iterator encounters | |
93 | /// `\xFF` in the byte string, then it will yield a pair of indices ranging | |
94 | /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`, | |
95 | /// which is three bytes in length. However, when given only valid UTF-8, then | |
96 | /// all indices are in exact correspondence with their paired word. | |
97 | /// | |
98 | /// This iterator yields words in accordance with the default word boundary | |
99 | /// rules specified in | |
100 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). | |
101 | /// In particular, this may not be suitable for Japanese and Chinese scripts | |
102 | /// that do not use spaces between words. | |
103 | #[derive(Clone, Debug)] | |
104 | pub struct WordIndices<'a>(WordsWithBreakIndices<'a>); | |
105 | ||
106 | impl<'a> WordIndices<'a> { | |
07e530e8 | 107 | pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> { |
22758248 XL |
108 | WordIndices(WordsWithBreakIndices::new(bs)) |
109 | } | |
110 | ||
111 | /// View the underlying data as a subslice of the original data. | |
112 | /// | |
113 | /// The slice returned has the same lifetime as the original slice, and so | |
114 | /// the iterator can continue to be used while this exists. | |
115 | /// | |
116 | /// # Examples | |
117 | /// | |
118 | /// ``` | |
07e530e8 | 119 | /// use bstr::ByteSlice; |
22758248 | 120 | /// |
07e530e8 | 121 | /// let mut it = b"foo bar baz".word_indices(); |
22758248 | 122 | /// |
07e530e8 | 123 | /// assert_eq!(b"foo bar baz", it.as_bytes()); |
22758248 XL |
124 | /// it.next(); |
125 | /// it.next(); | |
07e530e8 | 126 | /// assert_eq!(b" baz", it.as_bytes()); |
22758248 XL |
127 | /// it.next(); |
128 | /// it.next(); | |
07e530e8 | 129 | /// assert_eq!(b"", it.as_bytes()); |
22758248 XL |
130 | /// ``` |
131 | #[inline] | |
07e530e8 XL |
132 | pub fn as_bytes(&self) -> &'a [u8] { |
133 | self.0.as_bytes() | |
22758248 XL |
134 | } |
135 | } | |
136 | ||
137 | impl<'a> Iterator for WordIndices<'a> { | |
138 | type Item = (usize, usize, &'a str); | |
139 | ||
140 | #[inline] | |
141 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { | |
142 | while let Some((start, end, word)) = self.0.next() { | |
143 | if SIMPLE_WORD_FWD.is_match(word.as_bytes()) { | |
144 | return Some((start, end, word)); | |
145 | } | |
146 | } | |
147 | None | |
148 | } | |
149 | } | |
150 | ||
/// Iterates over every word-break segment of a byte string.
///
/// The usual way to obtain this iterator is
/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
///
/// Both the words and the content between words are yielded. Concatenating
/// every yielded element therefore reproduces the original string (subject
/// to Unicode replacement codepoint substitutions).
///
/// A segment consists of one or more codepoints, so the items are `&str`.
/// Invalid UTF-8 is
/// [replaced](index.html#handling-of-invalid-utf-8) by the Unicode
/// replacement codepoint.
///
/// Segmentation follows the default word boundary rules of
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// Those rules may be a poor fit for Japanese and Chinese scripts, which do
/// not put spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreaks<'a> {
    // The part of the input that has not been yielded yet.
    bs: &'a [u8],
}
174 | ||
175 | impl<'a> WordsWithBreaks<'a> { | |
07e530e8 | 176 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> { |
22758248 XL |
177 | WordsWithBreaks { bs } |
178 | } | |
179 | ||
180 | /// View the underlying data as a subslice of the original data. | |
181 | /// | |
182 | /// The slice returned has the same lifetime as the original slice, and so | |
183 | /// the iterator can continue to be used while this exists. | |
184 | /// | |
185 | /// # Examples | |
186 | /// | |
187 | /// ``` | |
07e530e8 | 188 | /// use bstr::ByteSlice; |
22758248 | 189 | /// |
07e530e8 | 190 | /// let mut it = b"foo bar baz".words_with_breaks(); |
22758248 | 191 | /// |
07e530e8 | 192 | /// assert_eq!(b"foo bar baz", it.as_bytes()); |
22758248 | 193 | /// it.next(); |
07e530e8 | 194 | /// assert_eq!(b" bar baz", it.as_bytes()); |
22758248 XL |
195 | /// it.next(); |
196 | /// it.next(); | |
07e530e8 | 197 | /// assert_eq!(b" baz", it.as_bytes()); |
22758248 XL |
198 | /// it.next(); |
199 | /// it.next(); | |
07e530e8 | 200 | /// assert_eq!(b"", it.as_bytes()); |
22758248 XL |
201 | /// ``` |
202 | #[inline] | |
07e530e8 | 203 | pub fn as_bytes(&self) -> &'a [u8] { |
22758248 XL |
204 | self.bs |
205 | } | |
206 | } | |
207 | ||
208 | impl<'a> Iterator for WordsWithBreaks<'a> { | |
209 | type Item = &'a str; | |
210 | ||
211 | #[inline] | |
212 | fn next(&mut self) -> Option<&'a str> { | |
213 | let (word, size) = decode_word(self.bs); | |
214 | if size == 0 { | |
215 | return None; | |
216 | } | |
217 | self.bs = &self.bs[size..]; | |
218 | Some(word) | |
219 | } | |
220 | } | |
221 | ||
/// Iterates over every word-break segment of a byte string together with its
/// byte offsets.
///
/// The usual way to obtain this iterator is
/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
///
/// Both the words and the content between words are yielded. Concatenating
/// every yielded element therefore reproduces the original string (subject
/// to Unicode replacement codepoint substitutions).
///
/// A segment consists of one or more codepoints, so each item carries a
/// `&str` alongside its start and end byte offsets. Invalid UTF-8 is
/// [replaced](index.html#handling-of-invalid-utf-8) by the Unicode
/// replacement codepoint, which means the offsets do not always agree with
/// the length of the yielded word. For instance, a lone `\xFF` in the input
/// produces offsets spanning a single byte, while the accompanying `&str` is
/// `"\u{FFFD}"`, which is three bytes long. When the input is entirely valid
/// UTF-8, offsets and words correspond exactly.
///
/// Segmentation follows the default word boundary rules of
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// Those rules may be a poor fit for Japanese and Chinese scripts, which do
/// not put spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreakIndices<'a> {
    // The part of the input that has not been yielded yet.
    bs: &'a [u8],
    // Number of input bytes consumed so far; this is the start offset of the
    // next segment.
    forward_index: usize,
}
254 | ||
255 | impl<'a> WordsWithBreakIndices<'a> { | |
07e530e8 | 256 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { |
22758248 XL |
257 | WordsWithBreakIndices { bs: bs, forward_index: 0 } |
258 | } | |
259 | ||
260 | /// View the underlying data as a subslice of the original data. | |
261 | /// | |
262 | /// The slice returned has the same lifetime as the original slice, and so | |
263 | /// the iterator can continue to be used while this exists. | |
264 | /// | |
265 | /// # Examples | |
266 | /// | |
267 | /// ``` | |
07e530e8 | 268 | /// use bstr::ByteSlice; |
22758248 | 269 | /// |
07e530e8 | 270 | /// let mut it = b"foo bar baz".words_with_break_indices(); |
22758248 | 271 | /// |
07e530e8 | 272 | /// assert_eq!(b"foo bar baz", it.as_bytes()); |
22758248 | 273 | /// it.next(); |
07e530e8 | 274 | /// assert_eq!(b" bar baz", it.as_bytes()); |
22758248 XL |
275 | /// it.next(); |
276 | /// it.next(); | |
07e530e8 | 277 | /// assert_eq!(b" baz", it.as_bytes()); |
22758248 XL |
278 | /// it.next(); |
279 | /// it.next(); | |
07e530e8 | 280 | /// assert_eq!(b"", it.as_bytes()); |
22758248 XL |
281 | /// ``` |
282 | #[inline] | |
07e530e8 | 283 | pub fn as_bytes(&self) -> &'a [u8] { |
22758248 XL |
284 | self.bs |
285 | } | |
286 | } | |
287 | ||
288 | impl<'a> Iterator for WordsWithBreakIndices<'a> { | |
289 | type Item = (usize, usize, &'a str); | |
290 | ||
291 | #[inline] | |
292 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { | |
293 | let index = self.forward_index; | |
294 | let (word, size) = decode_word(self.bs); | |
295 | if size == 0 { | |
296 | return None; | |
297 | } | |
298 | self.bs = &self.bs[size..]; | |
299 | self.forward_index += size; | |
300 | Some((index, index + size, word)) | |
301 | } | |
302 | } | |
303 | ||
07e530e8 | 304 | fn decode_word(bs: &[u8]) -> (&str, usize) { |
22758248 XL |
305 | if bs.is_empty() { |
306 | ("", 0) | |
07e530e8 | 307 | } else if let Some(end) = WORD_BREAK_FWD.find(bs) { |
22758248 XL |
308 | // Safe because a match can only occur for valid UTF-8. |
309 | let word = unsafe { bs[..end].to_str_unchecked() }; | |
310 | (word, word.len()) | |
311 | } else { | |
312 | const INVALID: &'static str = "\u{FFFD}"; | |
313 | // No match on non-empty bytes implies we found invalid UTF-8. | |
07e530e8 | 314 | let (_, size) = utf8::decode_lossy(bs); |
22758248 XL |
315 | (INVALID, size) |
316 | } | |
317 | } | |
318 | ||
#[cfg(test)]
mod tests {
    use ucd_parse::WordBreakTest;

    use ext_slice::ByteSlice;

    /// Runs every UCD word break test case through the forward segmenter.
    #[test]
    fn forward_ucd() {
        for (index, case) in ucdtests().into_iter().enumerate() {
            let input = case.words.concat();
            let actual = words(input.as_bytes());
            assert_eq!(
                case.words,
                actual,
                "\n\nword forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                index,
                input,
                strs_to_bstrs(&case.words),
                strs_to_bstrs(&actual),
            );
        }
    }

    // Extra cases that the UCD tests do not appear to cover.
    //
    // They were discovered by running this crate's segmenter and ICU's
    // segmenter over the same text and comparing the results.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
        assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
        assert_eq!(
            vec!["whatsoever", ".", " ", "You", " ", "may"],
            words(b"whatsoever. You may")
        );
        assert_eq!(
            vec!["21stcentury'syesterday"],
            words(b"21stcentury'syesterday")
        );

        assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
        assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
        assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
        assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
        assert_eq!(vec!["_123'4"], words(b"_123'4"));
        assert_eq!(vec!["__12'345"], words(b"__12'345"));

        assert_eq!(
            vec!["tomorrowat4", ":", "00", ","],
            words(b"tomorrowat4:00,")
        );
        assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
        assert_eq!(vec!["X38"], words(b"X38"));

        assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
        assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
        assert_eq!(vec!["1XY"], words(b"1XY"));

        assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
    }

    /// Collects every segment (words and the content between them) of
    /// `bytes`.
    fn words(bytes: &[u8]) -> Vec<&str> {
        bytes.words_with_breaks().collect()
    }

    /// Borrows each string as its underlying bytes.
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        let mut byte_views = Vec::with_capacity(strs.len());
        for s in strs {
            byte_views.push(s.as_ref().as_bytes());
        }
        byte_views
    }

    /// Return all of the UCD for word breaks.
    ///
    /// Lines that (after trimming) start with `#` or mention "surrogate" are
    /// skipped; every other line must parse as a `WordBreakTest`.
    fn ucdtests() -> Vec<WordBreakTest> {
        const TESTDATA: &str = include_str!("data/WordBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| {
                !(line.starts_with("#") || line.contains("surrogate"))
            })
            .map(|line| line.parse::<WordBreakTest>().unwrap())
            .collect()
    }
}