]> git.proxmox.com Git - cargo.git/blame - vendor/bstr/src/unicode/word.rs
New upstream version 0.47.0
[cargo.git] / vendor / bstr / src / unicode / word.rs
CommitLineData
22758248
XL
1use regex_automata::DFA;
2
07e530e8 3use ext_slice::ByteSlice;
22758248
XL
4use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
5use unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
6use utf8;
7
8/// An iterator over words in a byte string.
9///
10/// This iterator is typically constructed by
07e530e8 11/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
22758248
XL
12///
13/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
14/// iterator, except it only returns elements that contain a "word" character.
15/// A word character is defined by UTS #18 (Annex C) to be the combination
16/// of the `Alphabetic` and `Join_Control` properties, along with the
17/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
18///
19/// Since words are made up of one or more codepoints, this iterator yields
20/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
21/// are [substituted](index.html#handling-of-invalid-utf-8).
22///
23/// This iterator yields words in accordance with the default word boundary
24/// rules specified in
25/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
26/// In particular, this may not be suitable for Japanese and Chinese scripts
27/// that do not use spaces between words.
28#[derive(Clone, Debug)]
29pub struct Words<'a>(WordsWithBreaks<'a>);
30
31impl<'a> Words<'a> {
07e530e8 32 pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
22758248
XL
33 Words(WordsWithBreaks::new(bs))
34 }
35
36 /// View the underlying data as a subslice of the original data.
37 ///
38 /// The slice returned has the same lifetime as the original slice, and so
39 /// the iterator can continue to be used while this exists.
40 ///
41 /// # Examples
42 ///
43 /// ```
07e530e8 44 /// use bstr::ByteSlice;
22758248 45 ///
07e530e8 46 /// let mut it = b"foo bar baz".words();
22758248 47 ///
07e530e8 48 /// assert_eq!(b"foo bar baz", it.as_bytes());
22758248
XL
49 /// it.next();
50 /// it.next();
07e530e8 51 /// assert_eq!(b" baz", it.as_bytes());
22758248 52 /// it.next();
07e530e8 53 /// assert_eq!(b"", it.as_bytes());
22758248
XL
54 /// ```
55 #[inline]
07e530e8
XL
56 pub fn as_bytes(&self) -> &'a [u8] {
57 self.0.as_bytes()
22758248
XL
58 }
59}
60
61impl<'a> Iterator for Words<'a> {
62 type Item = &'a str;
63
64 #[inline]
65 fn next(&mut self) -> Option<&'a str> {
66 while let Some(word) = self.0.next() {
67 if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
68 return Some(word);
69 }
70 }
71 None
72 }
73}
74
75/// An iterator over words in a byte string and their byte index positions.
76///
77/// This iterator is typically constructed by
07e530e8 78/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
22758248
XL
79///
80/// This is similar to the
81/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
82/// except it only returns elements that contain a "word" character. A
83/// word character is defined by UTS #18 (Annex C) to be the combination
84/// of the `Alphabetic` and `Join_Control` properties, along with the
85/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
86///
87/// Since words are made up of one or more codepoints, this iterator
88/// yields `&str` elements (along with their start and end byte offsets).
89/// When invalid UTF-8 is encountered, replacement codepoints are
90/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
91/// indices yielded by this iterator may not correspond to the length of the
92/// word yielded with those indices. For example, when this iterator encounters
93/// `\xFF` in the byte string, then it will yield a pair of indices ranging
94/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
95/// which is three bytes in length. However, when given only valid UTF-8, then
96/// all indices are in exact correspondence with their paired word.
97///
98/// This iterator yields words in accordance with the default word boundary
99/// rules specified in
100/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
101/// In particular, this may not be suitable for Japanese and Chinese scripts
102/// that do not use spaces between words.
103#[derive(Clone, Debug)]
104pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
105
106impl<'a> WordIndices<'a> {
07e530e8 107 pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
22758248
XL
108 WordIndices(WordsWithBreakIndices::new(bs))
109 }
110
111 /// View the underlying data as a subslice of the original data.
112 ///
113 /// The slice returned has the same lifetime as the original slice, and so
114 /// the iterator can continue to be used while this exists.
115 ///
116 /// # Examples
117 ///
118 /// ```
07e530e8 119 /// use bstr::ByteSlice;
22758248 120 ///
07e530e8 121 /// let mut it = b"foo bar baz".word_indices();
22758248 122 ///
07e530e8 123 /// assert_eq!(b"foo bar baz", it.as_bytes());
22758248
XL
124 /// it.next();
125 /// it.next();
07e530e8 126 /// assert_eq!(b" baz", it.as_bytes());
22758248
XL
127 /// it.next();
128 /// it.next();
07e530e8 129 /// assert_eq!(b"", it.as_bytes());
22758248
XL
130 /// ```
131 #[inline]
07e530e8
XL
132 pub fn as_bytes(&self) -> &'a [u8] {
133 self.0.as_bytes()
22758248
XL
134 }
135}
136
137impl<'a> Iterator for WordIndices<'a> {
138 type Item = (usize, usize, &'a str);
139
140 #[inline]
141 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
142 while let Some((start, end, word)) = self.0.next() {
143 if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
144 return Some((start, end, word));
145 }
146 }
147 None
148 }
149}
150
/// An iterator over all word breaks in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// concatenated, then the result is the original string (subject to Unicode
/// replacement codepoint substitutions).
///
/// Since words are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreaks<'a> {
    // The bytes that have not been yielded yet.
    bs: &'a [u8],
}

impl<'a> WordsWithBreaks<'a> {
    /// Create an iterator positioned at the start of `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
        WordsWithBreaks { bs }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".words_with_breaks();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// assert_eq!(b" bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
207
208impl<'a> Iterator for WordsWithBreaks<'a> {
209 type Item = &'a str;
210
211 #[inline]
212 fn next(&mut self) -> Option<&'a str> {
213 let (word, size) = decode_word(self.bs);
214 if size == 0 {
215 return None;
216 }
217 self.bs = &self.bs[size..];
218 Some(word)
219 }
220}
221
/// An iterator over all word breaks in a byte string, along with their byte
/// index positions.
///
/// This iterator is typically constructed by
/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// concatenated, then the result is the original string (subject to Unicode
/// replacement codepoint substitutions).
///
/// Since words are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// word yielded with those indices. For example, when this iterator encounters
/// `\xFF` in the byte string, then it will yield a pair of indices ranging
/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
/// which is three bytes in length. However, when given only valid UTF-8, then
/// all indices are in exact correspondence with their paired word.
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreakIndices<'a> {
    // The bytes that have not been yielded yet.
    bs: &'a [u8],
    // Offset of `bs` relative to the slice the iterator was created with.
    forward_index: usize,
}

impl<'a> WordsWithBreakIndices<'a> {
    /// Create an iterator positioned at the start of `bs`, with offsets
    /// counted from zero.
    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
        WordsWithBreakIndices { bs, forward_index: 0 }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".words_with_break_indices();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// assert_eq!(b" bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
287
288impl<'a> Iterator for WordsWithBreakIndices<'a> {
289 type Item = (usize, usize, &'a str);
290
291 #[inline]
292 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
293 let index = self.forward_index;
294 let (word, size) = decode_word(self.bs);
295 if size == 0 {
296 return None;
297 }
298 self.bs = &self.bs[size..];
299 self.forward_index += size;
300 Some((index, index + size, word))
301 }
302}
303
07e530e8 304fn decode_word(bs: &[u8]) -> (&str, usize) {
22758248
XL
305 if bs.is_empty() {
306 ("", 0)
07e530e8 307 } else if let Some(end) = WORD_BREAK_FWD.find(bs) {
22758248
XL
308 // Safe because a match can only occur for valid UTF-8.
309 let word = unsafe { bs[..end].to_str_unchecked() };
310 (word, word.len())
311 } else {
312 const INVALID: &'static str = "\u{FFFD}";
313 // No match on non-empty bytes implies we found invalid UTF-8.
07e530e8 314 let (_, size) = utf8::decode_lossy(bs);
22758248
XL
315 (INVALID, size)
316 }
317}
318
#[cfg(test)]
mod tests {
    use ucd_parse::WordBreakTest;

    use ext_slice::ByteSlice;

    // Every UCD word break test case must segment exactly as specified.
    #[test]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.words.concat();
            let got = words(given.as_bytes());
            assert_eq!(
                test.words,
                got,
                "\n\nword forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.words),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    //
    // It's pretty amazing that the UCD tests miss these cases. I only found
    // them by running this crate's segmenter and ICU's segmenter on the same
    // text and comparing the output.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
        assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
        assert_eq!(
            vec!["whatsoever", ".", " ", "You", " ", "may"],
            words(b"whatsoever. You may")
        );
        assert_eq!(
            vec!["21stcentury'syesterday"],
            words(b"21stcentury'syesterday")
        );

        assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
        assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
        assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
        assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
        assert_eq!(vec!["_123'4"], words(b"_123'4"));
        assert_eq!(vec!["__12'345"], words(b"__12'345"));

        assert_eq!(
            vec!["tomorrowat4", ":", "00", ","],
            words(b"tomorrowat4:00,")
        );
        assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
        assert_eq!(vec!["X38"], words(b"X38"));

        assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
        assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
        assert_eq!(vec!["1XY"], words(b"1XY"));

        assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
    }

    /// Collect every segment (words and the content between them) of `bytes`.
    fn words(bytes: &[u8]) -> Vec<&str> {
        bytes.words_with_breaks().collect()
    }

    /// Borrow each string as raw bytes, for use in failure messages.
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for word breaks.
    ///
    /// Lines starting with `#` and lines mentioning "surrogate" are skipped;
    /// every other line must parse as a `WordBreakTest`.
    fn ucdtests() -> Vec<WordBreakTest> {
        const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");

        let mut tests = vec![];
        for line in TESTDATA.lines() {
            let line = line.trim();
            if line.starts_with('#') || line.contains("surrogate") {
                continue;
            }
            tests.push(line.parse().unwrap());
        }
        tests
    }
}