1 use regex_automata
::DFA
;
3 use crate::ext_slice
::ByteSlice
;
4 use crate::unicode
::fsm
::sentence_break_fwd
::SENTENCE_BREAK_FWD
;
7 /// An iterator over sentences in a byte string.
9 /// This iterator is typically constructed by
10 /// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
12 /// Sentences typically include their trailing punctuation and whitespace.
14 /// Since sentences are made up of one or more codepoints, this iterator yields
15 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
16 /// are [substituted](index.html#handling-of-invalid-utf-8).
18 /// This iterator yields sentences in accordance with the default sentence
19 /// boundary rules specified in
20 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
21 #[derive(Clone, Debug)]
22 pub struct Sentences
<'a
> {
26 impl<'a
> Sentences
<'a
> {
27 pub(crate) fn new(bs
: &'a
[u8]) -> Sentences
<'a
> {
31 /// View the underlying data as a subslice of the original data.
33 /// The slice returned has the same lifetime as the original slice, and so
34 /// the iterator can continue to be used while this exists.
39 /// use bstr::ByteSlice;
41 /// let mut it = b"I want this. Not that. Right now.".sentences();
43 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
45 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
48 /// assert_eq!(b"", it.as_bytes());
51 pub fn as_bytes(&self) -> &'a
[u8] {
56 impl<'a
> Iterator
for Sentences
<'a
> {
60 fn next(&mut self) -> Option
<&'a
str> {
61 let (sentence
, size
) = decode_sentence(self.bs
);
65 self.bs
= &self.bs
[size
..];
70 /// An iterator over sentences in a byte string, along with their byte offsets.
72 /// This iterator is typically constructed by
73 /// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
75 /// Sentences typically include their trailing punctuation and whitespace.
77 /// Since sentences are made up of one or more codepoints, this iterator
78 /// yields `&str` elements (along with their start and end byte offsets).
79 /// When invalid UTF-8 is encountered, replacement codepoints are
80 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
81 /// indices yielded by this iterator may not correspond to the length of the
82 /// sentence yielded with those indices. For example, when this iterator
83 /// encounters `\xFF` in the byte string, then it will yield a pair of indices
84 /// ranging over a single byte, but will provide an `&str` equivalent to
85 /// `"\u{FFFD}"`, which is three bytes in length. However, when given only
86 /// valid UTF-8, then all indices are in exact correspondence with their paired
89 /// This iterator yields sentences in accordance with the default sentence
90 /// boundary rules specified in
91 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
92 #[derive(Clone, Debug)]
93 pub struct SentenceIndices
<'a
> {
98 impl<'a
> SentenceIndices
<'a
> {
99 pub(crate) fn new(bs
: &'a
[u8]) -> SentenceIndices
<'a
> {
100 SentenceIndices { bs: bs, forward_index: 0 }
103 /// View the underlying data as a subslice of the original data.
105 /// The slice returned has the same lifetime as the original slice, and so
106 /// the iterator can continue to be used while this exists.
111 /// use bstr::ByteSlice;
113 /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
115 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
117 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
120 /// assert_eq!(b"", it.as_bytes());
123 pub fn as_bytes(&self) -> &'a
[u8] {
128 impl<'a
> Iterator
for SentenceIndices
<'a
> {
129 type Item
= (usize, usize, &'a
str);
132 fn next(&mut self) -> Option
<(usize, usize, &'a
str)> {
133 let index
= self.forward_index
;
134 let (word
, size
) = decode_sentence(self.bs
);
138 self.bs
= &self.bs
[size
..];
139 self.forward_index
+= size
;
140 Some((index
, index
+ size
, word
))
144 fn decode_sentence(bs
: &[u8]) -> (&str, usize) {
147 } else if let Some(end
) = SENTENCE_BREAK_FWD
.find(bs
) {
148 // SAFETY: a match can only occur for valid UTF-8, so `bs[..end]` is valid UTF-8.
149 let sentence
= unsafe { bs[..end].to_str_unchecked() }
;
150 (sentence
, sentence
.len())
152 const INVALID
: &'
static str = "\u{FFFD}";
153 // No match on non-empty bytes implies we found invalid UTF-8.
154 let (_
, size
) = utf8
::decode_lossy(bs
);
161 use ucd_parse
::SentenceBreakTest
;
163 use crate::ext_slice
::ByteSlice
;
167 for (i
, test
) in ucdtests().into_iter().enumerate() {
168 let given
= test
.sentences
.concat();
169 let got
= sentences(given
.as_bytes());
173 "\n\nsentence forward break test {} failed:\n\
179 strs_to_bstrs(&test
.sentences
),
185 // Some additional tests that don't seem to be covered by the UCD tests.
187 fn forward_additional() {
188 assert_eq
!(vec
!["a.. ", "A"], sentences(b
"a.. A"));
189 assert_eq
!(vec
!["a.. a"], sentences(b
"a.. a"));
191 assert_eq
!(vec
!["a... ", "A"], sentences(b
"a... A"));
192 assert_eq
!(vec
!["a... a"], sentences(b
"a... a"));
194 assert_eq
!(vec
!["a...,..., a"], sentences(b
"a...,..., a"));
197 fn sentences(bytes
: &[u8]) -> Vec
<&str> {
198 bytes
.sentences().collect()
201 fn strs_to_bstrs
<S
: AsRef
<str>>(strs
: &[S
]) -> Vec
<&[u8]> {
202 strs
.iter().map(|s
| s
.as_ref().as_bytes()).collect()
205 /// Return all of the UCD test cases for sentence breaks.
206 fn ucdtests() -> Vec
<SentenceBreakTest
> {
207 const TESTDATA
: &'
static str =
208 include_str
!("data/SentenceBreakTest.txt");
210 let mut tests
= vec
![];
211 for mut line
in TESTDATA
.lines() {
213 if line
.starts_with("#") || line
.contains("surrogate") {
216 tests
.push(line
.parse().unwrap());