vendor/bstr/src/unicode/sentence.rs

   1 use regex_automata::DFA;
   2
   3 use crate::ext_slice::ByteSlice;
   4 use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
   5 use crate::utf8;
   6
   7 /// An iterator over sentences in a byte string.
   8 ///
   9 /// This iterator is typically constructed by
  10 /// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
  11 ///
  12 /// Sentences typically include their trailing punctuation and whitespace.
  13 ///
  14 /// Since sentences are made up of one or more codepoints, this iterator yields
  15 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
  16 /// are [substituted](index.html#handling-of-invalid-utf-8).
  17 ///
  18 /// This iterator yields words in accordance with the default sentence boundary
  19 /// rules specified in
  20 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
  21 #[derive(Clone, Debug)]
  22 pub struct Sentences<'a> {
  23     bs: &'a [u8],
  24 }
  25
  26 impl<'a> Sentences<'a> {
  27     pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
  28         Sentences { bs }
  29     }
  30
  31     /// View the underlying data as a subslice of the original data.
  32     ///
  33     /// The slice returned has the same lifetime as the original slice, and so
  34     /// the iterator can continue to be used while this exists.
  35     ///
  36     /// # Examples
  37     ///
  38     /// ```
  39     /// use bstr::ByteSlice;
  40     ///
  41     /// let mut it = b"I want this. Not that. Right now.".sentences();
  42     ///
  43     /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
  44     /// it.next();
  45     /// assert_eq!(b"Not that. Right now.", it.as_bytes());
  46     /// it.next();
  47     /// it.next();
  48     /// assert_eq!(b"", it.as_bytes());
  49     /// ```
  50     #[inline]
  51     pub fn as_bytes(&self) -> &'a [u8] {
  52         self.bs
  53     }
  54 }
  55
  56 impl<'a> Iterator for Sentences<'a> {
  57     type Item = &'a str;
  58
  59     #[inline]
  60     fn next(&mut self) -> Option<&'a str> {
  61         let (sentence, size) = decode_sentence(self.bs);
  62         if size == 0 {
  63             return None;
  64         }
  65         self.bs = &self.bs[size..];
  66         Some(sentence)
  67     }
  68 }
  69
  70 /// An iterator over sentences in a byte string, along with their byte offsets.
  71 ///
  72 /// This iterator is typically constructed by
  73 /// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
  74 ///
  75 /// Sentences typically include their trailing punctuation and whitespace.
  76 ///
  77 /// Since sentences are made up of one or more codepoints, this iterator
  78 /// yields `&str` elements (along with their start and end byte offsets).
  79 /// When invalid UTF-8 is encountered, replacement codepoints are
  80 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
  81 /// indices yielded by this iterator may not correspond to the length of the
  82 /// sentence yielded with those indices. For example, when this iterator
  83 /// encounters `\xFF` in the byte string, then it will yield a pair of indices
  84 /// ranging over a single byte, but will provide an `&str` equivalent to
  85 /// `"\u{FFFD}"`, which is three bytes in length. However, when given only
  86 /// valid UTF-8, then all indices are in exact correspondence with their paired
  87 /// word.
  88 ///
  89 /// This iterator yields words in accordance with the default sentence boundary
  90 /// rules specified in
  91 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
  92 #[derive(Clone, Debug)]
  93 pub struct SentenceIndices<'a> {
  94     bs: &'a [u8],
  95     forward_index: usize,
  96 }
  97
  98 impl<'a> SentenceIndices<'a> {
  99     pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
 100         SentenceIndices { bs: bs, forward_index: 0 }
 101     }
 102
 103     /// View the underlying data as a subslice of the original data.
 104     ///
 105     /// The slice returned has the same lifetime as the original slice, and so
 106     /// the iterator can continue to be used while this exists.
 107     ///
 108     /// # Examples
 109     ///
 110     /// ```
 111     /// use bstr::ByteSlice;
 112     ///
 113     /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
 114     ///
 115     /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
 116     /// it.next();
 117     /// assert_eq!(b"Not that. Right now.", it.as_bytes());
 118     /// it.next();
 119     /// it.next();
 120     /// assert_eq!(b"", it.as_bytes());
 121     /// ```
 122     #[inline]
 123     pub fn as_bytes(&self) -> &'a [u8] {
 124         self.bs
 125     }
 126 }
 127
 128 impl<'a> Iterator for SentenceIndices<'a> {
 129     type Item = (usize, usize, &'a str);
 130
 131     #[inline]
 132     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
 133         let index = self.forward_index;
 134         let (word, size) = decode_sentence(self.bs);
 135         if size == 0 {
 136             return None;
 137         }
 138         self.bs = &self.bs[size..];
 139         self.forward_index += size;
 140         Some((index, index + size, word))
 141     }
 142 }
 143
 144 fn decode_sentence(bs: &[u8]) -> (&str, usize) {
 145     if bs.is_empty() {
 146         ("", 0)
 147     } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs) {
 148         // Safe because a match can only occur for valid UTF-8.
 149         let sentence = unsafe { bs[..end].to_str_unchecked() };
 150         (sentence, sentence.len())
 151     } else {
 152         const INVALID: &'static str = "\u{FFFD}";
 153         // No match on non-empty bytes implies we found invalid UTF-8.
 154         let (_, size) = utf8::decode_lossy(bs);
 155         (INVALID, size)
 156     }
 157 }
 158
 159 #[cfg(test)]
 160 mod tests {
 161     use ucd_parse::SentenceBreakTest;
 162
 163     use crate::ext_slice::ByteSlice;
 164
 165     #[test]
 166     fn forward_ucd() {
 167         for (i, test) in ucdtests().into_iter().enumerate() {
 168             let given = test.sentences.concat();
 169             let got = sentences(given.as_bytes());
 170             assert_eq!(
 171                 test.sentences,
 172                 got,
 173                 "\n\nsentence forward break test {} failed:\n\
 174                  given:    {:?}\n\
 175                  expected: {:?}\n\
 176                  got:      {:?}\n",
 177                 i,
 178                 given,
 179                 strs_to_bstrs(&test.sentences),
 180                 strs_to_bstrs(&got),
 181             );
 182         }
 183     }
 184
 185     // Some additional tests that don't seem to be covered by the UCD tests.
 186     #[test]
 187     fn forward_additional() {
 188         assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A"));
 189         assert_eq!(vec!["a.. a"], sentences(b"a.. a"));
 190
 191         assert_eq!(vec!["a... ", "A"], sentences(b"a... A"));
 192         assert_eq!(vec!["a... a"], sentences(b"a... a"));
 193
 194         assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a"));
 195     }
 196
 197     fn sentences(bytes: &[u8]) -> Vec<&str> {
 198         bytes.sentences().collect()
 199     }
 200
 201     fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
 202         strs.iter().map(|s| s.as_ref().as_bytes()).collect()
 203     }
 204
 205     /// Return all of the UCD for sentence breaks.
 206     fn ucdtests() -> Vec<SentenceBreakTest> {
 207         const TESTDATA: &'static str =
 208             include_str!("data/SentenceBreakTest.txt");
 209
 210         let mut tests = vec![];
 211         for mut line in TESTDATA.lines() {
 212             line = line.trim();
 213             if line.starts_with("#") || line.contains("surrogate") {
 214                 continue;
 215             }
 216             tests.push(line.parse().unwrap());
 217         }
 218         tests
 219     }
 220 }