vendor/bstr/src/unicode/grapheme.rs

   1 use regex_automata::{dfa::Automaton, Anchored, Input};
   2
   3 use crate::{
   4     ext_slice::ByteSlice,
   5     unicode::fsm::{
   6         grapheme_break_fwd::GRAPHEME_BREAK_FWD,
   7         grapheme_break_rev::GRAPHEME_BREAK_REV,
   8         regional_indicator_rev::REGIONAL_INDICATOR_REV,
   9     },
  10     utf8,
  11 };
  12
  13 /// An iterator over grapheme clusters in a byte string.
  14 ///
  15 /// This iterator is typically constructed by
  16 /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
  17 ///
  18 /// Unicode defines a grapheme cluster as an *approximation* to a single user
  19 /// visible character. A grapheme cluster, or just "grapheme," is made up of
  20 /// one or more codepoints. For end user oriented tasks, one should generally
  21 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
  22 /// always yields one codepoint at a time.
  23 ///
  24 /// Since graphemes are made up of one or more codepoints, this iterator yields
  25 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
  26 /// are [substituted](index.html#handling-of-invalid-utf-8).
  27 ///
  28 /// This iterator can be used in reverse. When reversed, exactly the same
  29 /// set of grapheme clusters are yielded, but in reverse order.
  30 ///
  31 /// This iterator only yields *extended* grapheme clusters, in accordance with
  32 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
  33 #[derive(Clone, Debug)]
  34 pub struct Graphemes<'a> {
  35     bs: &'a [u8],
  36 }
  37
  38 impl<'a> Graphemes<'a> {
  39     pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
  40         Graphemes { bs }
  41     }
  42
  43     /// View the underlying data as a subslice of the original data.
  44     ///
  45     /// The slice returned has the same lifetime as the original slice, and so
  46     /// the iterator can continue to be used while this exists.
  47     ///
  48     /// # Examples
  49     ///
  50     /// ```
  51     /// use bstr::ByteSlice;
  52     ///
  53     /// let mut it = b"abc".graphemes();
  54     ///
  55     /// assert_eq!(b"abc", it.as_bytes());
  56     /// it.next();
  57     /// assert_eq!(b"bc", it.as_bytes());
  58     /// it.next();
  59     /// it.next();
  60     /// assert_eq!(b"", it.as_bytes());
  61     /// ```
  62     #[inline]
  63     pub fn as_bytes(&self) -> &'a [u8] {
  64         self.bs
  65     }
  66 }
  67
  68 impl<'a> Iterator for Graphemes<'a> {
  69     type Item = &'a str;
  70
  71     #[inline]
  72     fn next(&mut self) -> Option<&'a str> {
  73         let (grapheme, size) = decode_grapheme(self.bs);
  74         if size == 0 {
  75             return None;
  76         }
  77         self.bs = &self.bs[size..];
  78         Some(grapheme)
  79     }
  80 }
  81
  82 impl<'a> DoubleEndedIterator for Graphemes<'a> {
  83     #[inline]
  84     fn next_back(&mut self) -> Option<&'a str> {
  85         let (grapheme, size) = decode_last_grapheme(self.bs);
  86         if size == 0 {
  87             return None;
  88         }
  89         self.bs = &self.bs[..self.bs.len() - size];
  90         Some(grapheme)
  91     }
  92 }
  93
  94 /// An iterator over grapheme clusters in a byte string and their byte index
  95 /// positions.
  96 ///
  97 /// This iterator is typically constructed by
  98 /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
  99 ///
 100 /// Unicode defines a grapheme cluster as an *approximation* to a single user
 101 /// visible character. A grapheme cluster, or just "grapheme," is made up of
 102 /// one or more codepoints. For end user oriented tasks, one should generally
 103 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
 104 /// always yields one codepoint at a time.
 105 ///
 106 /// Since graphemes are made up of one or more codepoints, this iterator
 107 /// yields `&str` elements (along with their start and end byte offsets).
 108 /// When invalid UTF-8 is encountered, replacement codepoints are
 109 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
 110 /// indices yielded by this iterator may not correspond to the length of the
 111 /// grapheme cluster yielded with those indices. For example, when this
 112 /// iterator encounters `\xFF` in the byte string, then it will yield a pair
 113 /// of indices ranging over a single byte, but will provide an `&str`
 114 /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
 115 /// given only valid UTF-8, then all indices are in exact correspondence with
 116 /// their paired grapheme cluster.
 117 ///
 118 /// This iterator can be used in reverse. When reversed, exactly the same
 119 /// set of grapheme clusters are yielded, but in reverse order.
 120 ///
 121 /// This iterator only yields *extended* grapheme clusters, in accordance with
 122 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
 123 #[derive(Clone, Debug)]
 124 pub struct GraphemeIndices<'a> {
 125     bs: &'a [u8],
 126     forward_index: usize,
 127     reverse_index: usize,
 128 }
 129
 130 impl<'a> GraphemeIndices<'a> {
 131     pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
 132         GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
 133     }
 134
 135     /// View the underlying data as a subslice of the original data.
 136     ///
 137     /// The slice returned has the same lifetime as the original slice, and so
 138     /// the iterator can continue to be used while this exists.
 139     ///
 140     /// # Examples
 141     ///
 142     /// ```
 143     /// use bstr::ByteSlice;
 144     ///
 145     /// let mut it = b"abc".grapheme_indices();
 146     ///
 147     /// assert_eq!(b"abc", it.as_bytes());
 148     /// it.next();
 149     /// assert_eq!(b"bc", it.as_bytes());
 150     /// it.next();
 151     /// it.next();
 152     /// assert_eq!(b"", it.as_bytes());
 153     /// ```
 154     #[inline]
 155     pub fn as_bytes(&self) -> &'a [u8] {
 156         self.bs
 157     }
 158 }
 159
 160 impl<'a> Iterator for GraphemeIndices<'a> {
 161     type Item = (usize, usize, &'a str);
 162
 163     #[inline]
 164     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
 165         let index = self.forward_index;
 166         let (grapheme, size) = decode_grapheme(self.bs);
 167         if size == 0 {
 168             return None;
 169         }
 170         self.bs = &self.bs[size..];
 171         self.forward_index += size;
 172         Some((index, index + size, grapheme))
 173     }
 174 }
 175
 176 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
 177     #[inline]
 178     fn next_back(&mut self) -> Option<(usize, usize, &'a str)> {
 179         let (grapheme, size) = decode_last_grapheme(self.bs);
 180         if size == 0 {
 181             return None;
 182         }
 183         self.bs = &self.bs[..self.bs.len() - size];
 184         self.reverse_index -= size;
 185         Some((self.reverse_index, self.reverse_index + size, grapheme))
 186     }
 187 }
 188
 189 /// Decode a grapheme from the given byte string.
 190 ///
 191 /// This returns the resulting grapheme (which may be a Unicode replacement
 192 /// codepoint if invalid UTF-8 was found), along with the number of bytes
 193 /// decoded in the byte string. The number of bytes decoded may not be the
 194 /// same as the length of grapheme in the case where invalid UTF-8 is found.
 195 pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
 196     if bs.is_empty() {
 197         ("", 0)
 198     } else if bs.len() >= 2
 199         && bs[0].is_ascii()
 200         && bs[1].is_ascii()
 201         && !bs[0].is_ascii_whitespace()
 202     {
 203         // FIXME: It is somewhat sad that we have to special case this, but it
 204         // leads to a significant speed up in predominantly ASCII text. The
 205         // issue here is that the DFA has a bit of overhead, and running it for
 206         // every byte in mostly ASCII text results in a bit slowdown. We should
 207         // re-litigate this once regex-automata 0.3 is out, but it might be
 208         // hard to avoid the special case. A DFA is always going to at least
 209         // require some memory access.
 210
 211         // Safe because all ASCII bytes are valid UTF-8.
 212         let grapheme = unsafe { bs[..1].to_str_unchecked() };
 213         (grapheme, 1)
 214     } else if let Some(hm) = {
 215         let input = Input::new(bs).anchored(Anchored::Yes);
 216         GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap()
 217     } {
 218         // Safe because a match can only occur for valid UTF-8.
 219         let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() };
 220         (grapheme, grapheme.len())
 221     } else {
 222         const INVALID: &'static str = "\u{FFFD}";
 223         // No match on non-empty bytes implies we found invalid UTF-8.
 224         let (_, size) = utf8::decode_lossy(bs);
 225         (INVALID, size)
 226     }
 227 }
 228
 229 fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) {
 230     if bs.is_empty() {
 231         ("", 0)
 232     } else if let Some(hm) = {
 233         let input = Input::new(bs).anchored(Anchored::Yes);
 234         GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap()
 235     } {
 236         let start = adjust_rev_for_regional_indicator(bs, hm.offset());
 237         // Safe because a match can only occur for valid UTF-8.
 238         let grapheme = unsafe { bs[start..].to_str_unchecked() };
 239         (grapheme, grapheme.len())
 240     } else {
 241         const INVALID: &'static str = "\u{FFFD}";
 242         // No match on non-empty bytes implies we found invalid UTF-8.
 243         let (_, size) = utf8::decode_last_lossy(bs);
 244         (INVALID, size)
 245     }
 246 }
 247
 248 /// Return the correct offset for the next grapheme decoded at the end of the
 249 /// given byte string, where `i` is the initial guess. In particular,
 250 /// `&bs[i..]` represents the candidate grapheme.
 251 ///
 252 /// `i` is returned by this function in all cases except when `&bs[i..]` is
 253 /// a pair of regional indicator codepoints. In that case, if an odd number of
 254 /// additional regional indicator codepoints precedes `i`, then `i` is
 255 /// adjusted such that it points to only a single regional indicator.
 256 ///
 257 /// This "fixing" is necessary to handle the requirement that a break cannot
 258 /// occur between regional indicators where it would cause an odd number of
 259 /// regional indicators to exist before the break from the *start* of the
 260 /// string. A reverse regex cannot detect this case easily without look-around.
 261 fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
 262     // All regional indicators use a 4 byte encoding, and we only care about
 263     // the case where we found a pair of regional indicators.
 264     if bs.len() - i != 8 {
 265         return i;
 266     }
 267     // Count all contiguous occurrences of regional indicators. If there's an
 268     // even number of them, then we can accept the pair we found. Otherwise,
 269     // we can only take one of them.
 270     //
 271     // FIXME: This is quadratic in the worst case, e.g., a string of just
 272     // regional indicator codepoints. A fix probably requires refactoring this
 273     // code a bit such that we don't rescan regional indicators.
 274     let mut count = 0;
 275     while let Some(hm) = {
 276         let input = Input::new(bs).anchored(Anchored::Yes);
 277         REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap()
 278     } {
 279         bs = &bs[..hm.offset()];
 280         count += 1;
 281     }
 282     if count % 2 == 0 {
 283         i
 284     } else {
 285         i + 4
 286     }
 287 }
 288
 289 #[cfg(all(test, feature = "std"))]
 290 mod tests {
 291     #[cfg(not(miri))]
 292     use ucd_parse::GraphemeClusterBreakTest;
 293
 294     use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
 295
 296     use super::*;
 297
 298     #[test]
 299     #[cfg(not(miri))]
 300     fn forward_ucd() {
 301         for (i, test) in ucdtests().into_iter().enumerate() {
 302             let given = test.grapheme_clusters.concat();
 303             let got: Vec<String> = Graphemes::new(given.as_bytes())
 304                 .map(|cluster| cluster.to_string())
 305                 .collect();
 306             assert_eq!(
 307                 test.grapheme_clusters,
 308                 got,
 309                 "\ngrapheme forward break test {} failed:\n\
 310                  given:    {:?}\n\
 311                  expected: {:?}\n\
 312                  got:      {:?}\n",
 313                 i,
 314                 uniescape(&given),
 315                 uniescape_vec(&test.grapheme_clusters),
 316                 uniescape_vec(&got),
 317             );
 318         }
 319     }
 320
 321     #[test]
 322     #[cfg(not(miri))]
 323     fn reverse_ucd() {
 324         for (i, test) in ucdtests().into_iter().enumerate() {
 325             let given = test.grapheme_clusters.concat();
 326             let mut got: Vec<String> = Graphemes::new(given.as_bytes())
 327                 .rev()
 328                 .map(|cluster| cluster.to_string())
 329                 .collect();
 330             got.reverse();
 331             assert_eq!(
 332                 test.grapheme_clusters,
 333                 got,
 334                 "\n\ngrapheme reverse break test {} failed:\n\
 335                  given:    {:?}\n\
 336                  expected: {:?}\n\
 337                  got:      {:?}\n",
 338                 i,
 339                 uniescape(&given),
 340                 uniescape_vec(&test.grapheme_clusters),
 341                 uniescape_vec(&got),
 342             );
 343         }
 344     }
 345
 346     #[test]
 347     fn forward_lossy() {
 348         for &(expected, input) in LOSSY_TESTS {
 349             let got = Graphemes::new(input.as_bytes()).collect::<String>();
 350             assert_eq!(expected, got);
 351         }
 352     }
 353
 354     #[test]
 355     fn reverse_lossy() {
 356         for &(expected, input) in LOSSY_TESTS {
 357             let expected: String = expected.chars().rev().collect();
 358             let got =
 359                 Graphemes::new(input.as_bytes()).rev().collect::<String>();
 360             assert_eq!(expected, got);
 361         }
 362     }
 363
 364     #[cfg(not(miri))]
 365     fn uniescape(s: &str) -> String {
 366         s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
 367     }
 368
 369     #[cfg(not(miri))]
 370     fn uniescape_vec(strs: &[String]) -> Vec<String> {
 371         strs.iter().map(|s| uniescape(s)).collect()
 372     }
 373
 374     /// Return all of the UCD for grapheme breaks.
 375     #[cfg(not(miri))]
 376     fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
 377         const TESTDATA: &'static str =
 378             include_str!("data/GraphemeBreakTest.txt");
 379
 380         let mut tests = vec![];
 381         for mut line in TESTDATA.lines() {
 382             line = line.trim();
 383             if line.starts_with("#") || line.contains("surrogate") {
 384                 continue;
 385             }
 386             tests.push(line.parse().unwrap());
 387         }
 388         tests
 389     }
 390 }