1 use regex_automata
::DFA
;
3 use crate::ext_slice
::ByteSlice
;
4 use crate::unicode
::fsm
::grapheme_break_fwd
::GRAPHEME_BREAK_FWD
;
5 use crate::unicode
::fsm
::grapheme_break_rev
::GRAPHEME_BREAK_REV
;
6 use crate::unicode
::fsm
::regional_indicator_rev
::REGIONAL_INDICATOR_REV
;
9 /// An iterator over grapheme clusters in a byte string.
11 /// This iterator is typically constructed by
12 /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
14 /// Unicode defines a grapheme cluster as an *approximation* to a single user
15 /// visible character. A grapheme cluster, or just "grapheme," is made up of
16 /// one or more codepoints. For end user oriented tasks, one should generally
17 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
18 /// always yields one codepoint at a time.
20 /// Since graphemes are made up of one or more codepoints, this iterator yields
21 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
22 /// are [substituted](index.html#handling-of-invalid-utf-8).
24 /// This iterator can be used in reverse. When reversed, exactly the same
25 /// set of grapheme clusters are yielded, but in reverse order.
27 /// This iterator only yields *extended* grapheme clusters, in accordance with
28 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
29 #[derive(Clone, Debug)]
30 pub struct Graphemes
<'a
> {
34 impl<'a
> Graphemes
<'a
> {
35 pub(crate) fn new(bs
: &'a
[u8]) -> Graphemes
<'a
> {
39 /// View the underlying data as a subslice of the original data.
41 /// The slice returned has the same lifetime as the original slice, and so
42 /// the iterator can continue to be used while this exists.
47 /// use bstr::ByteSlice;
49 /// let mut it = b"abc".graphemes();
51 /// assert_eq!(b"abc", it.as_bytes());
53 /// assert_eq!(b"bc", it.as_bytes());
56 /// assert_eq!(b"", it.as_bytes());
59 pub fn as_bytes(&self) -> &'a
[u8] {
64 impl<'a
> Iterator
for Graphemes
<'a
> {
68 fn next(&mut self) -> Option
<&'a
str> {
69 let (grapheme
, size
) = decode_grapheme(self.bs
);
73 self.bs
= &self.bs
[size
..];
78 impl<'a
> DoubleEndedIterator
for Graphemes
<'a
> {
80 fn next_back(&mut self) -> Option
<&'a
str> {
81 let (grapheme
, size
) = decode_last_grapheme(self.bs
);
85 self.bs
= &self.bs
[..self.bs
.len() - size
];
90 /// An iterator over grapheme clusters in a byte string and their byte index
93 /// This iterator is typically constructed by
94 /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
96 /// Unicode defines a grapheme cluster as an *approximation* to a single user
97 /// visible character. A grapheme cluster, or just "grapheme," is made up of
98 /// one or more codepoints. For end user oriented tasks, one should generally
99 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
100 /// always yields one codepoint at a time.
102 /// Since graphemes are made up of one or more codepoints, this iterator
103 /// yields `&str` elements (along with their start and end byte offsets).
104 /// When invalid UTF-8 is encountered, replacement codepoints are
105 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
106 /// indices yielded by this iterator may not correspond to the length of the
107 /// grapheme cluster yielded with those indices. For example, when this
108 /// iterator encounters `\xFF` in the byte string, then it will yield a pair
109 /// of indices ranging over a single byte, but will provide an `&str`
110 /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
111 /// given only valid UTF-8, then all indices are in exact correspondence with
112 /// their paired grapheme cluster.
114 /// This iterator can be used in reverse. When reversed, exactly the same
115 /// set of grapheme clusters are yielded, but in reverse order.
117 /// This iterator only yields *extended* grapheme clusters, in accordance with
118 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
119 #[derive(Clone, Debug)]
120 pub struct GraphemeIndices
<'a
> {
122 forward_index
: usize,
123 reverse_index
: usize,
126 impl<'a
> GraphemeIndices
<'a
> {
127 pub(crate) fn new(bs
: &'a
[u8]) -> GraphemeIndices
<'a
> {
128 GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
131 /// View the underlying data as a subslice of the original data.
133 /// The slice returned has the same lifetime as the original slice, and so
134 /// the iterator can continue to be used while this exists.
139 /// use bstr::ByteSlice;
141 /// let mut it = b"abc".grapheme_indices();
143 /// assert_eq!(b"abc", it.as_bytes());
145 /// assert_eq!(b"bc", it.as_bytes());
148 /// assert_eq!(b"", it.as_bytes());
151 pub fn as_bytes(&self) -> &'a
[u8] {
156 impl<'a
> Iterator
for GraphemeIndices
<'a
> {
157 type Item
= (usize, usize, &'a
str);
160 fn next(&mut self) -> Option
<(usize, usize, &'a
str)> {
161 let index
= self.forward_index
;
162 let (grapheme
, size
) = decode_grapheme(self.bs
);
166 self.bs
= &self.bs
[size
..];
167 self.forward_index
+= size
;
168 Some((index
, index
+ size
, grapheme
))
172 impl<'a
> DoubleEndedIterator
for GraphemeIndices
<'a
> {
174 fn next_back(&mut self) -> Option
<(usize, usize, &'a
str)> {
175 let (grapheme
, size
) = decode_last_grapheme(self.bs
);
179 self.bs
= &self.bs
[..self.bs
.len() - size
];
180 self.reverse_index
-= size
;
181 Some((self.reverse_index
, self.reverse_index
+ size
, grapheme
))
185 /// Decode a grapheme from the given byte string.
187 /// This returns the resulting grapheme (which may be a Unicode replacement
188 /// codepoint if invalid UTF-8 was found), along with the number of bytes
189 /// decoded in the byte string. The number of bytes decoded may not be the
190 /// same as the length of grapheme in the case where invalid UTF-8 is found.
191 pub fn decode_grapheme(bs
: &[u8]) -> (&str, usize) {
194 } else if let Some(end
) = GRAPHEME_BREAK_FWD
.find(bs
) {
195 // Safe because a match can only occur for valid UTF-8.
196 let grapheme
= unsafe { bs[..end].to_str_unchecked() }
;
197 (grapheme
, grapheme
.len())
199 const INVALID
: &'
static str = "\u{FFFD}";
200 // No match on non-empty bytes implies we found invalid UTF-8.
201 let (_
, size
) = utf8
::decode_lossy(bs
);
206 fn decode_last_grapheme(bs
: &[u8]) -> (&str, usize) {
209 } else if let Some(mut start
) = GRAPHEME_BREAK_REV
.rfind(bs
) {
210 start
= adjust_rev_for_regional_indicator(bs
, start
);
211 // Safe because a match can only occur for valid UTF-8.
212 let grapheme
= unsafe { bs[start..].to_str_unchecked() }
;
213 (grapheme
, grapheme
.len())
215 const INVALID
: &'
static str = "\u{FFFD}";
216 // No match on non-empty bytes implies we found invalid UTF-8.
217 let (_
, size
) = utf8
::decode_last_lossy(bs
);
222 /// Return the correct offset for the next grapheme decoded at the end of the
223 /// given byte string, where `i` is the initial guess. In particular,
224 /// `&bs[i..]` represents the candidate grapheme.
226 /// `i` is returned by this function in all cases except when `&bs[i..]` is
227 /// a pair of regional indicator codepoints. In that case, if an odd number of
228 /// additional regional indicator codepoints precedes `i`, then `i` is
229 /// adjusted such that it points to only a single regional indicator.
231 /// This "fixing" is necessary to handle the requirement that a break cannot
232 /// occur between regional indicators where it would cause an odd number of
233 /// regional indicators to exist before the break from the *start* of the
234 /// string. A reverse regex cannot detect this case easily without look-around.
235 fn adjust_rev_for_regional_indicator(mut bs
: &[u8], i
: usize) -> usize {
236 // All regional indicators use a 4 byte encoding, and we only care about
237 // the case where we found a pair of regional indicators.
238 if bs
.len() - i
!= 8 {
241 // Count all contiguous occurrences of regional indicators. If there's an
242 // even number of them, then we can accept the pair we found. Otherwise,
243 // we can only take one of them.
245 // FIXME: This is quadratic in the worst case, e.g., a string of just
246 // regional indicator codepoints. A fix probably requires refactoring this
247 // code a bit such that we don't rescan regional indicators.
249 while let Some(start
) = REGIONAL_INDICATOR_REV
.rfind(bs
) {
262 use ucd_parse
::GraphemeClusterBreakTest
;
265 use crate::ext_slice
::ByteSlice
;
266 use crate::tests
::LOSSY_TESTS
;
270 for (i
, test
) in ucdtests().into_iter().enumerate() {
271 let given
= test
.grapheme_clusters
.concat();
272 let got
: Vec
<String
> = Graphemes
::new(given
.as_bytes())
273 .map(|cluster
| cluster
.to_string())
276 test
.grapheme_clusters
,
278 "\ngrapheme forward break test {} failed:\n\
284 uniescape_vec(&test
.grapheme_clusters
),
292 for (i
, test
) in ucdtests().into_iter().enumerate() {
293 let given
= test
.grapheme_clusters
.concat();
294 let mut got
: Vec
<String
> = Graphemes
::new(given
.as_bytes())
296 .map(|cluster
| cluster
.to_string())
300 test
.grapheme_clusters
,
302 "\n\ngrapheme reverse break test {} failed:\n\
308 uniescape_vec(&test
.grapheme_clusters
),
316 for &(expected
, input
) in LOSSY_TESTS
{
317 let got
= Graphemes
::new(input
.as_bytes()).collect
::<String
>();
318 assert_eq
!(expected
, got
);
324 for &(expected
, input
) in LOSSY_TESTS
{
325 let expected
: String
= expected
.chars().rev().collect();
327 Graphemes
::new(input
.as_bytes()).rev().collect
::<String
>();
328 assert_eq
!(expected
, got
);
332 fn uniescape(s
: &str) -> String
{
333 s
.chars().flat_map(|c
| c
.escape_unicode()).collect
::<String
>()
336 fn uniescape_vec(strs
: &[String
]) -> Vec
<String
> {
337 strs
.iter().map(|s
| uniescape(s
)).collect()
340 /// Return all of the UCD for grapheme breaks.
341 fn ucdtests() -> Vec
<GraphemeClusterBreakTest
> {
342 const TESTDATA
: &'
static str =
343 include_str
!("data/GraphemeBreakTest.txt");
345 let mut tests
= vec
![];
346 for mut line
in TESTDATA
.lines() {
348 if line
.starts_with("#") || line
.contains("surrogate") {
351 tests
.push(line
.parse().unwrap());