1 use regex_automata
::{dfa::Automaton, Anchored, Input}
;
6 grapheme_break_fwd
::GRAPHEME_BREAK_FWD
,
7 grapheme_break_rev
::GRAPHEME_BREAK_REV
,
8 regional_indicator_rev
::REGIONAL_INDICATOR_REV
,
13 /// An iterator over grapheme clusters in a byte string.
15 /// This iterator is typically constructed by
16 /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
18 /// Unicode defines a grapheme cluster as an *approximation* to a single user
19 /// visible character. A grapheme cluster, or just "grapheme," is made up of
20 /// one or more codepoints. For end user oriented tasks, one should generally
21 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
22 /// always yields one codepoint at a time.
24 /// Since graphemes are made up of one or more codepoints, this iterator yields
25 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
26 /// are [substituted](index.html#handling-of-invalid-utf-8).
28 /// This iterator can be used in reverse. When reversed, exactly the same
29 /// set of grapheme clusters are yielded, but in reverse order.
31 /// This iterator only yields *extended* grapheme clusters, in accordance with
32 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
33 #[derive(Clone, Debug)]
34 pub struct Graphemes
<'a
> {
38 impl<'a
> Graphemes
<'a
> {
39 pub(crate) fn new(bs
: &'a
[u8]) -> Graphemes
<'a
> {
43 /// View the underlying data as a subslice of the original data.
45 /// The slice returned has the same lifetime as the original slice, and so
46 /// the iterator can continue to be used while this exists.
51 /// use bstr::ByteSlice;
53 /// let mut it = b"abc".graphemes();
55 /// assert_eq!(b"abc", it.as_bytes());
57 /// assert_eq!(b"bc", it.as_bytes());
60 /// assert_eq!(b"", it.as_bytes());
63 pub fn as_bytes(&self) -> &'a
[u8] {
68 impl<'a
> Iterator
for Graphemes
<'a
> {
72 fn next(&mut self) -> Option
<&'a
str> {
73 let (grapheme
, size
) = decode_grapheme(self.bs
);
77 self.bs
= &self.bs
[size
..];
82 impl<'a
> DoubleEndedIterator
for Graphemes
<'a
> {
84 fn next_back(&mut self) -> Option
<&'a
str> {
85 let (grapheme
, size
) = decode_last_grapheme(self.bs
);
89 self.bs
= &self.bs
[..self.bs
.len() - size
];
94 /// An iterator over grapheme clusters in a byte string and their byte index
97 /// This iterator is typically constructed by
98 /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
100 /// Unicode defines a grapheme cluster as an *approximation* to a single user
101 /// visible character. A grapheme cluster, or just "grapheme," is made up of
102 /// one or more codepoints. For end user oriented tasks, one should generally
103 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
104 /// always yields one codepoint at a time.
106 /// Since graphemes are made up of one or more codepoints, this iterator
107 /// yields `&str` elements (along with their start and end byte offsets).
108 /// When invalid UTF-8 is encountered, replacement codepoints are
109 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
110 /// indices yielded by this iterator may not correspond to the length of the
111 /// grapheme cluster yielded with those indices. For example, when this
112 /// iterator encounters `\xFF` in the byte string, then it will yield a pair
113 /// of indices ranging over a single byte, but will provide an `&str`
114 /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
115 /// given only valid UTF-8, then all indices are in exact correspondence with
116 /// their paired grapheme cluster.
118 /// This iterator can be used in reverse. When reversed, exactly the same
119 /// set of grapheme clusters are yielded, but in reverse order.
121 /// This iterator only yields *extended* grapheme clusters, in accordance with
122 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
123 #[derive(Clone, Debug)]
124 pub struct GraphemeIndices
<'a
> {
126 forward_index
: usize,
127 reverse_index
: usize,
130 impl<'a
> GraphemeIndices
<'a
> {
131 pub(crate) fn new(bs
: &'a
[u8]) -> GraphemeIndices
<'a
> {
132 GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
135 /// View the underlying data as a subslice of the original data.
137 /// The slice returned has the same lifetime as the original slice, and so
138 /// the iterator can continue to be used while this exists.
143 /// use bstr::ByteSlice;
145 /// let mut it = b"abc".grapheme_indices();
147 /// assert_eq!(b"abc", it.as_bytes());
149 /// assert_eq!(b"bc", it.as_bytes());
152 /// assert_eq!(b"", it.as_bytes());
155 pub fn as_bytes(&self) -> &'a
[u8] {
160 impl<'a
> Iterator
for GraphemeIndices
<'a
> {
161 type Item
= (usize, usize, &'a
str);
164 fn next(&mut self) -> Option
<(usize, usize, &'a
str)> {
165 let index
= self.forward_index
;
166 let (grapheme
, size
) = decode_grapheme(self.bs
);
170 self.bs
= &self.bs
[size
..];
171 self.forward_index
+= size
;
172 Some((index
, index
+ size
, grapheme
))
176 impl<'a
> DoubleEndedIterator
for GraphemeIndices
<'a
> {
178 fn next_back(&mut self) -> Option
<(usize, usize, &'a
str)> {
179 let (grapheme
, size
) = decode_last_grapheme(self.bs
);
183 self.bs
= &self.bs
[..self.bs
.len() - size
];
184 self.reverse_index
-= size
;
185 Some((self.reverse_index
, self.reverse_index
+ size
, grapheme
))
189 /// Decode a grapheme from the given byte string.
191 /// This returns the resulting grapheme (which may be a Unicode replacement
192 /// codepoint if invalid UTF-8 was found), along with the number of bytes
193 /// decoded in the byte string. The number of bytes decoded may not be the
194 /// same as the length of grapheme in the case where invalid UTF-8 is found.
195 pub fn decode_grapheme(bs
: &[u8]) -> (&str, usize) {
198 } else if bs
.len() >= 2
201 && !bs
[0].is_ascii_whitespace()
203 // FIXME: It is somewhat sad that we have to special case this, but it
204 // leads to a significant speed up in predominantly ASCII text. The
205 // issue here is that the DFA has a bit of overhead, and running it for
206 // every byte in mostly ASCII text results in a bit slowdown. We should
207 // re-litigate this once regex-automata 0.3 is out, but it might be
208 // hard to avoid the special case. A DFA is always going to at least
209 // require some memory access.
211 // Safe because all ASCII bytes are valid UTF-8.
212 let grapheme
= unsafe { bs[..1].to_str_unchecked() }
;
214 } else if let Some(hm
) = {
215 let input
= Input
::new(bs
).anchored(Anchored
::Yes
);
216 GRAPHEME_BREAK_FWD
.try_search_fwd(&input
).unwrap()
218 // Safe because a match can only occur for valid UTF-8.
219 let grapheme
= unsafe { bs[..hm.offset()].to_str_unchecked() }
;
220 (grapheme
, grapheme
.len())
222 const INVALID
: &'
static str = "\u{FFFD}";
223 // No match on non-empty bytes implies we found invalid UTF-8.
224 let (_
, size
) = utf8
::decode_lossy(bs
);
229 fn decode_last_grapheme(bs
: &[u8]) -> (&str, usize) {
232 } else if let Some(hm
) = {
233 let input
= Input
::new(bs
).anchored(Anchored
::Yes
);
234 GRAPHEME_BREAK_REV
.try_search_rev(&input
).unwrap()
236 let start
= adjust_rev_for_regional_indicator(bs
, hm
.offset());
237 // Safe because a match can only occur for valid UTF-8.
238 let grapheme
= unsafe { bs[start..].to_str_unchecked() }
;
239 (grapheme
, grapheme
.len())
241 const INVALID
: &'
static str = "\u{FFFD}";
242 // No match on non-empty bytes implies we found invalid UTF-8.
243 let (_
, size
) = utf8
::decode_last_lossy(bs
);
248 /// Return the correct offset for the next grapheme decoded at the end of the
249 /// given byte string, where `i` is the initial guess. In particular,
250 /// `&bs[i..]` represents the candidate grapheme.
252 /// `i` is returned by this function in all cases except when `&bs[i..]` is
253 /// a pair of regional indicator codepoints. In that case, if an odd number of
254 /// additional regional indicator codepoints precedes `i`, then `i` is
255 /// adjusted such that it points to only a single regional indicator.
257 /// This "fixing" is necessary to handle the requirement that a break cannot
258 /// occur between regional indicators where it would cause an odd number of
259 /// regional indicators to exist before the break from the *start* of the
260 /// string. A reverse regex cannot detect this case easily without look-around.
261 fn adjust_rev_for_regional_indicator(mut bs
: &[u8], i
: usize) -> usize {
262 // All regional indicators use a 4 byte encoding, and we only care about
263 // the case where we found a pair of regional indicators.
264 if bs
.len() - i
!= 8 {
267 // Count all contiguous occurrences of regional indicators. If there's an
268 // even number of them, then we can accept the pair we found. Otherwise,
269 // we can only take one of them.
271 // FIXME: This is quadratic in the worst case, e.g., a string of just
272 // regional indicator codepoints. A fix probably requires refactoring this
273 // code a bit such that we don't rescan regional indicators.
275 while let Some(hm
) = {
276 let input
= Input
::new(bs
).anchored(Anchored
::Yes
);
277 REGIONAL_INDICATOR_REV
.try_search_rev(&input
).unwrap()
279 bs
= &bs
[..hm
.offset()];
289 #[cfg(all(test, feature = "std"))]
292 use ucd_parse
::GraphemeClusterBreakTest
;
294 use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS}
;
301 for (i
, test
) in ucdtests().into_iter().enumerate() {
302 let given
= test
.grapheme_clusters
.concat();
303 let got
: Vec
<String
> = Graphemes
::new(given
.as_bytes())
304 .map(|cluster
| cluster
.to_string())
307 test
.grapheme_clusters
,
309 "\ngrapheme forward break test {} failed:\n\
315 uniescape_vec(&test
.grapheme_clusters
),
324 for (i
, test
) in ucdtests().into_iter().enumerate() {
325 let given
= test
.grapheme_clusters
.concat();
326 let mut got
: Vec
<String
> = Graphemes
::new(given
.as_bytes())
328 .map(|cluster
| cluster
.to_string())
332 test
.grapheme_clusters
,
334 "\n\ngrapheme reverse break test {} failed:\n\
340 uniescape_vec(&test
.grapheme_clusters
),
348 for &(expected
, input
) in LOSSY_TESTS
{
349 let got
= Graphemes
::new(input
.as_bytes()).collect
::<String
>();
350 assert_eq
!(expected
, got
);
356 for &(expected
, input
) in LOSSY_TESTS
{
357 let expected
: String
= expected
.chars().rev().collect();
359 Graphemes
::new(input
.as_bytes()).rev().collect
::<String
>();
360 assert_eq
!(expected
, got
);
365 fn uniescape(s
: &str) -> String
{
366 s
.chars().flat_map(|c
| c
.escape_unicode()).collect
::<String
>()
370 fn uniescape_vec(strs
: &[String
]) -> Vec
<String
> {
371 strs
.iter().map(|s
| uniescape(s
)).collect()
374 /// Return all of the UCD for grapheme breaks.
376 fn ucdtests() -> Vec
<GraphemeClusterBreakTest
> {
377 const TESTDATA
: &'
static str =
378 include_str
!("data/GraphemeBreakTest.txt");
380 let mut tests
= vec
![];
381 for mut line
in TESTDATA
.lines() {
383 if line
.starts_with("#") || line
.contains("surrogate") {
386 tests
.push(line
.parse().unwrap());