]> git.proxmox.com Git - rustc.git/blob - vendor/bstr-0.2.17/src/unicode/grapheme.rs
New upstream version 1.67.1+dfsg1
[rustc.git] / vendor / bstr-0.2.17 / src / unicode / grapheme.rs
1 use regex_automata::DFA;
2
3 use crate::ext_slice::ByteSlice;
4 use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
5 use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
6 use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
7 use crate::utf8;
8
9 /// An iterator over grapheme clusters in a byte string.
10 ///
11 /// This iterator is typically constructed by
12 /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
13 ///
14 /// Unicode defines a grapheme cluster as an *approximation* to a single user
15 /// visible character. A grapheme cluster, or just "grapheme," is made up of
16 /// one or more codepoints. For end user oriented tasks, one should generally
17 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
18 /// always yields one codepoint at a time.
19 ///
20 /// Since graphemes are made up of one or more codepoints, this iterator yields
21 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
22 /// are [substituted](index.html#handling-of-invalid-utf-8).
23 ///
24 /// This iterator can be used in reverse. When reversed, exactly the same
25 /// set of grapheme clusters are yielded, but in reverse order.
26 ///
27 /// This iterator only yields *extended* grapheme clusters, in accordance with
28 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
29 #[derive(Clone, Debug)]
30 pub struct Graphemes<'a> {
31 bs: &'a [u8],
32 }
33
34 impl<'a> Graphemes<'a> {
35 pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
36 Graphemes { bs }
37 }
38
39 /// View the underlying data as a subslice of the original data.
40 ///
41 /// The slice returned has the same lifetime as the original slice, and so
42 /// the iterator can continue to be used while this exists.
43 ///
44 /// # Examples
45 ///
46 /// ```
47 /// use bstr::ByteSlice;
48 ///
49 /// let mut it = b"abc".graphemes();
50 ///
51 /// assert_eq!(b"abc", it.as_bytes());
52 /// it.next();
53 /// assert_eq!(b"bc", it.as_bytes());
54 /// it.next();
55 /// it.next();
56 /// assert_eq!(b"", it.as_bytes());
57 /// ```
58 #[inline]
59 pub fn as_bytes(&self) -> &'a [u8] {
60 self.bs
61 }
62 }
63
64 impl<'a> Iterator for Graphemes<'a> {
65 type Item = &'a str;
66
67 #[inline]
68 fn next(&mut self) -> Option<&'a str> {
69 let (grapheme, size) = decode_grapheme(self.bs);
70 if size == 0 {
71 return None;
72 }
73 self.bs = &self.bs[size..];
74 Some(grapheme)
75 }
76 }
77
78 impl<'a> DoubleEndedIterator for Graphemes<'a> {
79 #[inline]
80 fn next_back(&mut self) -> Option<&'a str> {
81 let (grapheme, size) = decode_last_grapheme(self.bs);
82 if size == 0 {
83 return None;
84 }
85 self.bs = &self.bs[..self.bs.len() - size];
86 Some(grapheme)
87 }
88 }
89
90 /// An iterator over grapheme clusters in a byte string and their byte index
91 /// positions.
92 ///
93 /// This iterator is typically constructed by
94 /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
95 ///
96 /// Unicode defines a grapheme cluster as an *approximation* to a single user
97 /// visible character. A grapheme cluster, or just "grapheme," is made up of
98 /// one or more codepoints. For end user oriented tasks, one should generally
99 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
100 /// always yields one codepoint at a time.
101 ///
102 /// Since graphemes are made up of one or more codepoints, this iterator
103 /// yields `&str` elements (along with their start and end byte offsets).
104 /// When invalid UTF-8 is encountered, replacement codepoints are
105 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
106 /// indices yielded by this iterator may not correspond to the length of the
107 /// grapheme cluster yielded with those indices. For example, when this
108 /// iterator encounters `\xFF` in the byte string, then it will yield a pair
109 /// of indices ranging over a single byte, but will provide an `&str`
110 /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
111 /// given only valid UTF-8, then all indices are in exact correspondence with
112 /// their paired grapheme cluster.
113 ///
114 /// This iterator can be used in reverse. When reversed, exactly the same
115 /// set of grapheme clusters are yielded, but in reverse order.
116 ///
117 /// This iterator only yields *extended* grapheme clusters, in accordance with
118 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
119 #[derive(Clone, Debug)]
120 pub struct GraphemeIndices<'a> {
121 bs: &'a [u8],
122 forward_index: usize,
123 reverse_index: usize,
124 }
125
126 impl<'a> GraphemeIndices<'a> {
127 pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
128 GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
129 }
130
131 /// View the underlying data as a subslice of the original data.
132 ///
133 /// The slice returned has the same lifetime as the original slice, and so
134 /// the iterator can continue to be used while this exists.
135 ///
136 /// # Examples
137 ///
138 /// ```
139 /// use bstr::ByteSlice;
140 ///
141 /// let mut it = b"abc".grapheme_indices();
142 ///
143 /// assert_eq!(b"abc", it.as_bytes());
144 /// it.next();
145 /// assert_eq!(b"bc", it.as_bytes());
146 /// it.next();
147 /// it.next();
148 /// assert_eq!(b"", it.as_bytes());
149 /// ```
150 #[inline]
151 pub fn as_bytes(&self) -> &'a [u8] {
152 self.bs
153 }
154 }
155
156 impl<'a> Iterator for GraphemeIndices<'a> {
157 type Item = (usize, usize, &'a str);
158
159 #[inline]
160 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
161 let index = self.forward_index;
162 let (grapheme, size) = decode_grapheme(self.bs);
163 if size == 0 {
164 return None;
165 }
166 self.bs = &self.bs[size..];
167 self.forward_index += size;
168 Some((index, index + size, grapheme))
169 }
170 }
171
172 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
173 #[inline]
174 fn next_back(&mut self) -> Option<(usize, usize, &'a str)> {
175 let (grapheme, size) = decode_last_grapheme(self.bs);
176 if size == 0 {
177 return None;
178 }
179 self.bs = &self.bs[..self.bs.len() - size];
180 self.reverse_index -= size;
181 Some((self.reverse_index, self.reverse_index + size, grapheme))
182 }
183 }
184
185 /// Decode a grapheme from the given byte string.
186 ///
187 /// This returns the resulting grapheme (which may be a Unicode replacement
188 /// codepoint if invalid UTF-8 was found), along with the number of bytes
189 /// decoded in the byte string. The number of bytes decoded may not be the
190 /// same as the length of grapheme in the case where invalid UTF-8 is found.
191 pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
192 if bs.is_empty() {
193 ("", 0)
194 } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
195 // Safe because a match can only occur for valid UTF-8.
196 let grapheme = unsafe { bs[..end].to_str_unchecked() };
197 (grapheme, grapheme.len())
198 } else {
199 const INVALID: &'static str = "\u{FFFD}";
200 // No match on non-empty bytes implies we found invalid UTF-8.
201 let (_, size) = utf8::decode_lossy(bs);
202 (INVALID, size)
203 }
204 }
205
206 fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) {
207 if bs.is_empty() {
208 ("", 0)
209 } else if let Some(mut start) = GRAPHEME_BREAK_REV.rfind(bs) {
210 start = adjust_rev_for_regional_indicator(bs, start);
211 // Safe because a match can only occur for valid UTF-8.
212 let grapheme = unsafe { bs[start..].to_str_unchecked() };
213 (grapheme, grapheme.len())
214 } else {
215 const INVALID: &'static str = "\u{FFFD}";
216 // No match on non-empty bytes implies we found invalid UTF-8.
217 let (_, size) = utf8::decode_last_lossy(bs);
218 (INVALID, size)
219 }
220 }
221
222 /// Return the correct offset for the next grapheme decoded at the end of the
223 /// given byte string, where `i` is the initial guess. In particular,
224 /// `&bs[i..]` represents the candidate grapheme.
225 ///
226 /// `i` is returned by this function in all cases except when `&bs[i..]` is
227 /// a pair of regional indicator codepoints. In that case, if an odd number of
228 /// additional regional indicator codepoints precedes `i`, then `i` is
229 /// adjusted such that it points to only a single regional indicator.
230 ///
231 /// This "fixing" is necessary to handle the requirement that a break cannot
232 /// occur between regional indicators where it would cause an odd number of
233 /// regional indicators to exist before the break from the *start* of the
234 /// string. A reverse regex cannot detect this case easily without look-around.
235 fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
236 // All regional indicators use a 4 byte encoding, and we only care about
237 // the case where we found a pair of regional indicators.
238 if bs.len() - i != 8 {
239 return i;
240 }
241 // Count all contiguous occurrences of regional indicators. If there's an
242 // even number of them, then we can accept the pair we found. Otherwise,
243 // we can only take one of them.
244 //
245 // FIXME: This is quadratic in the worst case, e.g., a string of just
246 // regional indicator codepoints. A fix probably requires refactoring this
247 // code a bit such that we don't rescan regional indicators.
248 let mut count = 0;
249 while let Some(start) = REGIONAL_INDICATOR_REV.rfind(bs) {
250 bs = &bs[..start];
251 count += 1;
252 }
253 if count % 2 == 0 {
254 i
255 } else {
256 i + 4
257 }
258 }
259
260 #[cfg(test)]
261 mod tests {
262 use ucd_parse::GraphemeClusterBreakTest;
263
264 use super::*;
265 use crate::ext_slice::ByteSlice;
266 use crate::tests::LOSSY_TESTS;
267
268 #[test]
269 fn forward_ucd() {
270 for (i, test) in ucdtests().into_iter().enumerate() {
271 let given = test.grapheme_clusters.concat();
272 let got: Vec<String> = Graphemes::new(given.as_bytes())
273 .map(|cluster| cluster.to_string())
274 .collect();
275 assert_eq!(
276 test.grapheme_clusters,
277 got,
278 "\ngrapheme forward break test {} failed:\n\
279 given: {:?}\n\
280 expected: {:?}\n\
281 got: {:?}\n",
282 i,
283 uniescape(&given),
284 uniescape_vec(&test.grapheme_clusters),
285 uniescape_vec(&got),
286 );
287 }
288 }
289
290 #[test]
291 fn reverse_ucd() {
292 for (i, test) in ucdtests().into_iter().enumerate() {
293 let given = test.grapheme_clusters.concat();
294 let mut got: Vec<String> = Graphemes::new(given.as_bytes())
295 .rev()
296 .map(|cluster| cluster.to_string())
297 .collect();
298 got.reverse();
299 assert_eq!(
300 test.grapheme_clusters,
301 got,
302 "\n\ngrapheme reverse break test {} failed:\n\
303 given: {:?}\n\
304 expected: {:?}\n\
305 got: {:?}\n",
306 i,
307 uniescape(&given),
308 uniescape_vec(&test.grapheme_clusters),
309 uniescape_vec(&got),
310 );
311 }
312 }
313
314 #[test]
315 fn forward_lossy() {
316 for &(expected, input) in LOSSY_TESTS {
317 let got = Graphemes::new(input.as_bytes()).collect::<String>();
318 assert_eq!(expected, got);
319 }
320 }
321
322 #[test]
323 fn reverse_lossy() {
324 for &(expected, input) in LOSSY_TESTS {
325 let expected: String = expected.chars().rev().collect();
326 let got =
327 Graphemes::new(input.as_bytes()).rev().collect::<String>();
328 assert_eq!(expected, got);
329 }
330 }
331
332 fn uniescape(s: &str) -> String {
333 s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
334 }
335
336 fn uniescape_vec(strs: &[String]) -> Vec<String> {
337 strs.iter().map(|s| uniescape(s)).collect()
338 }
339
340 /// Return all of the UCD for grapheme breaks.
341 fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
342 const TESTDATA: &'static str =
343 include_str!("data/GraphemeBreakTest.txt");
344
345 let mut tests = vec![];
346 for mut line in TESTDATA.lines() {
347 line = line.trim();
348 if line.starts_with("#") || line.contains("surrogate") {
349 continue;
350 }
351 tests.push(line.parse().unwrap());
352 }
353 tests
354 }
355 }