5 #[cfg(feature = "std")]
10 use crate::ext_slice
::ByteSlice
;
12 // The UTF-8 decoder provided here is based on the one presented here:
13 // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
15 // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
16 // using regex-automata that is roughly the same size. The real benefit of
17 // Hoehrmann's formulation is that the byte class mapping below is manually
18 // tailored such that each byte's class doubles as a shift to mask out the
19 // bits necessary for constructing the leading bits of each codepoint value
20 // from the initial byte.
22 // There are some minor differences between this implementation and Hoehrmann's
25 // Firstly, we make REJECT have state ID 0, since it makes the state table
26 // itself a little easier to read and is consistent with the notion that 0
27 // means "false" or "bad."
29 // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
32 // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
33 // in the core decoding loop. (Which is what regex-automata would do by
36 // Fourthly, we split the byte class mapping and transition table into two
37 // arrays because it's clearer.
39 // It is unlikely that this is the fastest way to do UTF-8 decoding; however,
40 // it is fairly simple.
/// State ID of the accepting state of the UTF-8 automaton. Decoding and
/// validation both start in this state. State IDs are pre-multiplied by the
/// row width of the transition table (12 byte classes per row), so this is
/// the offset of the second row rather than a plain row number.
42 const ACCEPT
: usize = 12;
/// State ID of the rejecting (dead) state. It is deliberately 0 so that it
/// reads as "false"/"bad" and occupies the first row of the transition table.
43 const REJECT
: usize = 0;
45 /// SAFETY: The decode function below relies on the correctness of these
46 /// equivalence classes.
47 #[cfg_attr(rustfmt, rustfmt::skip)]
48 const CLASSES
: [u8; 256] = [
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
54 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
55 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
56 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
59 /// SAFETY: The decode function below relies on the correctness of this state
61 #[cfg_attr(rustfmt, rustfmt::skip)]
62 const STATES_FORWARD
: &'
static [u8] = &[
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
65 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
66 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
68 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
69 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
70 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
71 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74 /// An iterator over Unicode scalar values in a byte string.
76 /// When invalid UTF-8 byte sequences are found, they are substituted with the
77 /// Unicode replacement codepoint (`U+FFFD`) using the
78 /// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
80 /// This iterator is created by the
81 /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
82 /// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
83 #[derive(Clone, Debug)]
84 pub struct Chars
<'a
> {
89 pub(crate) fn new(bs
: &'a
[u8]) -> Chars
<'a
> {
93 /// View the underlying data as a subslice of the original data.
95 /// The slice returned has the same lifetime as the original slice, and so
96 /// the iterator can continue to be used while this exists.
101 /// use bstr::ByteSlice;
103 /// let mut chars = b"abc".chars();
105 /// assert_eq!(b"abc", chars.as_bytes());
107 /// assert_eq!(b"bc", chars.as_bytes());
110 /// assert_eq!(b"", chars.as_bytes());
113 pub fn as_bytes(&self) -> &'a
[u8] {
118 impl<'a
> Iterator
for Chars
<'a
> {
122 fn next(&mut self) -> Option
<char> {
123 let (ch
, size
) = decode_lossy(self.bs
);
127 self.bs
= &self.bs
[size
..];
132 impl<'a
> DoubleEndedIterator
for Chars
<'a
> {
134 fn next_back(&mut self) -> Option
<char> {
135 let (ch
, size
) = decode_last_lossy(self.bs
);
139 self.bs
= &self.bs
[..self.bs
.len() - size
];
144 /// An iterator over Unicode scalar values in a byte string and their
145 /// byte index positions.
147 /// When invalid UTF-8 byte sequences are found, they are substituted with the
148 /// Unicode replacement codepoint (`U+FFFD`) using the
149 /// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
151 /// Note that this is slightly different from the `CharIndices` iterator
152 /// provided by the standard library. Aside from working on possibly invalid
153 /// UTF-8, this iterator provides both the corresponding starting and ending
154 /// byte indices of each codepoint yielded. The ending position is necessary to
155 /// slice the original byte string when invalid UTF-8 bytes are converted into
156 /// a Unicode replacement codepoint, since a single replacement codepoint can
157 /// substitute anywhere from 1 to 3 invalid bytes (inclusive).
159 /// This iterator is created by the
160 /// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
161 /// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
162 #[derive(Clone, Debug)]
163 pub struct CharIndices
<'a
> {
165 forward_index
: usize,
166 reverse_index
: usize,
169 impl<'a
> CharIndices
<'a
> {
170 pub(crate) fn new(bs
: &'a
[u8]) -> CharIndices
<'a
> {
171 CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
174 /// View the underlying data as a subslice of the original data.
176 /// The slice returned has the same lifetime as the original slice, and so
177 /// the iterator can continue to be used while this exists.
182 /// use bstr::ByteSlice;
184 /// let mut it = b"abc".char_indices();
186 /// assert_eq!(b"abc", it.as_bytes());
188 /// assert_eq!(b"bc", it.as_bytes());
191 /// assert_eq!(b"", it.as_bytes());
194 pub fn as_bytes(&self) -> &'a
[u8] {
199 impl<'a
> Iterator
for CharIndices
<'a
> {
200 type Item
= (usize, usize, char);
203 fn next(&mut self) -> Option
<(usize, usize, char)> {
204 let index
= self.forward_index
;
205 let (ch
, size
) = decode_lossy(self.bs
);
209 self.bs
= &self.bs
[size
..];
210 self.forward_index
+= size
;
211 Some((index
, index
+ size
, ch
))
215 impl<'a
> DoubleEndedIterator
for CharIndices
<'a
> {
217 fn next_back(&mut self) -> Option
<(usize, usize, char)> {
218 let (ch
, size
) = decode_last_lossy(self.bs
);
222 self.bs
= &self.bs
[..self.bs
.len() - size
];
223 self.reverse_index
-= size
;
224 Some((self.reverse_index
, self.reverse_index
+ size
, ch
))
// Marker impl with no methods: it declares `CharIndices` to be a fused
// iterator, i.e. one that keeps yielding `None` once it has yielded `None`.
// NOTE(review): this relies on `next`/`next_back` returning `None` for an
// empty remainder -- confirm against their bodies.
228 impl<'a
> ::core
::iter
::FusedIterator
for CharIndices
<'a
> {}
230 /// An iterator over chunks of valid UTF-8 in a byte slice.
232 /// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
233 #[derive(Clone, Debug)]
234 pub struct Utf8Chunks
<'a
> {
235 pub(super) bytes
: &'a
[u8],
238 /// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
240 /// This is yielded by the
241 /// [`Utf8Chunks`](struct.Utf8Chunks.html)
242 /// iterator, which can be created via the
243 /// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
246 /// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
247 /// are being iterated over.
248 #[cfg_attr(test, derive(Debug, PartialEq))]
249 pub struct Utf8Chunk
<'a
> {
250 /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
252 /// This is empty between adjacent invalid UTF-8 byte sequences.
254 /// A sequence of invalid UTF-8 bytes.
256 /// Can only be empty in the last chunk.
258 /// Should be replaced by a single unicode replacement character, if not
261 /// Indicates whether the invalid sequence could've been valid if there
264 /// Can only be true in the last chunk.
268 impl<'a
> Utf8Chunk
<'a
> {
269 /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
271 /// This may be empty if there are consecutive sequences of invalid UTF-8
274 pub fn valid(&self) -> &'a
str {
278 /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
279 /// immediately follow the valid UTF-8 bytes in this chunk.
281 /// This is only empty when this chunk corresponds to the last chunk in
282 /// the original bytes.
284 /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
285 /// sequences longer than one byte always correspond to a valid _prefix_ of
286 /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
287 /// of maximal subparts" strategy that is described in more detail in the
289 /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
292 pub fn invalid(&self) -> &'a
[u8] {
293 self.invalid
.as_bytes()
296 /// Returns whether the invalid sequence might still become valid if more
299 /// Returns true if the end of the input was reached unexpectedly,
300 /// without encountering an unexpected byte.
302 /// This can only be the case for the last chunk.
304 pub fn incomplete(&self) -> bool
{
309 impl<'a
> Iterator
for Utf8Chunks
<'a
> {
310 type Item
= Utf8Chunk
<'a
>;
313 fn next(&mut self) -> Option
<Utf8Chunk
<'a
>> {
314 if self.bytes
.is_empty() {
317 match validate(self.bytes
) {
319 let valid
= self.bytes
;
322 // SAFETY: This is safe because of the guarantees provided
323 // by utf8::validate.
324 valid
: unsafe { str::from_utf8_unchecked(valid) }
,
325 invalid
: [].as_bstr(),
330 let (valid
, rest
) = self.bytes
.split_at(e
.valid_up_to());
331 // SAFETY: This is safe because of the guarantees provided by
333 let valid
= unsafe { str::from_utf8_unchecked(valid) }
;
334 let (invalid_len
, incomplete
) = match e
.error_len() {
335 Some(n
) => (n
, false),
336 None
=> (rest
.len(), true),
338 let (invalid
, rest
) = rest
.split_at(invalid_len
);
342 invalid
: invalid
.as_bstr(),
350 fn size_hint(&self) -> (usize, Option
<usize>) {
351 if self.bytes
.is_empty() {
354 (1, Some(self.bytes
.len()))
// Marker impl with no methods: it declares `Utf8Chunks` to be a fused
// iterator, i.e. one that keeps yielding `None` once it has yielded `None`.
// NOTE(review): this relies on `next` returning `None` whenever the
// remaining bytes are empty -- confirm against its body.
359 impl<'a
> ::core
::iter
::FusedIterator
for Utf8Chunks
<'a
> {}
361 /// An error that occurs when UTF-8 decoding fails.
363 /// This error occurs when attempting to convert a non-UTF-8 byte
364 /// string to a Rust string that must be valid UTF-8. For example,
365 /// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
369 /// This example shows what happens when a given byte sequence is invalid,
370 /// but ends with a sequence that is a possible prefix of valid UTF-8.
373 /// use bstr::{B, ByteSlice};
375 /// let s = B(b"foobar\xF1\x80\x80");
376 /// let err = s.to_str().unwrap_err();
377 /// assert_eq!(err.valid_up_to(), 6);
378 /// assert_eq!(err.error_len(), None);
381 /// This example shows what happens when a given byte sequence contains
385 /// use bstr::ByteSlice;
387 /// let s = b"foobar\xF1\x80\x80quux";
388 /// let err = s.to_str().unwrap_err();
389 /// assert_eq!(err.valid_up_to(), 6);
390 /// // The error length reports the maximum number of bytes that correspond to
391 /// // a valid prefix of a UTF-8 encoded codepoint.
392 /// assert_eq!(err.error_len(), Some(3));
394 /// // In contrast to the above which contains a single invalid prefix,
395 /// // consider the case of multiple individual bytes that are never valid
396 /// // prefixes. Note how the value of error_len changes!
397 /// let s = b"foobar\xFF\xFFquux";
398 /// let err = s.to_str().unwrap_err();
399 /// assert_eq!(err.valid_up_to(), 6);
400 /// assert_eq!(err.error_len(), Some(1));
402 /// // The fact that it's an invalid prefix does not change error_len even
403 /// // when it immediately precedes the end of the string.
404 /// let s = b"foobar\xFF";
405 /// let err = s.to_str().unwrap_err();
406 /// assert_eq!(err.valid_up_to(), 6);
407 /// assert_eq!(err.error_len(), Some(1));
409 #[derive(Debug, Eq, PartialEq)]
410 pub struct Utf8Error
{
412 error_len
: Option
<usize>,
416 /// Returns the byte index of the position immediately following the last
417 /// valid UTF-8 byte.
421 /// This example shows how `valid_up_to` can be used to retrieve a
422 /// possibly empty prefix that is guaranteed to be valid UTF-8:
425 /// use bstr::ByteSlice;
427 /// let s = b"foobar\xF1\x80\x80quux";
428 /// let err = s.to_str().unwrap_err();
430 /// // This is guaranteed to never panic.
431 /// let string = s[..err.valid_up_to()].to_str().unwrap();
432 /// assert_eq!(string, "foobar");
435 pub fn valid_up_to(&self) -> usize {
439 /// Returns the total number of invalid UTF-8 bytes immediately following
440 /// the position returned by `valid_up_to`. This value is always at least
441 /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
442 /// encoded codepoint.
444 /// If the end of the original input was found before a valid UTF-8 encoded
445 /// codepoint could be completed, then this returns `None`. This is useful
446 /// when processing streams, where a `None` value signals that more input
449 pub fn error_len(&self) -> Option
<usize> {
454 #[cfg(feature = "std")]
455 impl error
::Error
for Utf8Error
{
456 fn description(&self) -> &str {
461 impl fmt
::Display
for Utf8Error
{
462 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
463 write
!(f
, "invalid UTF-8 found at byte offset {}", self.valid_up_to
)
467 /// Returns OK if and only if the given slice is completely valid UTF-8.
469 /// If the slice isn't valid UTF-8, then an error is returned that explains
470 /// the first location at which invalid UTF-8 was detected.
471 pub fn validate(slice
: &[u8]) -> Result
<(), Utf8Error
> {
472 // The fast path for validating UTF-8. It steps through a UTF-8 automaton
473 // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
474 // detected, it backs up and runs the slower version of the UTF-8 automaton
475 // to determine correct error information.
476 fn fast(slice
: &[u8]) -> Result
<(), Utf8Error
> {
477 let mut state
= ACCEPT
;
480 while i
< slice
.len() {
483 // ASCII fast path. If we see two consecutive ASCII bytes, then try
484 // to validate as much ASCII as possible very quickly.
487 && slice
.get(i
+ 1).map_or(false, |&b
| b
<= 0x7F)
489 i
+= ascii
::first_non_ascii_byte(&slice
[i
..]);
493 state
= step(state
, b
);
495 return Err(find_valid_up_to(slice
, i
));
500 Err(find_valid_up_to(slice
, slice
.len()))
506 // Given the first position at which a UTF-8 sequence was determined to be
507 // invalid, return an error that correctly reports the position at which
508 // the last complete UTF-8 sequence ends.
510 fn find_valid_up_to(slice
: &[u8], rejected_at
: usize) -> Utf8Error
{
511 // In order to find the last valid byte, we need to back up an amount
512 // that guarantees every preceding byte is part of a valid UTF-8
513 // code unit sequence. To do this, we simply locate the last leading
514 // byte that occurs before rejected_at.
515 let mut backup
= rejected_at
.saturating_sub(1);
516 while backup
> 0 && !is_leading_or_invalid_utf8_byte(slice
[backup
]) {
519 let upto
= cmp
::min(slice
.len(), rejected_at
.saturating_add(1));
520 let mut err
= slow(&slice
[backup
..upto
]).unwrap_err();
521 err
.valid_up_to
+= backup
;
525 // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
526 // when an invalid sequence is found. This is split out from validate so
527 // that the fast path doesn't need to keep track of the position of the
528 // last valid UTF-8 byte. In particular, tracking this requires checking
529 // for an ACCEPT state on each byte, which degrades throughput pretty
531 fn slow(slice
: &[u8]) -> Result
<(), Utf8Error
> {
532 let mut state
= ACCEPT
;
533 let mut valid_up_to
= 0;
534 for (i
, &b
) in slice
.iter().enumerate() {
535 state
= step(state
, b
);
538 } else if state
== REJECT
{
539 // Our error length must always be at least 1.
540 let error_len
= Some(cmp
::max(1, i
- valid_up_to
));
541 return Err(Utf8Error { valid_up_to, error_len }
);
545 Err(Utf8Error { valid_up_to, error_len: None }
)
551 // Advance to the next state given the current state and current byte.
552 fn step(state
: usize, b
: u8) -> usize {
553 let class
= CLASSES
[b
as usize];
554 // SAFETY: This is safe because 'class' is always <=11 and 'state' is
555 // always <=96. Therefore, the maximal index is 96+11 = 107, where
556 // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
557 // valid by construction of the state machine and the byte equivalence
560 *STATES_FORWARD
.get_unchecked(state
+ class
as usize) as usize
567 /// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
569 /// When successful, the corresponding Unicode scalar value is returned along
570 /// with the number of bytes it was encoded with. The number of bytes consumed
571 /// for a successful decode is always between 1 and 4, inclusive.
573 /// When unsuccessful, `None` is returned along with the number of bytes that
574 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
575 /// the number of bytes consumed is always between 0 and 3, inclusive, where
576 /// 0 is only returned when `slice` is empty.
583 /// use bstr::decode_utf8;
585 /// // Decoding a valid codepoint.
586 /// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
587 /// assert_eq!(Some('☃'), ch);
588 /// assert_eq!(3, size);
590 /// // Decoding an incomplete codepoint.
591 /// let (ch, size) = decode_utf8(b"\xE2\x98");
592 /// assert_eq!(None, ch);
593 /// assert_eq!(2, size);
596 /// This example shows how to iterate over all codepoints in UTF-8 encoded
597 /// bytes, while replacing invalid UTF-8 sequences with the replacement
601 /// use bstr::{B, decode_utf8};
603 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
604 /// let mut chars = vec![];
605 /// while !bytes.is_empty() {
606 /// let (ch, size) = decode_utf8(bytes);
607 /// bytes = &bytes[size..];
608 /// chars.push(ch.unwrap_or('\u{FFFD}'));
610 /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
613 pub fn decode
<B
: AsRef
<[u8]>>(slice
: B
) -> (Option
<char>, usize) {
614 let slice
= slice
.as_ref();
616 None
=> return (None
, 0),
617 Some(&b
) if b
<= 0x7F => return (Some(b
as char), 1),
621 let (mut state
, mut cp
, mut i
) = (ACCEPT
, 0, 0);
622 while i
< slice
.len() {
623 decode_step(&mut state
, &mut cp
, slice
[i
]);
627 // SAFETY: This is safe because `decode_step` guarantees that
628 // `cp` is a valid Unicode scalar value in an ACCEPT state.
629 let ch
= unsafe { char::from_u32_unchecked(cp) }
;
630 return (Some(ch
), i
);
631 } else if state
== REJECT
{
632 // At this point, we always want to advance at least one byte.
633 return (None
, cmp
::max(1, i
.saturating_sub(1)));
639 /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
642 /// When successful, the corresponding Unicode scalar value is returned along
643 /// with the number of bytes it was encoded with. The number of bytes consumed
644 /// for a successful decode is always between 1 and 4, inclusive.
646 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
647 /// along with the number of bytes that make up a maximal prefix of a valid
648 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
649 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
657 /// use bstr::decode_utf8_lossy;
659 /// // Decoding a valid codepoint.
660 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
661 /// assert_eq!('☃', ch);
662 /// assert_eq!(3, size);
664 /// // Decoding an incomplete codepoint.
665 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
666 /// assert_eq!('\u{FFFD}', ch);
667 /// assert_eq!(2, size);
670 /// This example shows how to iterate over all codepoints in UTF-8 encoded
671 /// bytes, while replacing invalid UTF-8 sequences with the replacement
675 /// use bstr::{B, decode_utf8_lossy};
677 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
678 /// let mut chars = vec![];
679 /// while !bytes.is_empty() {
680 /// let (ch, size) = decode_utf8_lossy(bytes);
681 /// bytes = &bytes[size..];
684 /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
687 pub fn decode_lossy
<B
: AsRef
<[u8]>>(slice
: B
) -> (char, usize) {
688 match decode(slice
) {
689 (Some(ch
), size
) => (ch
, size
),
690 (None
, size
) => ('
\u{FFFD}'
, size
),
694 /// UTF-8 decode a single Unicode scalar value from the end of a slice.
696 /// When successful, the corresponding Unicode scalar value is returned along
697 /// with the number of bytes it was encoded with. The number of bytes consumed
698 /// for a successful decode is always between 1 and 4, inclusive.
700 /// When unsuccessful, `None` is returned along with the number of bytes that
701 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
702 /// the number of bytes consumed is always between 0 and 3, inclusive, where
703 /// 0 is only returned when `slice` is empty.
710 /// use bstr::decode_last_utf8;
712 /// // Decoding a valid codepoint.
713 /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
714 /// assert_eq!(Some('☃'), ch);
715 /// assert_eq!(3, size);
717 /// // Decoding an incomplete codepoint.
718 /// let (ch, size) = decode_last_utf8(b"\xE2\x98");
719 /// assert_eq!(None, ch);
720 /// assert_eq!(2, size);
723 /// This example shows how to iterate over all codepoints in UTF-8 encoded
724 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
725 /// replacement codepoint:
728 /// use bstr::{B, decode_last_utf8};
730 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
731 /// let mut chars = vec![];
732 /// while !bytes.is_empty() {
733 /// let (ch, size) = decode_last_utf8(bytes);
734 /// bytes = &bytes[..bytes.len()-size];
735 /// chars.push(ch.unwrap_or('\u{FFFD}'));
737 /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
740 pub fn decode_last
<B
: AsRef
<[u8]>>(slice
: B
) -> (Option
<char>, usize) {
741 // TODO: We could implement this by reversing the UTF-8 automaton, but for
742 // now, we do it the slow way by using the forward automaton.
744 let slice
= slice
.as_ref();
745 if slice
.is_empty() {
748 let mut start
= slice
.len() - 1;
749 let limit
= slice
.len().saturating_sub(4);
750 while start
> limit
&& !is_leading_or_invalid_utf8_byte(slice
[start
]) {
753 let (ch
, size
) = decode(&slice
[start
..]);
754 // If we didn't consume all of the bytes, then that means there's at least
755 // one stray byte that never occurs in a valid code unit prefix, so we can
756 // advance by one byte.
757 if start
+ size
!= slice
.len() {
764 /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
766 /// When successful, the corresponding Unicode scalar value is returned along
767 /// with the number of bytes it was encoded with. The number of bytes consumed
768 /// for a successful decode is always between 1 and 4, inclusive.
770 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
771 /// along with the number of bytes that make up a maximal prefix of a valid
772 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
773 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
781 /// use bstr::decode_last_utf8_lossy;
783 /// // Decoding a valid codepoint.
784 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
785 /// assert_eq!('☃', ch);
786 /// assert_eq!(3, size);
788 /// // Decoding an incomplete codepoint.
789 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
790 /// assert_eq!('\u{FFFD}', ch);
791 /// assert_eq!(2, size);
794 /// This example shows how to iterate over all codepoints in UTF-8 encoded
795 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
796 /// replacement codepoint:
799 /// use bstr::{B, decode_last_utf8_lossy};
801 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
802 /// let mut chars = vec![];
803 /// while !bytes.is_empty() {
804 /// let (ch, size) = decode_last_utf8_lossy(bytes);
805 /// bytes = &bytes[..bytes.len()-size];
808 /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
811 pub fn decode_last_lossy
<B
: AsRef
<[u8]>>(slice
: B
) -> (char, usize) {
812 match decode_last(slice
) {
813 (Some(ch
), size
) => (ch
, size
),
814 (None
, size
) => ('
\u{FFFD}'
, size
),
818 /// SAFETY: The decode function relies on state being equal to ACCEPT only if
819 /// cp is a valid Unicode scalar value.
821 pub fn decode_step(state
: &mut usize, cp
: &mut u32, b
: u8) {
822 let class
= CLASSES
[b
as usize];
823 if *state
== ACCEPT
{
824 *cp
= (0xFF >> class
) & (b
as u32);
826 *cp
= (b
as u32 & 0b111111) | (*cp
<< 6);
828 *state
= STATES_FORWARD
[*state
+ class
as usize] as usize;
831 /// Returns true if and only if the given byte is either a valid leading UTF-8
832 /// byte, or is otherwise an invalid byte that can never appear anywhere in a
833 /// valid UTF-8 sequence.
834 fn is_leading_or_invalid_utf8_byte(b
: u8) -> bool
{
835 // In the ASCII case, the most significant bit is never set. The leading
836 // byte of a 2/3/4-byte sequence always has the top two most significant
837 // bits set. For bytes that can never appear anywhere in valid UTF-8, this
838 // also returns true, since every such byte has its two most significant
854 (b
& 0b1100_0000) != 0b1000_0000
861 use crate::ext_slice
::{ByteSlice, B}
;
862 use crate::tests
::LOSSY_TESTS
;
863 use crate::utf8
::{self, Utf8Error}
;
865 fn utf8e(valid_up_to
: usize) -> Utf8Error
{
866 Utf8Error { valid_up_to, error_len: None }
869 fn utf8e2(valid_up_to
: usize, error_len
: usize) -> Utf8Error
{
870 Utf8Error { valid_up_to, error_len: Some(error_len) }
874 fn validate_all_codepoints() {
875 for i
in 0..(0x10FFFF + 1) {
876 let cp
= match char::from_u32(i
) {
880 let mut buf
= [0; 4];
881 let s
= cp
.encode_utf8(&mut buf
);
882 assert_eq
!(Ok(()), utf8
::validate(s
.as_bytes()));
887 fn validate_multiple_codepoints() {
888 assert_eq
!(Ok(()), utf8
::validate(b
"abc"));
889 assert_eq
!(Ok(()), utf8
::validate(b
"a\xE2\x98\x83a"));
890 assert_eq
!(Ok(()), utf8
::validate(b
"a\xF0\x9D\x9C\xB7a"));
891 assert_eq
!(Ok(()), utf8
::validate(b
"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
894 utf8
::validate(b
"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
898 utf8
::validate(b
"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
903 fn validate_errors() {
904 // single invalid byte
905 assert_eq
!(Err(utf8e2(0, 1)), utf8
::validate(b
"\xFF"));
906 // single invalid byte after ASCII
907 assert_eq
!(Err(utf8e2(1, 1)), utf8
::validate(b
"a\xFF"));
908 // single invalid byte after 2 byte sequence
909 assert_eq
!(Err(utf8e2(2, 1)), utf8
::validate(b
"\xCE\xB2\xFF"));
910 // single invalid byte after 3 byte sequence
911 assert_eq
!(Err(utf8e2(3, 1)), utf8
::validate(b
"\xE2\x98\x83\xFF"));
912 // single invalid byte after 4 byte sequence
913 assert_eq
!(Err(utf8e2(4, 1)), utf8
::validate(b
"\xF0\x9D\x9D\xB1\xFF"));
915 // An invalid 2-byte sequence with a valid 1-byte prefix.
916 assert_eq
!(Err(utf8e2(0, 1)), utf8
::validate(b
"\xCE\xF0"));
917 // An invalid 3-byte sequence with a valid 2-byte prefix.
918 assert_eq
!(Err(utf8e2(0, 2)), utf8
::validate(b
"\xE2\x98\xF0"));
919 // An invalid 4-byte sequence with a valid 3-byte prefix.
920 assert_eq
!(Err(utf8e2(0, 3)), utf8
::validate(b
"\xF0\x9D\x9D\xF0"));
922 // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
923 // same codepoint value in 4 bytes. This not only tests that we reject
924 // overlong sequences, but that we get valid_up_to correct.
925 assert_eq
!(Err(utf8e2(0, 1)), utf8
::validate(b
"\xF0\x82\x82\xAC"));
926 assert_eq
!(Err(utf8e2(1, 1)), utf8
::validate(b
"a\xF0\x82\x82\xAC"));
929 utf8
::validate(b
"\xE2\x98\x83\xF0\x82\x82\xAC",)
932 // Check that encoding a surrogate codepoint using the UTF-8 scheme
934 assert_eq
!(Err(utf8e2(0, 1)), utf8
::validate(b
"\xED\xA0\x80"));
935 assert_eq
!(Err(utf8e2(1, 1)), utf8
::validate(b
"a\xED\xA0\x80"));
938 utf8
::validate(b
"\xE2\x98\x83\xED\xA0\x80",)
941 // Check that an incomplete 2-byte sequence fails.
942 assert_eq
!(Err(utf8e2(0, 1)), utf8
::validate(b
"\xCEa"));
943 assert_eq
!(Err(utf8e2(1, 1)), utf8
::validate(b
"a\xCEa"));
946 utf8
::validate(b
"\xE2\x98\x83\xCE\xE2\x98\x83",)
948 // Check that an incomplete 3-byte sequence fails.
949 assert_eq
!(Err(utf8e2(0, 2)), utf8
::validate(b
"\xE2\x98a"));
950 assert_eq
!(Err(utf8e2(1, 2)), utf8
::validate(b
"a\xE2\x98a"));
953 utf8
::validate(b
"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
955 // Check that an incomplete 4-byte sequence fails.
956 assert_eq
!(Err(utf8e2(0, 3)), utf8
::validate(b
"\xF0\x9D\x9Ca"));
957 assert_eq
!(Err(utf8e2(1, 3)), utf8
::validate(b
"a\xF0\x9D\x9Ca"));
960 utf8
::validate(b
"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
964 utf8
::validate(b
"foobar\xF1\x80\x80quux",)
967 // Check that an incomplete (EOF) 2-byte sequence fails.
968 assert_eq
!(Err(utf8e(0)), utf8
::validate(b
"\xCE"));
969 assert_eq
!(Err(utf8e(1)), utf8
::validate(b
"a\xCE"));
970 assert_eq
!(Err(utf8e(3)), utf8
::validate(b
"\xE2\x98\x83\xCE"));
971 // Check that an incomplete (EOF) 3-byte sequence fails.
972 assert_eq
!(Err(utf8e(0)), utf8
::validate(b
"\xE2\x98"));
973 assert_eq
!(Err(utf8e(1)), utf8
::validate(b
"a\xE2\x98"));
974 assert_eq
!(Err(utf8e(3)), utf8
::validate(b
"\xE2\x98\x83\xE2\x98"));
975 // Check that an incomplete (EOF) 4-byte sequence fails.
976 assert_eq
!(Err(utf8e(0)), utf8
::validate(b
"\xF0\x9D\x9C"));
977 assert_eq
!(Err(utf8e(1)), utf8
::validate(b
"a\xF0\x9D\x9C"));
980 utf8
::validate(b
"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
983 // Test that we report errors correctly even after long valid sequences. This
984 // checks that our "backup" logic for detecting errors is correct.
987 utf8
::validate(b
"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
993 fn d(mut s
: &str) -> Vec
<char> {
994 let mut chars
= vec
![];
995 while !s
.is_empty() {
996 let (ch
, size
) = utf8
::decode(s
.as_bytes());
998 chars
.push(ch
.unwrap());
1003 assert_eq
!(vec
!['☃'
], d("☃"));
1004 assert_eq
!(vec
!['☃'
, '☃'
], d("☃☃"));
1005 assert_eq
!(vec
!['α'
, 'β'
, 'γ'
, 'δ'
, 'ε'
], d("αβγδε"));
1006 assert_eq
!(vec
!['☃'
, '⛄'
, '⛇'
], d("☃⛄⛇"));
1007 assert_eq
!(vec
!['𝗮'
, '𝗯'
, '𝗰'
, '𝗱'
, '𝗲'
], d("𝗮𝗯𝗰𝗱𝗲"));
1011 fn decode_invalid() {
1012 let (ch
, size
) = utf8
::decode(b
"");
1013 assert_eq
!(None
, ch
);
1014 assert_eq
!(0, size
);
1016 let (ch
, size
) = utf8
::decode(b
"\xFF");
1017 assert_eq
!(None
, ch
);
1018 assert_eq
!(1, size
);
1020 let (ch
, size
) = utf8
::decode(b
"\xCE\xF0");
1021 assert_eq
!(None
, ch
);
1022 assert_eq
!(1, size
);
1024 let (ch
, size
) = utf8
::decode(b
"\xE2\x98\xF0");
1025 assert_eq
!(None
, ch
);
1026 assert_eq
!(2, size
);
1028 let (ch
, size
) = utf8
::decode(b
"\xF0\x9D\x9D");
1029 assert_eq
!(None
, ch
);
1030 assert_eq
!(3, size
);
1032 let (ch
, size
) = utf8
::decode(b
"\xF0\x9D\x9D\xF0");
1033 assert_eq
!(None
, ch
);
1034 assert_eq
!(3, size
);
1036 let (ch
, size
) = utf8
::decode(b
"\xF0\x82\x82\xAC");
1037 assert_eq
!(None
, ch
);
1038 assert_eq
!(1, size
);
1040 let (ch
, size
) = utf8
::decode(b
"\xED\xA0\x80");
1041 assert_eq
!(None
, ch
);
1042 assert_eq
!(1, size
);
1044 let (ch
, size
) = utf8
::decode(b
"\xCEa");
1045 assert_eq
!(None
, ch
);
1046 assert_eq
!(1, size
);
1048 let (ch
, size
) = utf8
::decode(b
"\xE2\x98a");
1049 assert_eq
!(None
, ch
);
1050 assert_eq
!(2, size
);
1052 let (ch
, size
) = utf8
::decode(b
"\xF0\x9D\x9Ca");
1053 assert_eq
!(None
, ch
);
1054 assert_eq
!(3, size
);
// Each case below feeds empty or malformed bytes to `utf8::decode_lossy` and
// expects the replacement character U+FFFD together with a specific size.
// Empty input: expected size is 0.
1059 let (ch
, size
) = utf8
::decode_lossy(b
"");
1060 assert_eq
!('
\u{FFFD}'
, ch
);
1061 assert_eq
!(0, size
);
// A lone 0xFF byte: expected size is 1.
1063 let (ch
, size
) = utf8
::decode_lossy(b
"\xFF");
1064 assert_eq
!('
\u{FFFD}'
, ch
);
1065 assert_eq
!(1, size
);
// 0xCE followed by 0xF0: expected size is 1.
1067 let (ch
, size
) = utf8
::decode_lossy(b
"\xCE\xF0");
1068 assert_eq
!('
\u{FFFD}'
, ch
);
1069 assert_eq
!(1, size
);
// 0xE2 0x98 followed by 0xF0: expected size is 2.
1071 let (ch
, size
) = utf8
::decode_lossy(b
"\xE2\x98\xF0");
1072 assert_eq
!('
\u{FFFD}'
, ch
);
1073 assert_eq
!(2, size
);
// 0xF0 0x9D 0x9D followed by 0xF0: expected size is 3.
1075 let (ch
, size
) = utf8
::decode_lossy(b
"\xF0\x9D\x9D\xF0");
1076 assert_eq
!('
\u{FFFD}'
, ch
);
1077 assert_eq
!(3, size
);
// 0xF0 0x82 0x82 0xAC (the payload bits would spell U+20AC in an overlong
// four-byte form): expected size is 1.
1079 let (ch
, size
) = utf8
::decode_lossy(b
"\xF0\x82\x82\xAC");
1080 assert_eq
!('
\u{FFFD}'
, ch
);
1081 assert_eq
!(1, size
);
// 0xED 0xA0 0x80 (the payload bits would spell the surrogate U+D800):
// expected size is 1.
1083 let (ch
, size
) = utf8
::decode_lossy(b
"\xED\xA0\x80");
1084 assert_eq
!('
\u{FFFD}'
, ch
);
1085 assert_eq
!(1, size
);
// The remaining cases end in ASCII `a`; the expected sizes (1, 2, 3) equal the
// number of bytes that precede the `a`.
1087 let (ch
, size
) = utf8
::decode_lossy(b
"\xCEa");
1088 assert_eq
!('
\u{FFFD}'
, ch
);
1089 assert_eq
!(1, size
);
1091 let (ch
, size
) = utf8
::decode_lossy(b
"\xE2\x98a");
1092 assert_eq
!('
\u{FFFD}'
, ch
);
1093 assert_eq
!(2, size
);
1095 let (ch
, size
) = utf8
::decode_lossy(b
"\xF0\x9D\x9Ca");
1096 assert_eq
!('
\u{FFFD}'
, ch
);
1097 assert_eq
!(3, size
);
// Checks `utf8::decode_last` on valid strings: decoding repeatedly from the
// end is expected to produce the characters of each input back to front.
1101 fn decode_last_valid() {
// Helper: decodes `s` from the end, one character per iteration, pushing each
// decoded character onto a `Vec<char>`.
1102 fn d(mut s
: &str) -> Vec
<char> {
1103 let mut chars
= vec
![];
1104 while !s
.is_empty() {
1105 let (ch
, size
) = utf8
::decode_last(s
.as_bytes());
// Shrink `s` by the `size` trailing bytes reported for this step.
1106 s
= &s
[..s
.len() - size
];
// `unwrap` panics if `decode_last` returned `None`; the inputs used below are
// all `&str` literals.
1107 chars
.push(ch
.unwrap());
// Each expected vector lists the input's characters in reverse order.
1112 assert_eq
!(vec
!['☃'
], d("☃"));
1113 assert_eq
!(vec
!['☃'
, '☃'
], d("☃☃"));
1114 assert_eq
!(vec
!['ε'
, 'δ'
, 'γ'
, 'β'
, 'α'
], d("αβγδε"));
1115 assert_eq
!(vec
!['⛇'
, '⛄'
, '☃'
], d("☃⛄⛇"));
1116 assert_eq
!(vec
!['𝗲'
, '𝗱'
, '𝗰'
, '𝗯'
, '𝗮'
], d("𝗮𝗯𝗰𝗱𝗲"));
// Checks `utf8::decode_last` on empty and malformed input. Every case expects
// `None` for the decoded character; the second assertion of each case pins
// the size that is reported alongside it.
1120 fn decode_last_invalid() {
// Empty input: expected size is 0.
1121 let (ch
, size
) = utf8
::decode_last(b
"");
1122 assert_eq
!(None
, ch
);
1123 assert_eq
!(0, size
);
// A lone 0xFF byte: expected size is 1.
1125 let (ch
, size
) = utf8
::decode_last(b
"\xFF");
1126 assert_eq
!(None
, ch
);
1127 assert_eq
!(1, size
);
// Input ending in 0xF0 after 0xCE: expected size is 1.
1129 let (ch
, size
) = utf8
::decode_last(b
"\xCE\xF0");
1130 assert_eq
!(None
, ch
);
1131 assert_eq
!(1, size
);
// 0xCE on its own: expected size is 1.
1133 let (ch
, size
) = utf8
::decode_last(b
"\xCE");
1134 assert_eq
!(None
, ch
);
1135 assert_eq
!(1, size
);
// Input ending in 0xF0 after 0xE2 0x98: expected size is 1.
1137 let (ch
, size
) = utf8
::decode_last(b
"\xE2\x98\xF0");
1138 assert_eq
!(None
, ch
);
1139 assert_eq
!(1, size
);
// 0xE2 0x98 with nothing after it: expected size is 2.
1141 let (ch
, size
) = utf8
::decode_last(b
"\xE2\x98");
1142 assert_eq
!(None
, ch
);
1143 assert_eq
!(2, size
);
// Input ending in 0xF0 after 0xF0 0x9D 0x9D: expected size is 1.
1145 let (ch
, size
) = utf8
::decode_last(b
"\xF0\x9D\x9D\xF0");
1146 assert_eq
!(None
, ch
);
1147 assert_eq
!(1, size
);
// 0xF0 0x9D 0x9D with nothing after it: expected size is 3.
1149 let (ch
, size
) = utf8
::decode_last(b
"\xF0\x9D\x9D");
1150 assert_eq
!(None
, ch
);
1151 assert_eq
!(3, size
);
// 0xF0 0x82 0x82 0xAC (the payload bits would spell U+20AC in an overlong
// four-byte form): expected size is 1.
1153 let (ch
, size
) = utf8
::decode_last(b
"\xF0\x82\x82\xAC");
1154 assert_eq
!(None
, ch
);
1155 assert_eq
!(1, size
);
// 0xED 0xA0 0x80 (the payload bits would spell the surrogate U+D800), then
// its two- and one-byte prefixes: expected size is 1 in all three cases.
1157 let (ch
, size
) = utf8
::decode_last(b
"\xED\xA0\x80");
1158 assert_eq
!(None
, ch
);
1159 assert_eq
!(1, size
);
1161 let (ch
, size
) = utf8
::decode_last(b
"\xED\xA0");
1162 assert_eq
!(None
, ch
);
1163 assert_eq
!(1, size
);
1165 let (ch
, size
) = utf8
::decode_last(b
"\xED");
1166 assert_eq
!(None
, ch
);
1167 assert_eq
!(1, size
);
// The remaining cases start with ASCII `a`; the expected sizes (1, 2, 3) equal
// the number of bytes that follow the `a`.
1169 let (ch
, size
) = utf8
::decode_last(b
"a\xCE");
1170 assert_eq
!(None
, ch
);
1171 assert_eq
!(1, size
);
1173 let (ch
, size
) = utf8
::decode_last(b
"a\xE2\x98");
1174 assert_eq
!(None
, ch
);
1175 assert_eq
!(2, size
);
1177 let (ch
, size
) = utf8
::decode_last(b
"a\xF0\x9D\x9C");
1178 assert_eq
!(None
, ch
);
1179 assert_eq
!(3, size
);
// Checks `utf8::decode_last_lossy` on empty and malformed input. Every case
// expects the replacement character U+FFFD; the second assertion of each case
// pins the size that is reported alongside it.
1183 fn decode_last_lossy() {
// Empty input: expected size is 0.
1184 let (ch
, size
) = utf8
::decode_last_lossy(b
"");
1185 assert_eq
!('
\u{FFFD}'
, ch
);
1186 assert_eq
!(0, size
);
// A lone 0xFF byte: expected size is 1.
1188 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xFF");
1189 assert_eq
!('
\u{FFFD}'
, ch
);
1190 assert_eq
!(1, size
);
// Input ending in 0xF0 after 0xCE: expected size is 1.
1192 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xCE\xF0");
1193 assert_eq
!('
\u{FFFD}'
, ch
);
1194 assert_eq
!(1, size
);
// 0xCE on its own: expected size is 1.
1196 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xCE");
1197 assert_eq
!('
\u{FFFD}'
, ch
);
1198 assert_eq
!(1, size
);
// Input ending in 0xF0 after 0xE2 0x98: expected size is 1.
1200 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xE2\x98\xF0");
1201 assert_eq
!('
\u{FFFD}'
, ch
);
1202 assert_eq
!(1, size
);
// 0xE2 0x98 with nothing after it: expected size is 2.
1204 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xE2\x98");
1205 assert_eq
!('
\u{FFFD}'
, ch
);
1206 assert_eq
!(2, size
);
// Input ending in 0xF0 after 0xF0 0x9D 0x9D: expected size is 1.
1208 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xF0\x9D\x9D\xF0");
1209 assert_eq
!('
\u{FFFD}'
, ch
);
1210 assert_eq
!(1, size
);
// 0xF0 0x9D 0x9D with nothing after it: expected size is 3.
1212 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xF0\x9D\x9D");
1213 assert_eq
!('
\u{FFFD}'
, ch
);
1214 assert_eq
!(3, size
);
// 0xF0 0x82 0x82 0xAC (the payload bits would spell U+20AC in an overlong
// four-byte form): expected size is 1.
1216 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xF0\x82\x82\xAC");
1217 assert_eq
!('
\u{FFFD}'
, ch
);
1218 assert_eq
!(1, size
);
// 0xED 0xA0 0x80 (the payload bits would spell the surrogate U+D800), then
// its two- and one-byte prefixes: expected size is 1 in all three cases.
1220 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xED\xA0\x80");
1221 assert_eq
!('
\u{FFFD}'
, ch
);
1222 assert_eq
!(1, size
);
1224 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xED\xA0");
1225 assert_eq
!('
\u{FFFD}'
, ch
);
1226 assert_eq
!(1, size
);
1228 let (ch
, size
) = utf8
::decode_last_lossy(b
"\xED");
1229 assert_eq
!('
\u{FFFD}'
, ch
);
1230 assert_eq
!(1, size
);
// The remaining cases start with ASCII `a`; the expected sizes (1, 2, 3) equal
// the number of bytes that follow the `a`.
1232 let (ch
, size
) = utf8
::decode_last_lossy(b
"a\xCE");
1233 assert_eq
!('
\u{FFFD}'
, ch
);
1234 assert_eq
!(1, size
);
1236 let (ch
, size
) = utf8
::decode_last_lossy(b
"a\xE2\x98");
1237 assert_eq
!('
\u{FFFD}'
, ch
);
1238 assert_eq
!(2, size
);
1240 let (ch
, size
) = utf8
::decode_last_lossy(b
"a\xF0\x9D\x9C");
1241 assert_eq
!('
\u{FFFD}'
, ch
);
1242 assert_eq
!(3, size
);
// Walk every `(expected, input)` pair in `LOSSY_TESTS`. `i` is the index of the
// pair; the message strings below carry an `ith` placeholder, presumably
// filled with it.
1247 for (i
, &(expected
, input
)) in LOSSY_TESTS
.iter().enumerate() {
// Forward pass: collect the characters produced by `chars()` into a String.
1248 let got
: String
= B(input
).chars().collect();
1251 "chars(ith: {:?}, given: {:?})",
// Forward pass through `char_indices()`, keeping only the `ch` component of
// each yielded triple.
1255 B(input
).char_indices().map(|(_
, _
, ch
)| ch
).collect();
1258 "char_indices(ith: {:?}, given: {:?})",
// From here on `expected` is shadowed by its own characters in reverse order,
// for use with the reversed iterators.
1262 let expected
: String
= expected
.chars().rev().collect();
// Backward pass: `chars().rev()` collected into a String.
1264 let got
: String
= B(input
).chars().rev().collect();
1267 "chars.rev(ith: {:?}, given: {:?})",
// Backward pass through `char_indices().rev()`, again keeping only `ch`.
1271 B(input
).char_indices().rev().map(|(_
, _
, ch
)| ch
).collect();
1274 "char_indices.rev(ith: {:?}, given: {:?})",
1282 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xC0" }
;
1284 (c
.next(), c
.next()),
1286 Some(utf8
::Utf8Chunk
{
1288 invalid
: b
"\xC0".as_bstr(),
1295 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xFF\xFF" }
;
1297 (c
.next(), c
.next(), c
.next()),
1299 Some(utf8
::Utf8Chunk
{
1301 invalid
: b
"\xFF".as_bstr(),
1304 Some(utf8
::Utf8Chunk
{
1306 invalid
: b
"\xFF".as_bstr(),
1313 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xD0" }
;
1315 (c
.next(), c
.next()),
1317 Some(utf8
::Utf8Chunk
{
1319 invalid
: b
"\xD0".as_bstr(),
1326 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xD0456" }
;
1328 (c
.next(), c
.next(), c
.next()),
1330 Some(utf8
::Utf8Chunk
{
1332 invalid
: b
"\xD0".as_bstr(),
1335 Some(utf8
::Utf8Chunk
{
1337 invalid
: b
"".as_bstr(),
1344 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xE2\x98" }
;
1346 (c
.next(), c
.next()),
1348 Some(utf8
::Utf8Chunk
{
1350 invalid
: b
"\xE2\x98".as_bstr(),
1357 let mut c
= utf8
::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" }
;
1359 (c
.next(), c
.next()),
1361 Some(utf8
::Utf8Chunk
{
1363 invalid
: b
"\xF4\x8F\xBF".as_bstr(),