vendor/bstr-0.2.17/src/utf8.rs

   1 use core::char;
   2 use core::cmp;
   3 use core::fmt;
   4 use core::str;
   5 #[cfg(feature = "std")]
   6 use std::error;
   7
   8 use crate::ascii;
   9 use crate::bstr::BStr;
  10 use crate::ext_slice::ByteSlice;
  11
  12 // The UTF-8 decoder provided here is based on the one presented here:
  13 // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  14 //
  15 // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
  16 // using regex-automata that is roughly the same size. The real benefit of
  17 // Hoehrmann's formulation is that the byte class mapping below is manually
  18 // tailored such that each byte's class doubles as a shift to mask out the
  19 // bits necessary for constructing the leading bits of each codepoint value
  20 // from the initial byte.
  21 //
  22 // There are some minor differences between this implementation and Hoehrmann's
  23 // formulation.
  24 //
  25 // Firstly, we make REJECT have state ID 0, since it makes the state table
  26 // itself a little easier to read and is consistent with the notion that 0
  27 // means "false" or "bad."
  28 //
  29 // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
  30 // path.
  31 //
  32 // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
  33 // in the core decoding loop. (Which is what regex-automata would do by
  34 // default.)
  35 //
  36 // Fourthly, we split the byte class mapping and transition table into two
  37 // arrays because it's clearer.
  38 //
  39 // It is unlikely that this is the fastest way to do UTF-8 decoding, however,
  40 // it is fairly simple.
  41
// State IDs are premultiplied: each ID is the offset of that state's row in
// STATES_FORWARD, whose rows hold 12 entries (one per byte class).

/// The state reached after a complete, valid UTF-8 sequence. It is also the
/// state every decode/validation in this file starts from.
const ACCEPT: usize = 12;
/// The dead state. Its row in STATES_FORWARD is all zeros, so once entered it
/// is never left.
const REJECT: usize = 0;
  44
/// Maps every possible byte value to its equivalence class (a value in the
/// range `0..=11`).
///
/// The class is used both as the column index into `STATES_FORWARD` and, for
/// the first byte of a sequence, as the shift that masks out the payload bits
/// of that byte (see `decode_step`).
///
/// SAFETY: The decode function below relies on the correctness of these
/// equivalence classes.
#[cfg_attr(rustfmt, rustfmt::skip)]
const CLASSES: [u8; 256] = [
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
  58
  59 /// SAFETY: The decode below function relies on the correctness of this state
  60 /// machine.
  61 #[cfg_attr(rustfmt, rustfmt::skip)]
  62 const STATES_FORWARD: &'static [u8] = &[
  63   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  64   12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
  65   0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
  66   0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
  67   0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
  68   0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
  69   0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  70   0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  71   0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  72 ];
  73
  74 /// An iterator over Unicode scalar values in a byte string.
  75 ///
  76 /// When invalid UTF-8 byte sequences are found, they are substituted with the
  77 /// Unicode replacement codepoint (`U+FFFD`) using the
  78 /// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
  79 ///
  80 /// This iterator is created by the
  81 /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
  82 /// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
  83 #[derive(Clone, Debug)]
  84 pub struct Chars<'a> {
  85     bs: &'a [u8],
  86 }
  87
  88 impl<'a> Chars<'a> {
  89     pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
  90         Chars { bs }
  91     }
  92
  93     /// View the underlying data as a subslice of the original data.
  94     ///
  95     /// The slice returned has the same lifetime as the original slice, and so
  96     /// the iterator can continue to be used while this exists.
  97     ///
  98     /// # Examples
  99     ///
 100     /// ```
 101     /// use bstr::ByteSlice;
 102     ///
 103     /// let mut chars = b"abc".chars();
 104     ///
 105     /// assert_eq!(b"abc", chars.as_bytes());
 106     /// chars.next();
 107     /// assert_eq!(b"bc", chars.as_bytes());
 108     /// chars.next();
 109     /// chars.next();
 110     /// assert_eq!(b"", chars.as_bytes());
 111     /// ```
 112     #[inline]
 113     pub fn as_bytes(&self) -> &'a [u8] {
 114         self.bs
 115     }
 116 }
 117
 118 impl<'a> Iterator for Chars<'a> {
 119     type Item = char;
 120
 121     #[inline]
 122     fn next(&mut self) -> Option<char> {
 123         let (ch, size) = decode_lossy(self.bs);
 124         if size == 0 {
 125             return None;
 126         }
 127         self.bs = &self.bs[size..];
 128         Some(ch)
 129     }
 130 }
 131
 132 impl<'a> DoubleEndedIterator for Chars<'a> {
 133     #[inline]
 134     fn next_back(&mut self) -> Option<char> {
 135         let (ch, size) = decode_last_lossy(self.bs);
 136         if size == 0 {
 137             return None;
 138         }
 139         self.bs = &self.bs[..self.bs.len() - size];
 140         Some(ch)
 141     }
 142 }
 143
 144 /// An iterator over Unicode scalar values in a byte string and their
 145 /// byte index positions.
 146 ///
 147 /// When invalid UTF-8 byte sequences are found, they are substituted with the
 148 /// Unicode replacement codepoint (`U+FFFD`) using the
 149 /// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
 150 ///
 151 /// Note that this is slightly different from the `CharIndices` iterator
 152 /// provided by the standard library. Aside from working on possibly invalid
 153 /// UTF-8, this iterator provides both the corresponding starting and ending
 154 /// byte indices of each codepoint yielded. The ending position is necessary to
 155 /// slice the original byte string when invalid UTF-8 bytes are converted into
 156 /// a Unicode replacement codepoint, since a single replacement codepoint can
 157 /// substitute anywhere from 1 to 3 invalid bytes (inclusive).
 158 ///
 159 /// This iterator is created by the
 160 /// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
 161 /// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
 162 #[derive(Clone, Debug)]
 163 pub struct CharIndices<'a> {
 164     bs: &'a [u8],
 165     forward_index: usize,
 166     reverse_index: usize,
 167 }
 168
 169 impl<'a> CharIndices<'a> {
 170     pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
 171         CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
 172     }
 173
 174     /// View the underlying data as a subslice of the original data.
 175     ///
 176     /// The slice returned has the same lifetime as the original slice, and so
 177     /// the iterator can continue to be used while this exists.
 178     ///
 179     /// # Examples
 180     ///
 181     /// ```
 182     /// use bstr::ByteSlice;
 183     ///
 184     /// let mut it = b"abc".char_indices();
 185     ///
 186     /// assert_eq!(b"abc", it.as_bytes());
 187     /// it.next();
 188     /// assert_eq!(b"bc", it.as_bytes());
 189     /// it.next();
 190     /// it.next();
 191     /// assert_eq!(b"", it.as_bytes());
 192     /// ```
 193     #[inline]
 194     pub fn as_bytes(&self) -> &'a [u8] {
 195         self.bs
 196     }
 197 }
 198
 199 impl<'a> Iterator for CharIndices<'a> {
 200     type Item = (usize, usize, char);
 201
 202     #[inline]
 203     fn next(&mut self) -> Option<(usize, usize, char)> {
 204         let index = self.forward_index;
 205         let (ch, size) = decode_lossy(self.bs);
 206         if size == 0 {
 207             return None;
 208         }
 209         self.bs = &self.bs[size..];
 210         self.forward_index += size;
 211         Some((index, index + size, ch))
 212     }
 213 }
 214
 215 impl<'a> DoubleEndedIterator for CharIndices<'a> {
 216     #[inline]
 217     fn next_back(&mut self) -> Option<(usize, usize, char)> {
 218         let (ch, size) = decode_last_lossy(self.bs);
 219         if size == 0 {
 220             return None;
 221         }
 222         self.bs = &self.bs[..self.bs.len() - size];
 223         self.reverse_index -= size;
 224         Some((self.reverse_index, self.reverse_index + size, ch))
 225     }
 226 }
 227
// `next` and `next_back` return `None` without modifying the iterator, so
// once exhausted it keeps returning `None`.
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
 229
/// An iterator over chunks of valid UTF-8 in a byte slice.
///
/// Each item is a [`Utf8Chunk`](struct.Utf8Chunk.html): a run of valid UTF-8
/// followed by the invalid bytes (if any) that terminated it.
///
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)]
pub struct Utf8Chunks<'a> {
    /// The bytes that have not been yielded as part of a chunk yet.
    pub(super) bytes: &'a [u8],
}
 237
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
///
/// This is yielded by the
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
/// iterator, which can be created via the
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
/// method.
///
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
/// are being iterated over.
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct Utf8Chunk<'a> {
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
    ///
    /// This is empty between adjacent invalid UTF-8 byte sequences.
    valid: &'a str,
    /// A sequence of invalid UTF-8 bytes that immediately follows `valid`.
    ///
    /// Can only be empty in the last chunk.
    ///
    /// Should be replaced by a single Unicode replacement character, if not
    /// empty.
    invalid: &'a BStr,
    /// Indicates whether the invalid sequence could've been valid if there
    /// were more bytes.
    ///
    /// Can only be true in the last chunk.
    incomplete: bool,
}
 267
impl<'a> Utf8Chunk<'a> {
    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
    ///
    /// This may be empty if there are consecutive sequences of invalid UTF-8
    /// bytes.
    #[inline]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
    /// immediately follow the valid UTF-8 bytes in this chunk.
    ///
    /// This is only empty when this chunk corresponds to the last chunk in
    /// the original bytes.
    ///
    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
    /// sequences longer than 1 byte always correspond to a valid _prefix_ of
    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
    /// of maximal subparts" strategy that is described in more detail in the
    /// docs for the
    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
    /// method.
    #[inline]
    pub fn invalid(&self) -> &'a [u8] {
        self.invalid.as_bytes()
    }

    /// Returns whether the invalid sequence might still become valid if more
    /// bytes are added.
    ///
    /// Returns true if the end of the input was reached unexpectedly,
    /// without encountering an unexpected byte.
    ///
    /// This can only be the case for the last chunk.
    #[inline]
    pub fn incomplete(&self) -> bool {
        self.incomplete
    }
}
 308
 309 impl<'a> Iterator for Utf8Chunks<'a> {
 310     type Item = Utf8Chunk<'a>;
 311
 312     #[inline]
 313     fn next(&mut self) -> Option<Utf8Chunk<'a>> {
 314         if self.bytes.is_empty() {
 315             return None;
 316         }
 317         match validate(self.bytes) {
 318             Ok(()) => {
 319                 let valid = self.bytes;
 320                 self.bytes = &[];
 321                 Some(Utf8Chunk {
 322                     // SAFETY: This is safe because of the guarantees provided
 323                     // by utf8::validate.
 324                     valid: unsafe { str::from_utf8_unchecked(valid) },
 325                     invalid: [].as_bstr(),
 326                     incomplete: false,
 327                 })
 328             }
 329             Err(e) => {
 330                 let (valid, rest) = self.bytes.split_at(e.valid_up_to());
 331                 // SAFETY: This is safe because of the guarantees provided by
 332                 // utf8::validate.
 333                 let valid = unsafe { str::from_utf8_unchecked(valid) };
 334                 let (invalid_len, incomplete) = match e.error_len() {
 335                     Some(n) => (n, false),
 336                     None => (rest.len(), true),
 337                 };
 338                 let (invalid, rest) = rest.split_at(invalid_len);
 339                 self.bytes = rest;
 340                 Some(Utf8Chunk {
 341                     valid,
 342                     invalid: invalid.as_bstr(),
 343                     incomplete,
 344                 })
 345             }
 346         }
 347     }
 348
 349     #[inline]
 350     fn size_hint(&self) -> (usize, Option<usize>) {
 351         if self.bytes.is_empty() {
 352             (0, Some(0))
 353         } else {
 354             (1, Some(self.bytes.len()))
 355         }
 356     }
 357 }
 358
// `next` returns `None` only when `bytes` is empty and leaves it empty, so
// once exhausted the iterator keeps returning `None`.
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
 360
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Debug, Eq, PartialEq)]
pub struct Utf8Error {
    /// The byte offset immediately following the last valid UTF-8 sequence.
    valid_up_to: usize,
    /// The number of invalid bytes found at `valid_up_to`, or `None` when the
    /// input ended before a sequence could be completed.
    error_len: Option<usize>,
}
 414
impl Utf8Error {
    /// Returns the byte index of the position immediately following the last
    /// valid UTF-8 byte.
    ///
    /// # Example
    ///
    /// This example shows how `valid_up_to` can be used to retrieve a
    /// possibly empty prefix that is guaranteed to be valid UTF-8:
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let s = b"foobar\xF1\x80\x80quux";
    /// let err = s.to_str().unwrap_err();
    ///
    /// // This is guaranteed to never panic.
    /// let string = s[..err.valid_up_to()].to_str().unwrap();
    /// assert_eq!(string, "foobar");
    /// ```
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the total number of invalid UTF-8 bytes immediately following
    /// the position returned by `valid_up_to`. This value is always at least
    /// `1`, but can be up to `3` if the bytes form a valid prefix of some
    /// UTF-8 encoded codepoint.
    ///
    /// If the end of the original input was found before a valid UTF-8 encoded
    /// codepoint could be completed, then this returns `None`. This is useful
    /// when processing streams, where a `None` value signals that more input
    /// might be needed.
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len
    }
}
 453
// Only available with the `std` feature, since the `error` module is
// imported from `std`.
#[cfg(feature = "std")]
impl error::Error for Utf8Error {
    /// Returns a fixed, short description of the error. The byte offset of
    /// the failure is only included in the `Display` output.
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
 460
 461 impl fmt::Display for Utf8Error {
 462     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 463         write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
 464     }
 465 }
 466
/// Returns OK if and only if the given slice is completely valid UTF-8.
///
/// If the slice isn't valid UTF-8, then an error is returned that explains
/// the first location at which invalid UTF-8 was detected.
///
/// # Errors
///
/// The returned `Utf8Error` reports the offset just past the last valid
/// UTF-8 sequence (`valid_up_to`) and the length of the invalid sequence
/// found there (`error_len`), which is `None` when the input ended in the
/// middle of a sequence.
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
    // detected, it backs up and runs the slower version of the UTF-8 automaton
    // to determine correct error information.
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut i = 0;

        while i < slice.len() {
            let b = slice[i];

            // ASCII fast path. If we see two consecutive ASCII bytes, then try
            // to validate as much ASCII as possible very quickly.
            if state == ACCEPT
                && b <= 0x7F
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
            {
                // Both this byte and the next are ASCII, so this always
                // advances by at least two bytes.
                i += ascii::first_non_ascii_byte(&slice[i..]);
                continue;
            }

            state = step(state, b);
            if state == REJECT {
                return Err(find_valid_up_to(slice, i));
            }
            i += 1;
        }
        // Ending in any state other than ACCEPT means the input stopped in
        // the middle of a multi-byte sequence.
        if state != ACCEPT {
            Err(find_valid_up_to(slice, slice.len()))
        } else {
            Ok(())
        }
    }

    // Given the first position at which a UTF-8 sequence was determined to be
    // invalid, return an error that correctly reports the position at which
    // the last complete UTF-8 sequence ends.
    #[inline(never)]
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
        // In order to find the last valid byte, we need to back up an amount
        // that guarantees every preceding byte is part of a valid UTF-8
        // code unit sequence. To do this, we simply locate the last leading
        // byte that occurs before rejected_at.
        let mut backup = rejected_at.saturating_sub(1);
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
            backup -= 1;
        }
        // Include the rejected byte itself (when there is one) so that the
        // slow path sees the same failure the fast path did.
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
        let mut err = slow(&slice[backup..upto]).unwrap_err();
        // The slow path ran on a sub-slice, so translate its offset back
        // into an offset in the original slice.
        err.valid_up_to += backup;
        err
    }

    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
    // when an invalid sequence is found. This is split out from validate so
    // that the fast path doesn't need to keep track of the position of the
    // last valid UTF-8 byte. In particular, tracking this requires checking
    // for an ACCEPT state on each byte, which degrades throughput pretty
    // badly.
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut valid_up_to = 0;
        for (i, &b) in slice.iter().enumerate() {
            state = step(state, b);
            if state == ACCEPT {
                valid_up_to = i + 1;
            } else if state == REJECT {
                // Our error length must always be at least 1.
                let error_len = Some(cmp::max(1, i - valid_up_to));
                return Err(Utf8Error { valid_up_to, error_len });
            }
        }
        // The input ended mid-sequence: there is no definite error length.
        if state != ACCEPT {
            Err(Utf8Error { valid_up_to, error_len: None })
        } else {
            Ok(())
        }
    }

    // Advance to the next state given the current state and current byte.
    fn step(state: usize, b: u8) -> usize {
        let class = CLASSES[b as usize];
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
        // valid by construction of the state machine and the byte equivalence
        // classes.
        unsafe {
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
        }
    }

    fast(slice)
}
 566
 567 /// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
 568 ///
 569 /// When successful, the corresponding Unicode scalar value is returned along
 570 /// with the number of bytes it was encoded with. The number of bytes consumed
 571 /// for a successful decode is always between 1 and 4, inclusive.
 572 ///
 573 /// When unsuccessful, `None` is returned along with the number of bytes that
 574 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
 575 /// the number of bytes consumed is always between 0 and 3, inclusive, where
 576 /// 0 is only returned when `slice` is empty.
 577 ///
 578 /// # Examples
 579 ///
 580 /// Basic usage:
 581 ///
 582 /// ```
 583 /// use bstr::decode_utf8;
 584 ///
 585 /// // Decoding a valid codepoint.
 586 /// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
 587 /// assert_eq!(Some('☃'), ch);
 588 /// assert_eq!(3, size);
 589 ///
 590 /// // Decoding an incomplete codepoint.
 591 /// let (ch, size) = decode_utf8(b"\xE2\x98");
 592 /// assert_eq!(None, ch);
 593 /// assert_eq!(2, size);
 594 /// ```
 595 ///
 596 /// This example shows how to iterate over all codepoints in UTF-8 encoded
 597 /// bytes, while replacing invalid UTF-8 sequences with the replacement
 598 /// codepoint:
 599 ///
 600 /// ```
 601 /// use bstr::{B, decode_utf8};
 602 ///
 603 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
 604 /// let mut chars = vec![];
 605 /// while !bytes.is_empty() {
 606 ///     let (ch, size) = decode_utf8(bytes);
 607 ///     bytes = &bytes[size..];
 608 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
 609 /// }
 610 /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
 611 /// ```
 612 #[inline]
 613 pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
 614     let slice = slice.as_ref();
 615     match slice.get(0) {
 616         None => return (None, 0),
 617         Some(&b) if b <= 0x7F => return (Some(b as char), 1),
 618         _ => {}
 619     }
 620
 621     let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
 622     while i < slice.len() {
 623         decode_step(&mut state, &mut cp, slice[i]);
 624         i += 1;
 625
 626         if state == ACCEPT {
 627             // SAFETY: This is safe because `decode_step` guarantees that
 628             // `cp` is a valid Unicode scalar value in an ACCEPT state.
 629             let ch = unsafe { char::from_u32_unchecked(cp) };
 630             return (Some(ch), i);
 631         } else if state == REJECT {
 632             // At this point, we always want to advance at least one byte.
 633             return (None, cmp::max(1, i.saturating_sub(1)));
 634         }
 635     }
 636     (None, i)
 637 }
 638
 639 /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
 640 /// slice.
 641 ///
 642 /// When successful, the corresponding Unicode scalar value is returned along
 643 /// with the number of bytes it was encoded with. The number of bytes consumed
 644 /// for a successful decode is always between 1 and 4, inclusive.
 645 ///
 646 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
 647 /// along with the number of bytes that make up a maximal prefix of a valid
 648 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
 649 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
 650 /// empty.
 651 ///
 652 /// # Examples
 653 ///
 654 /// Basic usage:
 655 ///
 656 /// ```ignore
 657 /// use bstr::decode_utf8_lossy;
 658 ///
 659 /// // Decoding a valid codepoint.
 660 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
 661 /// assert_eq!('☃', ch);
 662 /// assert_eq!(3, size);
 663 ///
 664 /// // Decoding an incomplete codepoint.
 665 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
 666 /// assert_eq!('\u{FFFD}', ch);
 667 /// assert_eq!(2, size);
 668 /// ```
 669 ///
 670 /// This example shows how to iterate over all codepoints in UTF-8 encoded
 671 /// bytes, while replacing invalid UTF-8 sequences with the replacement
 672 /// codepoint:
 673 ///
 674 /// ```ignore
 675 /// use bstr::{B, decode_utf8_lossy};
 676 ///
 677 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
 678 /// let mut chars = vec![];
 679 /// while !bytes.is_empty() {
 680 ///     let (ch, size) = decode_utf8_lossy(bytes);
 681 ///     bytes = &bytes[size..];
 682 ///     chars.push(ch);
 683 /// }
 684 /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
 685 /// ```
 686 #[inline]
 687 pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
 688     match decode(slice) {
 689         (Some(ch), size) => (ch, size),
 690         (None, size) => ('\u{FFFD}', size),
 691     }
 692 }
 693
 694 /// UTF-8 decode a single Unicode scalar value from the end of a slice.
 695 ///
 696 /// When successful, the corresponding Unicode scalar value is returned along
 697 /// with the number of bytes it was encoded with. The number of bytes consumed
 698 /// for a successful decode is always between 1 and 4, inclusive.
 699 ///
 700 /// When unsuccessful, `None` is returned along with the number of bytes that
 701 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
 702 /// the number of bytes consumed is always between 0 and 3, inclusive, where
 703 /// 0 is only returned when `slice` is empty.
 704 ///
 705 /// # Examples
 706 ///
 707 /// Basic usage:
 708 ///
 709 /// ```
 710 /// use bstr::decode_last_utf8;
 711 ///
 712 /// // Decoding a valid codepoint.
 713 /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
 714 /// assert_eq!(Some('☃'), ch);
 715 /// assert_eq!(3, size);
 716 ///
 717 /// // Decoding an incomplete codepoint.
 718 /// let (ch, size) = decode_last_utf8(b"\xE2\x98");
 719 /// assert_eq!(None, ch);
 720 /// assert_eq!(2, size);
 721 /// ```
 722 ///
 723 /// This example shows how to iterate over all codepoints in UTF-8 encoded
 724 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
 725 /// replacement codepoint:
 726 ///
 727 /// ```
 728 /// use bstr::{B, decode_last_utf8};
 729 ///
 730 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
 731 /// let mut chars = vec![];
 732 /// while !bytes.is_empty() {
 733 ///     let (ch, size) = decode_last_utf8(bytes);
 734 ///     bytes = &bytes[..bytes.len()-size];
 735 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
 736 /// }
 737 /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
 738 /// ```
 739 #[inline]
 740 pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
 741     // TODO: We could implement this by reversing the UTF-8 automaton, but for
 742     // now, we do it the slow way by using the forward automaton.
 743
 744     let slice = slice.as_ref();
 745     if slice.is_empty() {
 746         return (None, 0);
 747     }
 748     let mut start = slice.len() - 1;
 749     let limit = slice.len().saturating_sub(4);
 750     while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
 751         start -= 1;
 752     }
 753     let (ch, size) = decode(&slice[start..]);
 754     // If we didn't consume all of the bytes, then that means there's at least
 755     // one stray byte that never occurs in a valid code unit prefix, so we can
 756     // advance by one byte.
 757     if start + size != slice.len() {
 758         (None, 1)
 759     } else {
 760         (ch, size)
 761     }
 762 }
 763
 764 /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
 765 ///
 766 /// When successful, the corresponding Unicode scalar value is returned along
 767 /// with the number of bytes it was encoded with. The number of bytes consumed
 768 /// for a successful decode is always between 1 and 4, inclusive.
 769 ///
 770 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
 771 /// along with the number of bytes that make up a maximal prefix of a valid
 772 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
 773 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
 774 /// empty.
 775 ///
 776 /// # Examples
 777 ///
 778 /// Basic usage:
 779 ///
 780 /// ```ignore
 781 /// use bstr::decode_last_utf8_lossy;
 782 ///
 783 /// // Decoding a valid codepoint.
 784 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
 785 /// assert_eq!('☃', ch);
 786 /// assert_eq!(3, size);
 787 ///
 788 /// // Decoding an incomplete codepoint.
 789 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
 790 /// assert_eq!('\u{FFFD}', ch);
 791 /// assert_eq!(2, size);
 792 /// ```
 793 ///
 794 /// This example shows how to iterate over all codepoints in UTF-8 encoded
 795 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
 796 /// replacement codepoint:
 797 ///
 798 /// ```ignore
 799 /// use bstr::decode_last_utf8_lossy;
 800 ///
 801 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
 802 /// let mut chars = vec![];
 803 /// while !bytes.is_empty() {
 804 ///     let (ch, size) = decode_last_utf8_lossy(bytes);
 805 ///     bytes = &bytes[..bytes.len()-size];
 806 ///     chars.push(ch);
 807 /// }
 808 /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
 809 /// ```
 810 #[inline]
 811 pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
 812     match decode_last(slice) {
 813         (Some(ch), size) => (ch, size),
 814         (None, size) => ('\u{FFFD}', size),
 815     }
 816 }
 817
 818 /// SAFETY: The decode function relies on state being equal to ACCEPT only if
 819 /// cp is a valid Unicode scalar value.
 820 #[inline]
 821 pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
 822     let class = CLASSES[b as usize];
 823     if *state == ACCEPT {
 824         *cp = (0xFF >> class) & (b as u32);
 825     } else {
 826         *cp = (b as u32 & 0b111111) | (*cp << 6);
 827     }
 828     *state = STATES_FORWARD[*state + class as usize] as usize;
 829 }
 830
 831 /// Returns true if and only if the given byte is either a valid leading UTF-8
 832 /// byte, or is otherwise an invalid byte that can never appear anywhere in a
 833 /// valid UTF-8 sequence.
 834 fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
 835     // In the ASCII case, the most significant bit is never set. The leading
 836     // byte of a 2/3/4-byte sequence always has the top two most significant
 837     // bits set. For bytes that can never appear anywhere in valid UTF-8, this
 838     // also returns true, since every such byte has its two most significant
 839     // bits set:
 840     //
 841     //     \xC0 :: 11000000
 842     //     \xC1 :: 11000001
 843     //     \xF5 :: 11110101
 844     //     \xF6 :: 11110110
 845     //     \xF7 :: 11110111
 846     //     \xF8 :: 11111000
 847     //     \xF9 :: 11111001
 848     //     \xFA :: 11111010
 849     //     \xFB :: 11111011
 850     //     \xFC :: 11111100
 851     //     \xFD :: 11111101
 852     //     \xFE :: 11111110
 853     //     \xFF :: 11111111
 854     (b & 0b1100_0000) != 0b1000_0000
 855 }
 856
 857 #[cfg(test)]
 858 mod tests {
 859     use std::char;
 860
 861     use crate::ext_slice::{ByteSlice, B};
 862     use crate::tests::LOSSY_TESTS;
 863     use crate::utf8::{self, Utf8Error};
 864
 865     fn utf8e(valid_up_to: usize) -> Utf8Error {
 866         Utf8Error { valid_up_to, error_len: None }
 867     }
 868
 869     fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
 870         Utf8Error { valid_up_to, error_len: Some(error_len) }
 871     }
 872
 873     #[test]
 874     fn validate_all_codepoints() {
 875         for i in 0..(0x10FFFF + 1) {
 876             let cp = match char::from_u32(i) {
 877                 None => continue,
 878                 Some(cp) => cp,
 879             };
 880             let mut buf = [0; 4];
 881             let s = cp.encode_utf8(&mut buf);
 882             assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
 883         }
 884     }
 885
 886     #[test]
 887     fn validate_multiple_codepoints() {
 888         assert_eq!(Ok(()), utf8::validate(b"abc"));
 889         assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
 890         assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
 891         assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
 892         assert_eq!(
 893             Ok(()),
 894             utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
 895         );
 896         assert_eq!(
 897             Ok(()),
 898             utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
 899         );
 900     }
 901
 902     #[test]
 903     fn validate_errors() {
 904         // single invalid byte
 905         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
 906         // single invalid byte after ASCII
 907         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
 908         // single invalid byte after 2 byte sequence
 909         assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
 910         // single invalid byte after 3 byte sequence
 911         assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
 912         // single invalid byte after 4 byte sequence
 913         assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
 914
 915         // An invalid 2-byte sequence with a valid 1-byte prefix.
 916         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
 917         // An invalid 3-byte sequence with a valid 2-byte prefix.
 918         assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
 919         // An invalid 4-byte sequence with a valid 3-byte prefix.
 920         assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
 921
 922         // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
 923         // same codepoint value in 4 bytes. This not only tests that we reject
 924         // overlong sequences, but that we get valid_up_to correct.
 925         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
 926         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
 927         assert_eq!(
 928             Err(utf8e2(3, 1)),
 929             utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
 930         );
 931
 932         // Check that encoding a surrogate codepoint using the UTF-8 scheme
 933         // fails validation.
 934         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
 935         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
 936         assert_eq!(
 937             Err(utf8e2(3, 1)),
 938             utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
 939         );
 940
 941         // Check that an incomplete 2-byte sequence fails.
 942         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
 943         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
 944         assert_eq!(
 945             Err(utf8e2(3, 1)),
 946             utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
 947         );
 948         // Check that an incomplete 3-byte sequence fails.
 949         assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
 950         assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
 951         assert_eq!(
 952             Err(utf8e2(3, 2)),
 953             utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
 954         );
 955         // Check that an incomplete 4-byte sequence fails.
 956         assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
 957         assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
 958         assert_eq!(
 959             Err(utf8e2(4, 3)),
 960             utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
 961         );
 962         assert_eq!(
 963             Err(utf8e2(6, 3)),
 964             utf8::validate(b"foobar\xF1\x80\x80quux",)
 965         );
 966
 967         // Check that an incomplete (EOF) 2-byte sequence fails.
 968         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
 969         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
 970         assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
 971         // Check that an incomplete (EOF) 3-byte sequence fails.
 972         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
 973         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
 974         assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
 975         // Check that an incomplete (EOF) 4-byte sequence fails.
 976         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
 977         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
 978         assert_eq!(
 979             Err(utf8e(4)),
 980             utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
 981         );
 982
 983         // Test that we errors correct even after long valid sequences. This
 984         // checks that our "backup" logic for detecting errors is correct.
 985         assert_eq!(
 986             Err(utf8e2(8, 1)),
 987             utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
 988         );
 989     }
 990
 991     #[test]
 992     fn decode_valid() {
 993         fn d(mut s: &str) -> Vec<char> {
 994             let mut chars = vec![];
 995             while !s.is_empty() {
 996                 let (ch, size) = utf8::decode(s.as_bytes());
 997                 s = &s[size..];
 998                 chars.push(ch.unwrap());
 999             }
1000             chars
1001         }
1002
1003         assert_eq!(vec!['☃'], d("☃"));
1004         assert_eq!(vec!['☃', '☃'], d("☃☃"));
1005         assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
1006         assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
1007         assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
1008     }
1009
1010     #[test]
1011     fn decode_invalid() {
1012         let (ch, size) = utf8::decode(b"");
1013         assert_eq!(None, ch);
1014         assert_eq!(0, size);
1015
1016         let (ch, size) = utf8::decode(b"\xFF");
1017         assert_eq!(None, ch);
1018         assert_eq!(1, size);
1019
1020         let (ch, size) = utf8::decode(b"\xCE\xF0");
1021         assert_eq!(None, ch);
1022         assert_eq!(1, size);
1023
1024         let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
1025         assert_eq!(None, ch);
1026         assert_eq!(2, size);
1027
1028         let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
1029         assert_eq!(None, ch);
1030         assert_eq!(3, size);
1031
1032         let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
1033         assert_eq!(None, ch);
1034         assert_eq!(3, size);
1035
1036         let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
1037         assert_eq!(None, ch);
1038         assert_eq!(1, size);
1039
1040         let (ch, size) = utf8::decode(b"\xED\xA0\x80");
1041         assert_eq!(None, ch);
1042         assert_eq!(1, size);
1043
1044         let (ch, size) = utf8::decode(b"\xCEa");
1045         assert_eq!(None, ch);
1046         assert_eq!(1, size);
1047
1048         let (ch, size) = utf8::decode(b"\xE2\x98a");
1049         assert_eq!(None, ch);
1050         assert_eq!(2, size);
1051
1052         let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
1053         assert_eq!(None, ch);
1054         assert_eq!(3, size);
1055     }
1056
1057     #[test]
1058     fn decode_lossy() {
1059         let (ch, size) = utf8::decode_lossy(b"");
1060         assert_eq!('\u{FFFD}', ch);
1061         assert_eq!(0, size);
1062
1063         let (ch, size) = utf8::decode_lossy(b"\xFF");
1064         assert_eq!('\u{FFFD}', ch);
1065         assert_eq!(1, size);
1066
1067         let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
1068         assert_eq!('\u{FFFD}', ch);
1069         assert_eq!(1, size);
1070
1071         let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
1072         assert_eq!('\u{FFFD}', ch);
1073         assert_eq!(2, size);
1074
1075         let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
1076         assert_eq!('\u{FFFD}', ch);
1077         assert_eq!(3, size);
1078
1079         let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
1080         assert_eq!('\u{FFFD}', ch);
1081         assert_eq!(1, size);
1082
1083         let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
1084         assert_eq!('\u{FFFD}', ch);
1085         assert_eq!(1, size);
1086
1087         let (ch, size) = utf8::decode_lossy(b"\xCEa");
1088         assert_eq!('\u{FFFD}', ch);
1089         assert_eq!(1, size);
1090
1091         let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
1092         assert_eq!('\u{FFFD}', ch);
1093         assert_eq!(2, size);
1094
1095         let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
1096         assert_eq!('\u{FFFD}', ch);
1097         assert_eq!(3, size);
1098     }
1099
1100     #[test]
1101     fn decode_last_valid() {
1102         fn d(mut s: &str) -> Vec<char> {
1103             let mut chars = vec![];
1104             while !s.is_empty() {
1105                 let (ch, size) = utf8::decode_last(s.as_bytes());
1106                 s = &s[..s.len() - size];
1107                 chars.push(ch.unwrap());
1108             }
1109             chars
1110         }
1111
1112         assert_eq!(vec!['☃'], d("☃"));
1113         assert_eq!(vec!['☃', '☃'], d("☃☃"));
1114         assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
1115         assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
1116         assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲"));
1117     }
1118
1119     #[test]
1120     fn decode_last_invalid() {
1121         let (ch, size) = utf8::decode_last(b"");
1122         assert_eq!(None, ch);
1123         assert_eq!(0, size);
1124
1125         let (ch, size) = utf8::decode_last(b"\xFF");
1126         assert_eq!(None, ch);
1127         assert_eq!(1, size);
1128
1129         let (ch, size) = utf8::decode_last(b"\xCE\xF0");
1130         assert_eq!(None, ch);
1131         assert_eq!(1, size);
1132
1133         let (ch, size) = utf8::decode_last(b"\xCE");
1134         assert_eq!(None, ch);
1135         assert_eq!(1, size);
1136
1137         let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
1138         assert_eq!(None, ch);
1139         assert_eq!(1, size);
1140
1141         let (ch, size) = utf8::decode_last(b"\xE2\x98");
1142         assert_eq!(None, ch);
1143         assert_eq!(2, size);
1144
1145         let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
1146         assert_eq!(None, ch);
1147         assert_eq!(1, size);
1148
1149         let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
1150         assert_eq!(None, ch);
1151         assert_eq!(3, size);
1152
1153         let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
1154         assert_eq!(None, ch);
1155         assert_eq!(1, size);
1156
1157         let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
1158         assert_eq!(None, ch);
1159         assert_eq!(1, size);
1160
1161         let (ch, size) = utf8::decode_last(b"\xED\xA0");
1162         assert_eq!(None, ch);
1163         assert_eq!(1, size);
1164
1165         let (ch, size) = utf8::decode_last(b"\xED");
1166         assert_eq!(None, ch);
1167         assert_eq!(1, size);
1168
1169         let (ch, size) = utf8::decode_last(b"a\xCE");
1170         assert_eq!(None, ch);
1171         assert_eq!(1, size);
1172
1173         let (ch, size) = utf8::decode_last(b"a\xE2\x98");
1174         assert_eq!(None, ch);
1175         assert_eq!(2, size);
1176
1177         let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
1178         assert_eq!(None, ch);
1179         assert_eq!(3, size);
1180     }
1181
1182     #[test]
1183     fn decode_last_lossy() {
1184         let (ch, size) = utf8::decode_last_lossy(b"");
1185         assert_eq!('\u{FFFD}', ch);
1186         assert_eq!(0, size);
1187
1188         let (ch, size) = utf8::decode_last_lossy(b"\xFF");
1189         assert_eq!('\u{FFFD}', ch);
1190         assert_eq!(1, size);
1191
1192         let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
1193         assert_eq!('\u{FFFD}', ch);
1194         assert_eq!(1, size);
1195
1196         let (ch, size) = utf8::decode_last_lossy(b"\xCE");
1197         assert_eq!('\u{FFFD}', ch);
1198         assert_eq!(1, size);
1199
1200         let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
1201         assert_eq!('\u{FFFD}', ch);
1202         assert_eq!(1, size);
1203
1204         let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
1205         assert_eq!('\u{FFFD}', ch);
1206         assert_eq!(2, size);
1207
1208         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
1209         assert_eq!('\u{FFFD}', ch);
1210         assert_eq!(1, size);
1211
1212         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
1213         assert_eq!('\u{FFFD}', ch);
1214         assert_eq!(3, size);
1215
1216         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
1217         assert_eq!('\u{FFFD}', ch);
1218         assert_eq!(1, size);
1219
1220         let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
1221         assert_eq!('\u{FFFD}', ch);
1222         assert_eq!(1, size);
1223
1224         let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
1225         assert_eq!('\u{FFFD}', ch);
1226         assert_eq!(1, size);
1227
1228         let (ch, size) = utf8::decode_last_lossy(b"\xED");
1229         assert_eq!('\u{FFFD}', ch);
1230         assert_eq!(1, size);
1231
1232         let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
1233         assert_eq!('\u{FFFD}', ch);
1234         assert_eq!(1, size);
1235
1236         let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
1237         assert_eq!('\u{FFFD}', ch);
1238         assert_eq!(2, size);
1239
1240         let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
1241         assert_eq!('\u{FFFD}', ch);
1242         assert_eq!(3, size);
1243     }
1244
1245     #[test]
1246     fn chars() {
1247         for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
1248             let got: String = B(input).chars().collect();
1249             assert_eq!(
1250                 expected, got,
1251                 "chars(ith: {:?}, given: {:?})",
1252                 i, input,
1253             );
1254             let got: String =
1255                 B(input).char_indices().map(|(_, _, ch)| ch).collect();
1256             assert_eq!(
1257                 expected, got,
1258                 "char_indices(ith: {:?}, given: {:?})",
1259                 i, input,
1260             );
1261
1262             let expected: String = expected.chars().rev().collect();
1263
1264             let got: String = B(input).chars().rev().collect();
1265             assert_eq!(
1266                 expected, got,
1267                 "chars.rev(ith: {:?}, given: {:?})",
1268                 i, input,
1269             );
1270             let got: String =
1271                 B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
1272             assert_eq!(
1273                 expected, got,
1274                 "char_indices.rev(ith: {:?}, given: {:?})",
1275                 i, input,
1276             );
1277         }
1278     }
1279
1280     #[test]
1281     fn utf8_chunks() {
1282         let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
1283         assert_eq!(
1284             (c.next(), c.next()),
1285             (
1286                 Some(utf8::Utf8Chunk {
1287                     valid: "123",
1288                     invalid: b"\xC0".as_bstr(),
1289                     incomplete: false,
1290                 }),
1291                 None,
1292             )
1293         );
1294
1295         let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
1296         assert_eq!(
1297             (c.next(), c.next(), c.next()),
1298             (
1299                 Some(utf8::Utf8Chunk {
1300                     valid: "123",
1301                     invalid: b"\xFF".as_bstr(),
1302                     incomplete: false,
1303                 }),
1304                 Some(utf8::Utf8Chunk {
1305                     valid: "",
1306                     invalid: b"\xFF".as_bstr(),
1307                     incomplete: false,
1308                 }),
1309                 None,
1310             )
1311         );
1312
1313         let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
1314         assert_eq!(
1315             (c.next(), c.next()),
1316             (
1317                 Some(utf8::Utf8Chunk {
1318                     valid: "123",
1319                     invalid: b"\xD0".as_bstr(),
1320                     incomplete: true,
1321                 }),
1322                 None,
1323             )
1324         );
1325
1326         let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
1327         assert_eq!(
1328             (c.next(), c.next(), c.next()),
1329             (
1330                 Some(utf8::Utf8Chunk {
1331                     valid: "123",
1332                     invalid: b"\xD0".as_bstr(),
1333                     incomplete: false,
1334                 }),
1335                 Some(utf8::Utf8Chunk {
1336                     valid: "456",
1337                     invalid: b"".as_bstr(),
1338                     incomplete: false,
1339                 }),
1340                 None,
1341             )
1342         );
1343
1344         let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
1345         assert_eq!(
1346             (c.next(), c.next()),
1347             (
1348                 Some(utf8::Utf8Chunk {
1349                     valid: "123",
1350                     invalid: b"\xE2\x98".as_bstr(),
1351                     incomplete: true,
1352                 }),
1353                 None,
1354             )
1355         );
1356
1357         let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
1358         assert_eq!(
1359             (c.next(), c.next()),
1360             (
1361                 Some(utf8::Utf8Chunk {
1362                     valid: "123",
1363                     invalid: b"\xF4\x8F\xBF".as_bstr(),
1364                     incomplete: true,
1365                 }),
1366                 None,
1367             )
1368         );
1369     }
1370 }