src/libstd/sys_common/wtf8.rs

   1 //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
   2 //!
   3 //! This library uses Rust’s type system to maintain
   4 //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
   5 //! like the `String` and `&str` types do for UTF-8.
   6 //!
   7 //! Since [WTF-8 must not be used
   8 //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
   9 //! this library deliberately does not provide access to the underlying bytes
  10 //! of WTF-8 strings,
  11 //! nor can it decode WTF-8 from arbitrary bytes.
  12 //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
  13
  14 // this module is imported from @SimonSapin's repo and has tons of dead code on
  15 // unix (it's mostly used on windows), so don't worry about dead code here.
  16 #![allow(dead_code)]
  17
  18 use core::str::next_code_point;
  19
  20 use crate::borrow::Cow;
  21 use crate::char;
  22 use crate::fmt;
  23 use crate::hash::{Hash, Hasher};
  24 use crate::iter::FromIterator;
  25 use crate::mem;
  26 use crate::ops;
  27 use crate::rc::Rc;
  28 use crate::slice;
  29 use crate::str;
  30 use crate::sync::Arc;
  31 use crate::sys_common::AsInner;
  32
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
///
/// The lossy conversions in this module write it wherever a surrogate is
/// found. `Wtf8Buf::into_string_lossy` copies it over the three bytes of an
/// encoded surrogate in place, so it relies on this string being exactly
/// three bytes long.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
  34
  35 /// A Unicode code point: from U+0000 to U+10FFFF.
  36 ///
  37 /// Compares with the `char` type,
  38 /// which represents a Unicode scalar value:
  39 /// a code point that is not a surrogate (U+D800 to U+DFFF).
  40 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
  41 pub struct CodePoint {
  42     value: u32,
  43 }
  44
  45 /// Format the code point as `U+` followed by four to six hexadecimal digits.
  46 /// Example: `U+1F4A9`
  47 impl fmt::Debug for CodePoint {
  48     #[inline]
  49     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
  50         write!(formatter, "U+{:04X}", self.value)
  51     }
  52 }
  53
  54 impl CodePoint {
  55     /// Unsafely creates a new `CodePoint` without checking the value.
  56     ///
  57     /// Only use when `value` is known to be less than or equal to 0x10FFFF.
  58     #[inline]
  59     pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
  60         CodePoint { value }
  61     }
  62
  63     /// Creates a new `CodePoint` if the value is a valid code point.
  64     ///
  65     /// Returns `None` if `value` is above 0x10FFFF.
  66     #[inline]
  67     pub fn from_u32(value: u32) -> Option<CodePoint> {
  68         match value {
  69             0..=0x10FFFF => Some(CodePoint { value }),
  70             _ => None,
  71         }
  72     }
  73
  74     /// Creates a new `CodePoint` from a `char`.
  75     ///
  76     /// Since all Unicode scalar values are code points, this always succeeds.
  77     #[inline]
  78     pub fn from_char(value: char) -> CodePoint {
  79         CodePoint { value: value as u32 }
  80     }
  81
  82     /// Returns the numeric value of the code point.
  83     #[inline]
  84     pub fn to_u32(&self) -> u32 {
  85         self.value
  86     }
  87
  88     /// Optionally returns a Unicode scalar value for the code point.
  89     ///
  90     /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
  91     #[inline]
  92     pub fn to_char(&self) -> Option<char> {
  93         match self.value {
  94             0xD800..=0xDFFF => None,
  95             _ => Some(unsafe { char::from_u32_unchecked(self.value) }),
  96         }
  97     }
  98
  99     /// Returns a Unicode scalar value for the code point.
 100     ///
 101     /// Returns `'\u{FFFD}'` (the replacement character “�”)
 102     /// if the code point is a surrogate (from U+D800 to U+DFFF).
 103     #[inline]
 104     pub fn to_char_lossy(&self) -> char {
 105         self.to_char().unwrap_or('\u{FFFD}')
 106     }
 107 }
 108
 109 /// An owned, growable string of well-formed WTF-8 data.
 110 ///
 111 /// Similar to `String`, but can additionally contain surrogate code points
 112 /// if they’re not in a surrogate pair.
 113 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
 114 pub struct Wtf8Buf {
 115     bytes: Vec<u8>,
 116 }
 117
 118 impl ops::Deref for Wtf8Buf {
 119     type Target = Wtf8;
 120
 121     fn deref(&self) -> &Wtf8 {
 122         self.as_slice()
 123     }
 124 }
 125
 126 impl ops::DerefMut for Wtf8Buf {
 127     fn deref_mut(&mut self) -> &mut Wtf8 {
 128         self.as_mut_slice()
 129     }
 130 }
 131
 132 /// Format the string with double quotes,
 133 /// and surrogates as `\u` followed by four hexadecimal digits.
 134 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
 135 impl fmt::Debug for Wtf8Buf {
 136     #[inline]
 137     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 138         fmt::Debug::fmt(&**self, formatter)
 139     }
 140 }
 141
 142 impl Wtf8Buf {
 143     /// Creates a new, empty WTF-8 string.
 144     #[inline]
 145     pub fn new() -> Wtf8Buf {
 146         Wtf8Buf { bytes: Vec::new() }
 147     }
 148
 149     /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
 150     #[inline]
 151     pub fn with_capacity(capacity: usize) -> Wtf8Buf {
 152         Wtf8Buf { bytes: Vec::with_capacity(capacity) }
 153     }
 154
 155     /// Creates a WTF-8 string from a UTF-8 `String`.
 156     ///
 157     /// This takes ownership of the `String` and does not copy.
 158     ///
 159     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 160     #[inline]
 161     pub fn from_string(string: String) -> Wtf8Buf {
 162         Wtf8Buf { bytes: string.into_bytes() }
 163     }
 164
 165     /// Creates a WTF-8 string from a UTF-8 `&str` slice.
 166     ///
 167     /// This copies the content of the slice.
 168     ///
 169     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 170     #[inline]
 171     pub fn from_str(str: &str) -> Wtf8Buf {
 172         Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
 173     }
 174
 175     pub fn clear(&mut self) {
 176         self.bytes.clear()
 177     }
 178
 179     /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
 180     ///
 181     /// This is lossless: calling `.encode_wide()` on the resulting string
 182     /// will always return the original code units.
 183     pub fn from_wide(v: &[u16]) -> Wtf8Buf {
 184         let mut string = Wtf8Buf::with_capacity(v.len());
 185         for item in char::decode_utf16(v.iter().cloned()) {
 186             match item {
 187                 Ok(ch) => string.push_char(ch),
 188                 Err(surrogate) => {
 189                     let surrogate = surrogate.unpaired_surrogate();
 190                     // Surrogates are known to be in the code point range.
 191                     let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
 192                     // Skip the WTF-8 concatenation check,
 193                     // surrogate pairs are already decoded by decode_utf16
 194                     string.push_code_point_unchecked(code_point)
 195                 }
 196             }
 197         }
 198         string
 199     }
 200
 201     /// Copied from String::push
 202     /// This does **not** include the WTF-8 concatenation check.
 203     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
 204         let mut bytes = [0; 4];
 205         let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
 206         self.bytes.extend_from_slice(bytes)
 207     }
 208
 209     #[inline]
 210     pub fn as_slice(&self) -> &Wtf8 {
 211         unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
 212     }
 213
 214     #[inline]
 215     pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
 216         unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
 217     }
 218
 219     /// Reserves capacity for at least `additional` more bytes to be inserted
 220     /// in the given `Wtf8Buf`.
 221     /// The collection may reserve more space to avoid frequent reallocations.
 222     ///
 223     /// # Panics
 224     ///
 225     /// Panics if the new capacity overflows `usize`.
 226     #[inline]
 227     pub fn reserve(&mut self, additional: usize) {
 228         self.bytes.reserve(additional)
 229     }
 230
 231     #[inline]
 232     pub fn reserve_exact(&mut self, additional: usize) {
 233         self.bytes.reserve_exact(additional)
 234     }
 235
 236     #[inline]
 237     pub fn shrink_to_fit(&mut self) {
 238         self.bytes.shrink_to_fit()
 239     }
 240
 241     #[inline]
 242     pub fn shrink_to(&mut self, min_capacity: usize) {
 243         self.bytes.shrink_to(min_capacity)
 244     }
 245
 246     /// Returns the number of bytes that this string buffer can hold without reallocating.
 247     #[inline]
 248     pub fn capacity(&self) -> usize {
 249         self.bytes.capacity()
 250     }
 251
 252     /// Append a UTF-8 slice at the end of the string.
 253     #[inline]
 254     pub fn push_str(&mut self, other: &str) {
 255         self.bytes.extend_from_slice(other.as_bytes())
 256     }
 257
 258     /// Append a WTF-8 slice at the end of the string.
 259     ///
 260     /// This replaces newly paired surrogates at the boundary
 261     /// with a supplementary code point,
 262     /// like concatenating ill-formed UTF-16 strings effectively would.
 263     #[inline]
 264     pub fn push_wtf8(&mut self, other: &Wtf8) {
 265         match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
 266             // Replace newly paired surrogates by a supplementary code point.
 267             (Some(lead), Some(trail)) => {
 268                 let len_without_lead_surrogate = self.len() - 3;
 269                 self.bytes.truncate(len_without_lead_surrogate);
 270                 let other_without_trail_surrogate = &other.bytes[3..];
 271                 // 4 bytes for the supplementary code point
 272                 self.bytes.reserve(4 + other_without_trail_surrogate.len());
 273                 self.push_char(decode_surrogate_pair(lead, trail));
 274                 self.bytes.extend_from_slice(other_without_trail_surrogate);
 275             }
 276             _ => self.bytes.extend_from_slice(&other.bytes),
 277         }
 278     }
 279
 280     /// Append a Unicode scalar value at the end of the string.
 281     #[inline]
 282     pub fn push_char(&mut self, c: char) {
 283         self.push_code_point_unchecked(CodePoint::from_char(c))
 284     }
 285
 286     /// Append a code point at the end of the string.
 287     ///
 288     /// This replaces newly paired surrogates at the boundary
 289     /// with a supplementary code point,
 290     /// like concatenating ill-formed UTF-16 strings effectively would.
 291     #[inline]
 292     pub fn push(&mut self, code_point: CodePoint) {
 293         if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
 294             if let Some(lead) = (&*self).final_lead_surrogate() {
 295                 let len_without_lead_surrogate = self.len() - 3;
 296                 self.bytes.truncate(len_without_lead_surrogate);
 297                 self.push_char(decode_surrogate_pair(lead, trail as u16));
 298                 return;
 299             }
 300         }
 301
 302         // No newly paired surrogates at the boundary.
 303         self.push_code_point_unchecked(code_point)
 304     }
 305
 306     /// Shortens a string to the specified length.
 307     ///
 308     /// # Panics
 309     ///
 310     /// Panics if `new_len` > current length,
 311     /// or if `new_len` is not a code point boundary.
 312     #[inline]
 313     pub fn truncate(&mut self, new_len: usize) {
 314         assert!(is_code_point_boundary(self, new_len));
 315         self.bytes.truncate(new_len)
 316     }
 317
 318     /// Consumes the WTF-8 string and tries to convert it to UTF-8.
 319     ///
 320     /// This does not copy the data.
 321     ///
 322     /// If the contents are not well-formed UTF-8
 323     /// (that is, if the string contains surrogates),
 324     /// the original WTF-8 string is returned instead.
 325     pub fn into_string(self) -> Result<String, Wtf8Buf> {
 326         match self.next_surrogate(0) {
 327             None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
 328             Some(_) => Err(self),
 329         }
 330     }
 331
 332     /// Consumes the WTF-8 string and converts it lossily to UTF-8.
 333     ///
 334     /// This does not copy the data (but may overwrite parts of it in place).
 335     ///
 336     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
 337     pub fn into_string_lossy(mut self) -> String {
 338         let mut pos = 0;
 339         loop {
 340             match self.next_surrogate(pos) {
 341                 Some((surrogate_pos, _)) => {
 342                     pos = surrogate_pos + 3;
 343                     self.bytes[surrogate_pos..pos]
 344                         .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 345                 }
 346                 None => return unsafe { String::from_utf8_unchecked(self.bytes) },
 347             }
 348         }
 349     }
 350
 351     /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
 352     #[inline]
 353     pub fn into_box(self) -> Box<Wtf8> {
 354         unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
 355     }
 356
 357     /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
 358     pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
 359         let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
 360         Wtf8Buf { bytes: bytes.into_vec() }
 361     }
 362 }
 363
 364 /// Creates a new WTF-8 string from an iterator of code points.
 365 ///
 366 /// This replaces surrogate code point pairs with supplementary code points,
 367 /// like concatenating ill-formed UTF-16 strings effectively would.
 368 impl FromIterator<CodePoint> for Wtf8Buf {
 369     fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
 370         let mut string = Wtf8Buf::new();
 371         string.extend(iter);
 372         string
 373     }
 374 }
 375
 376 /// Append code points from an iterator to the string.
 377 ///
 378 /// This replaces surrogate code point pairs with supplementary code points,
 379 /// like concatenating ill-formed UTF-16 strings effectively would.
 380 impl Extend<CodePoint> for Wtf8Buf {
 381     fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
 382         let iterator = iter.into_iter();
 383         let (low, _high) = iterator.size_hint();
 384         // Lower bound of one byte per code point (ASCII only)
 385         self.bytes.reserve(low);
 386         iterator.for_each(move |code_point| self.push(code_point));
 387     }
 388
 389     #[inline]
 390     fn extend_one(&mut self, code_point: CodePoint) {
 391         self.push(code_point);
 392     }
 393
 394     #[inline]
 395     fn extend_reserve(&mut self, additional: usize) {
 396         // Lower bound of one byte per code point (ASCII only)
 397         self.bytes.reserve(additional);
 398     }
 399 }
 400
/// A borrowed slice of well-formed WTF-8 data.
///
/// Similar to `&str`, but can additionally contain surrogate code points
/// if they're not in a surrogate pair.
///
/// The type wraps a bare `[u8]`, so it is unsized and is only handled behind
/// a pointer such as `&Wtf8`, `Box<Wtf8>`, `Rc<Wtf8>` or `Arc<Wtf8>`.
#[derive(Eq, Ord, PartialEq, PartialOrd)]
pub struct Wtf8 {
    // The WTF-8 encoded contents.
    bytes: [u8],
}

/// Gives access to the underlying WTF-8 bytes through the `AsInner` trait.
impl AsInner<[u8]> for Wtf8 {
    fn as_inner(&self) -> &[u8] {
        &self.bytes
    }
}
 415
 416 /// Format the slice with double quotes,
 417 /// and surrogates as `\u` followed by four hexadecimal digits.
 418 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
 419 impl fmt::Debug for Wtf8 {
 420     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 421         fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
 422             use crate::fmt::Write;
 423             for c in s.chars().flat_map(|c| c.escape_debug()) {
 424                 f.write_char(c)?
 425             }
 426             Ok(())
 427         }
 428
 429         formatter.write_str("\"")?;
 430         let mut pos = 0;
 431         while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
 432             write_str_escaped(formatter, unsafe {
 433                 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
 434             })?;
 435             write!(formatter, "\\u{{{:x}}}", surrogate)?;
 436             pos = surrogate_pos + 3;
 437         }
 438         write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
 439         formatter.write_str("\"")
 440     }
 441 }
 442
 443 impl fmt::Display for Wtf8 {
 444     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 445         let wtf8_bytes = &self.bytes;
 446         let mut pos = 0;
 447         loop {
 448             match self.next_surrogate(pos) {
 449                 Some((surrogate_pos, _)) => {
 450                     formatter.write_str(unsafe {
 451                         str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
 452                     })?;
 453                     formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
 454                     pos = surrogate_pos + 3;
 455                 }
 456                 None => {
 457                     let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
 458                     if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
 459                 }
 460             }
 461         }
 462     }
 463 }
 464
 465 impl Wtf8 {
 466     /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
 467     ///
 468     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 469     #[inline]
 470     pub fn from_str(value: &str) -> &Wtf8 {
 471         unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
 472     }
 473
 474     /// Creates a WTF-8 slice from a WTF-8 byte slice.
 475     ///
 476     /// Since the byte slice is not checked for valid WTF-8, this functions is
 477     /// marked unsafe.
 478     #[inline]
 479     unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
 480         mem::transmute(value)
 481     }
 482
 483     /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
 484     ///
 485     /// Since the byte slice is not checked for valid WTF-8, this functions is
 486     /// marked unsafe.
 487     #[inline]
 488     unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
 489         mem::transmute(value)
 490     }
 491
 492     /// Returns the length, in WTF-8 bytes.
 493     #[inline]
 494     pub fn len(&self) -> usize {
 495         self.bytes.len()
 496     }
 497
 498     #[inline]
 499     pub fn is_empty(&self) -> bool {
 500         self.bytes.is_empty()
 501     }
 502
 503     /// Returns the code point at `position` if it is in the ASCII range,
 504     /// or `b'\xFF' otherwise.
 505     ///
 506     /// # Panics
 507     ///
 508     /// Panics if `position` is beyond the end of the string.
 509     #[inline]
 510     pub fn ascii_byte_at(&self, position: usize) -> u8 {
 511         match self.bytes[position] {
 512             ascii_byte @ 0x00..=0x7F => ascii_byte,
 513             _ => 0xFF,
 514         }
 515     }
 516
 517     /// Returns an iterator for the string’s code points.
 518     #[inline]
 519     pub fn code_points(&self) -> Wtf8CodePoints<'_> {
 520         Wtf8CodePoints { bytes: self.bytes.iter() }
 521     }
 522
 523     /// Tries to convert the string to UTF-8 and return a `&str` slice.
 524     ///
 525     /// Returns `None` if the string contains surrogates.
 526     ///
 527     /// This does not copy the data.
 528     #[inline]
 529     pub fn as_str(&self) -> Option<&str> {
 530         // Well-formed WTF-8 is also well-formed UTF-8
 531         // if and only if it contains no surrogate.
 532         match self.next_surrogate(0) {
 533             None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 534             Some(_) => None,
 535         }
 536     }
 537
 538     /// Lossily converts the string to UTF-8.
 539     /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
 540     ///
 541     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
 542     ///
 543     /// This only copies the data if necessary (if it contains any surrogate).
 544     pub fn to_string_lossy(&self) -> Cow<'_, str> {
 545         let surrogate_pos = match self.next_surrogate(0) {
 546             None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 547             Some((pos, _)) => pos,
 548         };
 549         let wtf8_bytes = &self.bytes;
 550         let mut utf8_bytes = Vec::with_capacity(self.len());
 551         utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
 552         utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 553         let mut pos = surrogate_pos + 3;
 554         loop {
 555             match self.next_surrogate(pos) {
 556                 Some((surrogate_pos, _)) => {
 557                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
 558                     utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 559                     pos = surrogate_pos + 3;
 560                 }
 561                 None => {
 562                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
 563                     return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
 564                 }
 565             }
 566         }
 567     }
 568
 569     /// Converts the WTF-8 string to potentially ill-formed UTF-16
 570     /// and return an iterator of 16-bit code units.
 571     ///
 572     /// This is lossless:
 573     /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
 574     /// would always return the original WTF-8 string.
 575     #[inline]
 576     pub fn encode_wide(&self) -> EncodeWide<'_> {
 577         EncodeWide { code_points: self.code_points(), extra: 0 }
 578     }
 579
 580     #[inline]
 581     fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
 582         let mut iter = self.bytes[pos..].iter();
 583         loop {
 584             let b = *iter.next()?;
 585             if b < 0x80 {
 586                 pos += 1;
 587             } else if b < 0xE0 {
 588                 iter.next();
 589                 pos += 2;
 590             } else if b == 0xED {
 591                 match (iter.next(), iter.next()) {
 592                     (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
 593                         return Some((pos, decode_surrogate(b2, b3)));
 594                     }
 595                     _ => pos += 3,
 596                 }
 597             } else if b < 0xF0 {
 598                 iter.next();
 599                 iter.next();
 600                 pos += 3;
 601             } else {
 602                 iter.next();
 603                 iter.next();
 604                 iter.next();
 605                 pos += 4;
 606             }
 607         }
 608     }
 609
 610     #[inline]
 611     fn final_lead_surrogate(&self) -> Option<u16> {
 612         match self.bytes {
 613             [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
 614             _ => None,
 615         }
 616     }
 617
 618     #[inline]
 619     fn initial_trail_surrogate(&self) -> Option<u16> {
 620         match self.bytes {
 621             [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
 622             _ => None,
 623         }
 624     }
 625
 626     pub fn clone_into(&self, buf: &mut Wtf8Buf) {
 627         self.bytes.clone_into(&mut buf.bytes)
 628     }
 629
 630     /// Boxes this `Wtf8`.
 631     #[inline]
 632     pub fn into_box(&self) -> Box<Wtf8> {
 633         let boxed: Box<[u8]> = self.bytes.into();
 634         unsafe { mem::transmute(boxed) }
 635     }
 636
 637     /// Creates a boxed, empty `Wtf8`.
 638     pub fn empty_box() -> Box<Wtf8> {
 639         let boxed: Box<[u8]> = Default::default();
 640         unsafe { mem::transmute(boxed) }
 641     }
 642
 643     #[inline]
 644     pub fn into_arc(&self) -> Arc<Wtf8> {
 645         let arc: Arc<[u8]> = Arc::from(&self.bytes);
 646         unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
 647     }
 648
 649     #[inline]
 650     pub fn into_rc(&self) -> Rc<Wtf8> {
 651         let rc: Rc<[u8]> = Rc::from(&self.bytes);
 652         unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
 653     }
 654
 655     #[inline]
 656     pub fn make_ascii_lowercase(&mut self) {
 657         self.bytes.make_ascii_lowercase()
 658     }
 659
 660     #[inline]
 661     pub fn make_ascii_uppercase(&mut self) {
 662         self.bytes.make_ascii_uppercase()
 663     }
 664
 665     #[inline]
 666     pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
 667         Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
 668     }
 669
 670     #[inline]
 671     pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
 672         Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
 673     }
 674
 675     #[inline]
 676     pub fn is_ascii(&self) -> bool {
 677         self.bytes.is_ascii()
 678     }
 679
 680     #[inline]
 681     pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
 682         self.bytes.eq_ignore_ascii_case(&other.bytes)
 683     }
 684 }
 685
 686 /// Returns a slice of the given string for the byte range [`begin`..`end`).
 687 ///
 688 /// # Panics
 689 ///
 690 /// Panics when `begin` and `end` do not point to code point boundaries,
 691 /// or point beyond the end of the string.
 692 impl ops::Index<ops::Range<usize>> for Wtf8 {
 693     type Output = Wtf8;
 694
 695     #[inline]
 696     fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
 697         // is_code_point_boundary checks that the index is in [0, .len()]
 698         if range.start <= range.end
 699             && is_code_point_boundary(self, range.start)
 700             && is_code_point_boundary(self, range.end)
 701         {
 702             unsafe { slice_unchecked(self, range.start, range.end) }
 703         } else {
 704             slice_error_fail(self, range.start, range.end)
 705         }
 706     }
 707 }
 708
 709 /// Returns a slice of the given string from byte `begin` to its end.
 710 ///
 711 /// # Panics
 712 ///
 713 /// Panics when `begin` is not at a code point boundary,
 714 /// or is beyond the end of the string.
 715 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
 716     type Output = Wtf8;
 717
 718     #[inline]
 719     fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
 720         // is_code_point_boundary checks that the index is in [0, .len()]
 721         if is_code_point_boundary(self, range.start) {
 722             unsafe { slice_unchecked(self, range.start, self.len()) }
 723         } else {
 724             slice_error_fail(self, range.start, self.len())
 725         }
 726     }
 727 }
 728
 729 /// Returns a slice of the given string from its beginning to byte `end`.
 730 ///
 731 /// # Panics
 732 ///
 733 /// Panics when `end` is not at a code point boundary,
 734 /// or is beyond the end of the string.
 735 impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
 736     type Output = Wtf8;
 737
 738     #[inline]
 739     fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
 740         // is_code_point_boundary checks that the index is in [0, .len()]
 741         if is_code_point_boundary(self, range.end) {
 742             unsafe { slice_unchecked(self, 0, range.end) }
 743         } else {
 744             slice_error_fail(self, 0, range.end)
 745         }
 746     }
 747 }
 748
/// Returns the whole string unchanged (`&wtf8[..]`).
///
/// No bounds or boundary check is needed, so this never panics.
impl ops::Index<ops::RangeFull> for Wtf8 {
    type Output = Wtf8;

    #[inline]
    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
        self
    }
}
 757
 758 #[inline]
 759 fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
 760     // The first byte is assumed to be 0xED
 761     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 762 }
 763
 764 #[inline]
 765 fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
 766     let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
 767     unsafe { char::from_u32_unchecked(code_point) }
 768 }
 769
 770 /// Copied from core::str::StrPrelude::is_char_boundary
 771 #[inline]
 772 pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
 773     if index == slice.len() {
 774         return true;
 775     }
 776     match slice.bytes.get(index) {
 777         None => false,
 778         Some(&b) => b < 128 || b >= 192,
 779     }
 780 }
 781
 782 /// Copied from core::str::raw::slice_unchecked
 783 #[inline]
 784 pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
 785     // memory layout of an &[u8] and &Wtf8 are the same
 786     Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin))
 787 }
 788
/// Reports a failed slicing operation; never returns.
/// Copied from core::str::raw::slice_error_fail
///
/// # Panics
///
/// Always panics: through the assertion when `begin > end`, otherwise with a
/// message naming both offsets and the `Debug` form of the string.
// Kept out of line so the indexing fast paths that call it stay small.
#[inline(never)]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    assert!(begin <= end);
    panic!("index {} and/or {} in `{:?}` do not lie on character boundary", begin, end, s);
}
 795
 796 /// Iterator for the code points of a WTF-8 string.
 797 ///
 798 /// Created with the method `.code_points()`.
 799 #[derive(Clone)]
 800 pub struct Wtf8CodePoints<'a> {
 801     bytes: slice::Iter<'a, u8>,
 802 }
 803
 804 impl<'a> Iterator for Wtf8CodePoints<'a> {
 805     type Item = CodePoint;
 806
 807     #[inline]
 808     fn next(&mut self) -> Option<CodePoint> {
 809         next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
 810     }
 811
 812     #[inline]
 813     fn size_hint(&self) -> (usize, Option<usize>) {
 814         let len = self.bytes.len();
 815         (len.saturating_add(3) / 4, Some(len))
 816     }
 817 }
 818
 819 /// Generates a wide character sequence for potentially ill-formed UTF-16.
 820 #[stable(feature = "rust1", since = "1.0.0")]
 821 #[derive(Clone)]
 822 pub struct EncodeWide<'a> {
 823     code_points: Wtf8CodePoints<'a>,
 824     extra: u16,
 825 }
 826
 827 // Copied from libunicode/u_str.rs
 828 #[stable(feature = "rust1", since = "1.0.0")]
 829 impl<'a> Iterator for EncodeWide<'a> {
 830     type Item = u16;
 831
 832     #[inline]
 833     fn next(&mut self) -> Option<u16> {
 834         if self.extra != 0 {
 835             let tmp = self.extra;
 836             self.extra = 0;
 837             return Some(tmp);
 838         }
 839
 840         let mut buf = [0; 2];
 841         self.code_points.next().map(|code_point| {
 842             let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
 843             if n == 2 {
 844                 self.extra = buf[1];
 845             }
 846             buf[0]
 847         })
 848     }
 849
 850     #[inline]
 851     fn size_hint(&self) -> (usize, Option<usize>) {
 852         let (low, high) = self.code_points.size_hint();
 853         // every code point gets either one u16 or two u16,
 854         // so this iterator is between 1 or 2 times as
 855         // long as the underlying iterator.
 856         (low, high.and_then(|n| n.checked_mul(2)))
 857     }
 858 }
 859
 860 impl Hash for CodePoint {
 861     #[inline]
 862     fn hash<H: Hasher>(&self, state: &mut H) {
 863         self.value.hash(state)
 864     }
 865 }
 866
 867 impl Hash for Wtf8Buf {
 868     #[inline]
 869     fn hash<H: Hasher>(&self, state: &mut H) {
 870         state.write(&self.bytes);
 871         0xfeu8.hash(state)
 872     }
 873 }
 874
 875 impl Hash for Wtf8 {
 876     #[inline]
 877     fn hash<H: Hasher>(&self, state: &mut H) {
 878         state.write(&self.bytes);
 879         0xfeu8.hash(state)
 880     }
 881 }
 882
 883 #[cfg(test)]
 884 mod tests {
 885     use super::*;
 886     use crate::borrow::Cow;
 887
 888     #[test]
 889     fn code_point_from_u32() {
 890         assert!(CodePoint::from_u32(0).is_some());
 891         assert!(CodePoint::from_u32(0xD800).is_some());
 892         assert!(CodePoint::from_u32(0x10FFFF).is_some());
 893         assert!(CodePoint::from_u32(0x110000).is_none());
 894     }
 895
 896     #[test]
 897     fn code_point_to_u32() {
 898         fn c(value: u32) -> CodePoint {
 899             CodePoint::from_u32(value).unwrap()
 900         }
 901         assert_eq!(c(0).to_u32(), 0);
 902         assert_eq!(c(0xD800).to_u32(), 0xD800);
 903         assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
 904     }
 905
 906     #[test]
 907     fn code_point_from_char() {
 908         assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
 909         assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
 910     }
 911
 912     #[test]
 913     fn code_point_to_string() {
 914         assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061");
 915         assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9");
 916     }
 917
 918     #[test]
 919     fn code_point_to_char() {
 920         fn c(value: u32) -> CodePoint {
 921             CodePoint::from_u32(value).unwrap()
 922         }
 923         assert_eq!(c(0x61).to_char(), Some('a'));
 924         assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
 925         assert_eq!(c(0xD800).to_char(), None);
 926     }
 927
 928     #[test]
 929     fn code_point_to_char_lossy() {
 930         fn c(value: u32) -> CodePoint {
 931             CodePoint::from_u32(value).unwrap()
 932         }
 933         assert_eq!(c(0x61).to_char_lossy(), 'a');
 934         assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
 935         assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
 936     }
 937
 938     #[test]
 939     fn wtf8buf_new() {
 940         assert_eq!(Wtf8Buf::new().bytes, b"");
 941     }
 942
 943     #[test]
 944     fn wtf8buf_from_str() {
 945         assert_eq!(Wtf8Buf::from_str("").bytes, b"");
 946         assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 947     }
 948
 949     #[test]
 950     fn wtf8buf_from_string() {
 951         assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
 952         assert_eq!(
 953             Wtf8Buf::from_string(String::from("aé 💩")).bytes,
 954             b"a\xC3\xA9 \xF0\x9F\x92\xA9"
 955         );
 956     }
 957
 958     #[test]
 959     fn wtf8buf_from_wide() {
 960         assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b"");
 961         assert_eq!(
 962             Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
 963             b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"
 964         );
 965     }
 966
 967     #[test]
 968     fn wtf8buf_push_str() {
 969         let mut string = Wtf8Buf::new();
 970         assert_eq!(string.bytes, b"");
 971         string.push_str("aé 💩");
 972         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 973     }
 974
 975     #[test]
 976     fn wtf8buf_push_char() {
 977         let mut string = Wtf8Buf::from_str("aé ");
 978         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 979         string.push_char('💩');
 980         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 981     }
 982
 983     #[test]
 984     fn wtf8buf_push() {
 985         let mut string = Wtf8Buf::from_str("aé ");
 986         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 987         string.push(CodePoint::from_char('💩'));
 988         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 989
 990         fn c(value: u32) -> CodePoint {
 991             CodePoint::from_u32(value).unwrap()
 992         }
 993
 994         let mut string = Wtf8Buf::new();
 995         string.push(c(0xD83D)); // lead
 996         string.push(c(0xDCA9)); // trail
 997         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
 998
 999         let mut string = Wtf8Buf::new();
1000         string.push(c(0xD83D)); // lead
1001         string.push(c(0x20)); // not surrogate
1002         string.push(c(0xDCA9)); // trail
1003         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1004
1005         let mut string = Wtf8Buf::new();
1006         string.push(c(0xD800)); // lead
1007         string.push(c(0xDBFF)); // lead
1008         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1009
1010         let mut string = Wtf8Buf::new();
1011         string.push(c(0xD800)); // lead
1012         string.push(c(0xE000)); // not surrogate
1013         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
1014
1015         let mut string = Wtf8Buf::new();
1016         string.push(c(0xD7FF)); // not surrogate
1017         string.push(c(0xDC00)); // trail
1018         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1019
1020         let mut string = Wtf8Buf::new();
1021         string.push(c(0x61)); // not surrogate, < 3 bytes
1022         string.push(c(0xDC00)); // trail
1023         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
1024
1025         let mut string = Wtf8Buf::new();
1026         string.push(c(0xDC00)); // trail
1027         assert_eq!(string.bytes, b"\xED\xB0\x80");
1028     }
1029
1030     #[test]
1031     fn wtf8buf_push_wtf8() {
1032         let mut string = Wtf8Buf::from_str("aé");
1033         assert_eq!(string.bytes, b"a\xC3\xA9");
1034         string.push_wtf8(Wtf8::from_str(" 💩"));
1035         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1036
1037         fn w(v: &[u8]) -> &Wtf8 {
1038             unsafe { Wtf8::from_bytes_unchecked(v) }
1039         }
1040
1041         let mut string = Wtf8Buf::new();
1042         string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
1043         string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
1044         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
1045
1046         let mut string = Wtf8Buf::new();
1047         string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
1048         string.push_wtf8(w(b" ")); // not surrogate
1049         string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
1050         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1051
1052         let mut string = Wtf8Buf::new();
1053         string.push_wtf8(w(b"\xED\xA0\x80")); // lead
1054         string.push_wtf8(w(b"\xED\xAF\xBF")); // lead
1055         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1056
1057         let mut string = Wtf8Buf::new();
1058         string.push_wtf8(w(b"\xED\xA0\x80")); // lead
1059         string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate
1060         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
1061
1062         let mut string = Wtf8Buf::new();
1063         string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate
1064         string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1065         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1066
1067         let mut string = Wtf8Buf::new();
1068         string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes
1069         string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1070         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
1071
1072         let mut string = Wtf8Buf::new();
1073         string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1074         assert_eq!(string.bytes, b"\xED\xB0\x80");
1075     }
1076
1077     #[test]
1078     fn wtf8buf_truncate() {
1079         let mut string = Wtf8Buf::from_str("aé");
1080         string.truncate(1);
1081         assert_eq!(string.bytes, b"a");
1082     }
1083
1084     #[test]
1085     #[should_panic]
1086     fn wtf8buf_truncate_fail_code_point_boundary() {
1087         let mut string = Wtf8Buf::from_str("aé");
1088         string.truncate(2);
1089     }
1090
1091     #[test]
1092     #[should_panic]
1093     fn wtf8buf_truncate_fail_longer() {
1094         let mut string = Wtf8Buf::from_str("aé");
1095         string.truncate(4);
1096     }
1097
1098     #[test]
1099     fn wtf8buf_into_string() {
1100         let mut string = Wtf8Buf::from_str("aé 💩");
1101         assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
1102         string.push(CodePoint::from_u32(0xD800).unwrap());
1103         assert_eq!(string.clone().into_string(), Err(string));
1104     }
1105
1106     #[test]
1107     fn wtf8buf_into_string_lossy() {
1108         let mut string = Wtf8Buf::from_str("aé 💩");
1109         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
1110         string.push(CodePoint::from_u32(0xD800).unwrap());
1111         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
1112     }
1113
1114     #[test]
1115     fn wtf8buf_from_iterator() {
1116         fn f(values: &[u32]) -> Wtf8Buf {
1117             values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
1118         }
1119         assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1120
1121         assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
1122         assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1123         assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1124         assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1125         assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1126         assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
1127         assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
1128     }
1129
1130     #[test]
1131     fn wtf8buf_extend() {
1132         fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
1133             fn c(value: &u32) -> CodePoint {
1134                 CodePoint::from_u32(*value).unwrap()
1135             }
1136             let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
1137             string.extend(extended.iter().map(c));
1138             string
1139         }
1140
1141         assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1142
1143         assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
1144         assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1145         assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1146         assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1147         assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1148         assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
1149         assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
1150     }
1151
1152     #[test]
1153     fn wtf8buf_show() {
1154         let mut string = Wtf8Buf::from_str("a\té \u{7f}💩\r");
1155         string.push(CodePoint::from_u32(0xD800).unwrap());
1156         assert_eq!(format!("{:?}", string), "\"a\\té \\u{7f}\u{1f4a9}\\r\\u{d800}\"");
1157     }
1158
1159     #[test]
1160     fn wtf8buf_as_slice() {
1161         assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé"));
1162     }
1163
1164     #[test]
1165     fn wtf8buf_show_str() {
1166         let text = "a\té 💩\r";
1167         let string = Wtf8Buf::from_str(text);
1168         assert_eq!(format!("{:?}", text), format!("{:?}", string));
1169     }
1170
1171     #[test]
1172     fn wtf8_from_str() {
1173         assert_eq!(&Wtf8::from_str("").bytes, b"");
1174         assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1175     }
1176
1177     #[test]
1178     fn wtf8_len() {
1179         assert_eq!(Wtf8::from_str("").len(), 0);
1180         assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
1181     }
1182
1183     #[test]
1184     fn wtf8_slice() {
1185         assert_eq!(&Wtf8::from_str("aé 💩")[1..4].bytes, b"\xC3\xA9 ");
1186     }
1187
1188     #[test]
1189     #[should_panic]
1190     fn wtf8_slice_not_code_point_boundary() {
1191         &Wtf8::from_str("aé 💩")[2..4];
1192     }
1193
1194     #[test]
1195     fn wtf8_slice_from() {
1196         assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
1197     }
1198
1199     #[test]
1200     #[should_panic]
1201     fn wtf8_slice_from_not_code_point_boundary() {
1202         &Wtf8::from_str("aé 💩")[2..];
1203     }
1204
1205     #[test]
1206     fn wtf8_slice_to() {
1207         assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
1208     }
1209
1210     #[test]
1211     #[should_panic]
1212     fn wtf8_slice_to_not_code_point_boundary() {
1213         &Wtf8::from_str("aé 💩")[5..];
1214     }
1215
1216     #[test]
1217     fn wtf8_ascii_byte_at() {
1218         let slice = Wtf8::from_str("aé 💩");
1219         assert_eq!(slice.ascii_byte_at(0), b'a');
1220         assert_eq!(slice.ascii_byte_at(1), b'\xFF');
1221         assert_eq!(slice.ascii_byte_at(2), b'\xFF');
1222         assert_eq!(slice.ascii_byte_at(3), b' ');
1223         assert_eq!(slice.ascii_byte_at(4), b'\xFF');
1224     }
1225
1226     #[test]
1227     fn wtf8_code_points() {
1228         fn c(value: u32) -> CodePoint {
1229             CodePoint::from_u32(value).unwrap()
1230         }
1231         fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
1232             string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
1233         }
1234         let mut string = Wtf8Buf::from_str("é ");
1235         assert_eq!(cp(&string), [Some('é'), Some(' ')]);
1236         string.push(c(0xD83D));
1237         assert_eq!(cp(&string), [Some('é'), Some(' '), None]);
1238         string.push(c(0xDCA9));
1239         assert_eq!(cp(&string), [Some('é'), Some(' '), Some('💩')]);
1240     }
1241
1242     #[test]
1243     fn wtf8_as_str() {
1244         assert_eq!(Wtf8::from_str("").as_str(), Some(""));
1245         assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
1246         let mut string = Wtf8Buf::new();
1247         string.push(CodePoint::from_u32(0xD800).unwrap());
1248         assert_eq!(string.as_str(), None);
1249     }
1250
1251     #[test]
1252     fn wtf8_to_string_lossy() {
1253         assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
1254         assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
1255         let mut string = Wtf8Buf::from_str("aé 💩");
1256         string.push(CodePoint::from_u32(0xD800).unwrap());
1257         let expected: Cow<'_, str> = Cow::Owned(String::from("aé 💩�"));
1258         assert_eq!(string.to_string_lossy(), expected);
1259     }
1260
1261     #[test]
1262     fn wtf8_display() {
1263         fn d(b: &[u8]) -> String {
1264             (&unsafe { Wtf8::from_bytes_unchecked(b) }).to_string()
1265         }
1266
1267         assert_eq!("", d("".as_bytes()));
1268         assert_eq!("aé 💩", d("aé 💩".as_bytes()));
1269
1270         let mut string = Wtf8Buf::from_str("aé 💩");
1271         string.push(CodePoint::from_u32(0xD800).unwrap());
1272         assert_eq!("aé 💩�", d(string.as_inner()));
1273     }
1274
1275     #[test]
1276     fn wtf8_encode_wide() {
1277         let mut string = Wtf8Buf::from_str("aé ");
1278         string.push(CodePoint::from_u32(0xD83D).unwrap());
1279         string.push_char('💩');
1280         assert_eq!(
1281             string.encode_wide().collect::<Vec<_>>(),
1282             vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]
1283         );
1284     }
1285 }